Diffstat (limited to 'third_party/xsimd/include/xsimd/arch')
-rw-r--r--  third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp  152
-rw-r--r--  third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp  96
-rw-r--r--  third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp  239
-rw-r--r--  third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp  163
-rw-r--r--  third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp  2418
-rw-r--r--  third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp  397
-rw-r--r--  third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp  72
-rw-r--r--  third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp  969
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp  1657
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp  950
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp  627
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp  28
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp  212
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp  2028
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp  384
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp  80
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp  46
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp  79
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp  79
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp  23
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp  38
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp  86
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp  2670
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp  1322
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp  1043
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp  1695
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp  64
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp  350
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp  44
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp  142
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp  1126
31 files changed, 19279 insertions(+), 0 deletions(-)
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp
new file mode 100644
index 0000000000..5b3fef6623
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_arithmetic.hpp
@@ -0,0 +1,152 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_ARITHMETIC_HPP
+#define XSIMD_GENERIC_ARITHMETIC_HPP
+
+#include <complex>
+#include <type_traits>
+
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+
+ using namespace types;
+
+ // bitwise_lshift
+ template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return detail::apply([](T x, T y) noexcept
+ { return x << y; },
+ self, other);
+ }
+
+ // bitwise_rshift
+ template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return detail::apply([](T x, T y) noexcept
+ { return x >> y; },
+ self, other);
+ }
+
+ // div
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return detail::apply([](T x, T y) noexcept -> T
+ { return x / y; },
+ self, other);
+ }
+
+ // fma
+ template <class A, class T>
+ inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+ {
+ return x * y + z;
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> fma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+ auto res_r = fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
+ auto res_i = fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
+ return { res_r, res_i };
+ }
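+ // Derivation: (xr + i*xi) * (yr + i*yi) + (zr + i*zi)
+ // = (xr*yr - xi*yi + zr) + i*(xr*yi + xi*yr + zi),
+ // so the real part is fms(xr, yr, fms(xi, yi, zr)) = xr*yr - (xi*yi - zr)
+ // and the imaginary part is fma(xr, yi, fma(xi, yr, zi)).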
+
+ // fms
+ template <class A, class T>
+ inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+ {
+ return x * y - z;
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> fms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+ auto res_r = fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
+ auto res_i = fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
+ return { res_r, res_i };
+ }
+
+ // fnma
+ template <class A, class T>
+ inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+ {
+ return -x * y + z;
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> fnma(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+ auto res_r = -fms(x.real(), y.real(), fma(x.imag(), y.imag(), z.real()));
+ auto res_i = -fma(x.real(), y.imag(), fms(x.imag(), y.real(), z.imag()));
+ return { res_r, res_i };
+ }
+
+ // fnms
+ template <class A, class T>
+ inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<generic>) noexcept
+ {
+ return -x * y - z;
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> fnms(batch<std::complex<T>, A> const& x, batch<std::complex<T>, A> const& y, batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+ auto res_r = -fms(x.real(), y.real(), fms(x.imag(), y.imag(), z.real()));
+ auto res_i = -fma(x.real(), y.imag(), fma(x.imag(), y.real(), z.imag()));
+ return { res_r, res_i };
+ }
+
+ // mul
+ template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+ inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return detail::apply([](T x, T y) noexcept -> T
+ { return x * y; },
+ self, other);
+ }
+
+ // sadd
+ template <class A>
+ inline batch<float, A> sadd(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
+ {
+ return add(self, other); // no saturated arithmetic on floating point numbers
+ }
+ template <class A>
+ inline batch<double, A> sadd(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
+ {
+ return add(self, other); // no saturated arithmetic on floating point numbers
+ }
+
+ // ssub
+ template <class A>
+ inline batch<float, A> ssub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
+ {
+ return sub(self, other); // no saturated arithmetic on floating point numbers
+ }
+ template <class A>
+ inline batch<double, A> ssub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
+ {
+ return sub(self, other); // no saturated arithmetic on floating point numbers
+ }
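+ // For floating point, overflow already saturates to +/-infinity, so the
+ // plain add/sub is the saturated one. The integer overloads (provided per
+ // architecture, not in this file) clamp to the type's limits instead,
+ // e.g. sadd(batch<int8_t, A>(120), batch<int8_t, A>(100)) yields 127.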
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp
new file mode 100644
index 0000000000..ede95ee937
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_complex.hpp
@@ -0,0 +1,96 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_COMPLEX_HPP
+#define XSIMD_GENERIC_COMPLEX_HPP
+
+#include <complex>
+
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+
+ using namespace types;
+
+ // real
+ template <class A, class T>
+ inline batch<T, A> real(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return self;
+ }
+
+ template <class A, class T>
+ inline batch<T, A> real(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ {
+ return self.real();
+ }
+
+ // imag
+ template <class A, class T>
+ inline batch<T, A> imag(batch<T, A> const& /*self*/, requires_arch<generic>) noexcept
+ {
+ return batch<T, A>(T(0));
+ }
+
+ template <class A, class T>
+ inline batch<T, A> imag(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ {
+ return self.imag();
+ }
+
+ // arg
+ template <class A, class T>
+ inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return atan2(imag(self), real(self));
+ }
+
+ // conj
+ template <class A, class T>
+ inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return { real(self), -imag(self) };
+ }
+
+ // norm
+ template <class A, class T>
+ inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return { fma(real(self), real(self), imag(self) * imag(self)) };
+ }
+
+ // proj
+ template <class A, class T>
+ inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = complex_batch_type_t<batch<T, A>>;
+ using real_batch = typename batch_type::real_batch;
+ using real_value_type = typename real_batch::value_type;
+ auto cond = xsimd::isinf(real(self)) || xsimd::isinf(imag(self));
+ return select(cond,
+ batch_type(constants::infinity<real_batch>(),
+ copysign(real_batch(real_value_type(0)), imag(self))),
+ batch_type(self));
+ }
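+ // Matches std::proj: any input with an infinite component is mapped to
+ // the point at infinity, e.g. proj(complex(-inf, 1.f)) gives
+ // (+inf, +0.f) and proj(complex(1.f, -inf)) gives (+inf, -0.f);
+ // finite inputs pass through unchanged.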
+
+ template <class A, class T>
+ inline batch_bool<T, A> isnan(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ {
+ return batch_bool<T, A>(isnan(self.real()) || isnan(self.imag()));
+ }
+ }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp
new file mode 100644
index 0000000000..fd66e5d03c
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp
@@ -0,0 +1,239 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_DETAILS_HPP
+#define XSIMD_GENERIC_DETAILS_HPP
+
+#include <complex>
+
+#include "../../math/xsimd_rem_pio2.hpp"
+#include "../../types/xsimd_generic_arch.hpp"
+#include "../../types/xsimd_utils.hpp"
+#include "../xsimd_constants.hpp"
+
+namespace xsimd
+{
+ // Forward declarations. Should we put them in a separate file?
+ template <class T, class A>
+ inline batch<T, A> abs(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
+ template <class T, class A>
+ inline bool any(batch_bool<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+ template <class A, class T_out, class T_in>
+ inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
+ template <class T, class A>
+ inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
+ template <class T_out, class T_in, class A>
+ inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> cos(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> cosh(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> exp(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+ template <class T, class A>
+ inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+ template <class T, class A>
+ inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
+ template <class T, class A, uint64_t... Coefs>
+ inline batch<T, A> horner(const batch<T, A>& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> hypot(const batch<T, A>& self) noexcept;
+ template <class T, class A>
+ inline batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch_bool<T, A> isinf(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
+ template <class T, class A>
+ inline batch<T, A> log(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
+ template <class T, class A>
+ inline T reduce_add(batch<T, A> const&) noexcept;
+ template <class T, class A>
+ inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
+ template <class T, class A>
+ inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
+ template <class T, class A>
+ inline batch<T, A> sign(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> signnz(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> sin(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> sinh(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> sqrt(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> tan(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> trunc(batch<T, A> const& self) noexcept;
+
+ namespace kernel
+ {
+
+ namespace detail
+ {
+ template <class F, class A, class T, class... Batches>
+ inline batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
+ {
+ constexpr std::size_t size = batch<T, A>::size;
+ alignas(A::alignment()) T self_buffer[size];
+ alignas(A::alignment()) T other_buffer[size];
+ self.store_aligned(&self_buffer[0]);
+ other.store_aligned(&other_buffer[0]);
+ for (std::size_t i = 0; i < size; ++i)
+ {
+ self_buffer[i] = func(self_buffer[i], other_buffer[i]);
+ }
+ return batch<T, A>::load_aligned(self_buffer);
+ }
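+ // apply above is the scalar fallback used by the generic kernels: it
+ // spills both operands to aligned buffers, applies func lane by lane,
+ // then reloads the result, e.g.
+ // detail::apply([](T x, T y) noexcept { return x / y; }, self, other)
+ // is how the generic integer div is implemented.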
+
+ template <class U, class F, class A, class T>
+ inline batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<U, A>::size,
+ "Source and destination sizes must match");
+ constexpr std::size_t src_size = batch<T, A>::size;
+ constexpr std::size_t dest_size = batch<U, A>::size;
+ alignas(A::alignment()) T self_buffer[src_size];
+ alignas(A::alignment()) U other_buffer[dest_size];
+ self.store_aligned(&self_buffer[0]);
+ for (std::size_t i = 0; i < src_size; ++i)
+ {
+ other_buffer[i] = func(self_buffer[i]);
+ }
+ return batch<U, A>::load_aligned(other_buffer);
+ }
+ }
+
+ namespace detail
+ {
+ // Generic conversion handling machinery. Each architecture must define a
+ // conversion function when such a conversion exists in the form of an
+ // intrinsic. We then use that information to automatically decide whether
+ // to use the scalar or the vector conversion when doing load / store / batch_cast.
+ struct with_fast_conversion
+ {
+ };
+ struct with_slow_conversion
+ {
+ };
+
+ template <class A, class From, class To, class = void>
+ struct conversion_type_impl
+ {
+ using type = with_slow_conversion;
+ };
+
+ using xsimd::detail::void_t;
+
+ template <class A, class From, class To>
+ struct conversion_type_impl<A, From, To,
+ void_t<decltype(fast_cast(std::declval<const batch<From, A>&>(),
+ std::declval<const batch<To, A>&>(),
+ std::declval<const A&>()))>>
+ {
+ using type = with_fast_conversion;
+ };
+
+ template <class A, class From, class To>
+ using conversion_type = typename conversion_type_impl<A, From, To>::type;
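+ // Example: if an architecture provides an overload
+ // fast_cast(batch<int32_t, A> const&, batch<float, A> const&, A),
+ // the void_t specialization above is viable, conversion_type<A, int32_t,
+ // float> becomes with_fast_conversion, and batch_cast dispatches to it;
+ // otherwise the element-wise buffer copy is used.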
+ }
+
+ namespace detail
+ {
+ /* origin: boost/simdfunction/horn.hpp*/
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class B, uint64_t c>
+ inline B coef() noexcept
+ {
+ using value_type = typename B::value_type;
+ return B(bit_cast<value_type>(as_unsigned_integer_t<value_type>(c)));
+ }
+ template <class B>
+ inline B horner(const B&) noexcept
+ {
+ return B(typename B::value_type(0.));
+ }
+
+ template <class B, uint64_t c0>
+ inline B horner(const B&) noexcept
+ {
+ return coef<B, c0>();
+ }
+
+ template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
+ inline B horner(const B& self) noexcept
+ {
+ return fma(self, horner<B, c1, args...>(self), coef<B, c0>());
+ }
+
+ /* origin: boost/simdfunction/horn1.hpp*/
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class B>
+ inline B horner1(const B&) noexcept
+ {
+ return B(1.);
+ }
+
+ template <class B, uint64_t c0>
+ inline B horner1(const B& x) noexcept
+ {
+ return x + detail::coef<B, c0>();
+ }
+
+ template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
+ inline B horner1(const B& x) noexcept
+ {
+ return fma(x, horner1<B, c1, args...>(x), detail::coef<B, c0>());
+ }
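+ // horner evaluates a polynomial whose coefficients are given as the bit
+ // patterns of the value type (e.g. 0x3f800000 is 1.0f), constant term
+ // first: horner<B, c0, c1, c2>(x) = c0 + x*(c1 + x*c2).
+ // horner1 is the same with the highest-degree coefficient fixed to 1.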
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp
new file mode 100644
index 0000000000..dd446e83dd
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_logical.hpp
@@ -0,0 +1,163 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_LOGICAL_HPP
+#define XSIMD_GENERIC_LOGICAL_HPP
+
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+
+ using namespace types;
+
+ // from mask
+ template <class A, class T>
+ inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
+ // This is inefficient but should never be called. It's just a
+ // temporary implementation until arm support is added.
+ for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
+ buffer[i] = mask & (1ull << i);
+ return batch_bool<T, A>::load_aligned(buffer);
+ }
+
+ // ge
+ template <class A, class T>
+ inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return other <= self;
+ }
+
+ // gt
+ template <class A, class T>
+ inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return other < self;
+ }
+
+ // is_even
+ template <class A, class T>
+ inline batch_bool<T, A> is_even(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return is_flint(self * T(0.5));
+ }
+
+ // is_flint
+ template <class A, class T>
+ inline batch_bool<T, A> is_flint(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ auto frac = select(isnan(self - self), constants::nan<batch<T, A>>(), self - trunc(self));
+ return frac == T(0.);
+ }
+
+ // is_odd
+ template <class A, class T>
+ inline batch_bool<T, A> is_odd(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return is_even(self - T(1.));
+ }
+
+ // isinf
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> isinf(batch<T, A> const&, requires_arch<generic>) noexcept
+ {
+ return batch_bool<T, A>(false);
+ }
+ template <class A>
+ inline batch_bool<float, A> isinf(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ return abs(self) == std::numeric_limits<float>::infinity();
+ }
+ template <class A>
+ inline batch_bool<double, A> isinf(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ return abs(self) == std::numeric_limits<double>::infinity();
+ }
+
+ // isfinite
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> isfinite(batch<T, A> const&, requires_arch<generic>) noexcept
+ {
+ return batch_bool<T, A>(true);
+ }
+ template <class A>
+ inline batch_bool<float, A> isfinite(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ return (self - self) == 0.f;
+ }
+ template <class A>
+ inline batch_bool<double, A> isfinite(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ return (self - self) == 0.;
+ }
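+ // self - self is 0 for finite lanes but NaN for infinities and NaNs, and
+ // NaN == 0 is false, so the comparison is true exactly on finite lanes.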
+
+ // isnan
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> isnan(batch<T, A> const&, requires_arch<generic>) noexcept
+ {
+ return batch_bool<T, A>(false);
+ }
+
+ // le
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return (self < other) || (self == other);
+ }
+
+ // neq
+ template <class A, class T>
+ inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return !(other == self);
+ }
+
+ // logical_and
+ template <class A, class T>
+ inline batch<T, A> logical_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return detail::apply([](T x, T y) noexcept
+ { return x && y; },
+ self, other);
+ }
+
+ // logical_or
+ template <class A, class T>
+ inline batch<T, A> logical_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return detail::apply([](T x, T y) noexcept
+ { return x || y; },
+ self, other);
+ }
+
+ // mask
+ template <class A, class T>
+ inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
+ self.store_aligned(buffer);
+ // This is inefficient but should never be called. It's just a
+ // temporary implementation until arm support is added.
+ uint64_t res = 0;
+ for (size_t i = 0; i < batch_bool<T, A>::size; ++i)
+ if (buffer[i])
+ res |= 1ull << i;
+ return res;
+ }
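+ // mask and from_mask are inverses: bit i of the mask reflects lane i, so
+ // e.g. a 4-lane batch_bool {true, false, true, false} produces mask
+ // 0b0101, and from_mask(..., 0b0101, ...) reconstructs the same lanes.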
+ }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
new file mode 100644
index 0000000000..ea2f1567e4
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -0,0 +1,2418 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_MATH_HPP
+#define XSIMD_GENERIC_MATH_HPP
+
+#include "../xsimd_scalar.hpp"
+#include "./xsimd_generic_details.hpp"
+#include "./xsimd_generic_trigo.hpp"
+
+#include <type_traits>
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+
+ using namespace types;
+ // abs
+ template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+ inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ if (std::is_unsigned<T>::value)
+ return self;
+ else
+ {
+ auto sign = bitofsign(self);
+ auto inv = self ^ sign;
+ return inv - sign;
+ }
+ }
+
+ template <class A, class T>
+ inline batch<T, A> abs(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+ return hypot(z.real(), z.imag());
+ }
+
+ // batch_cast
+ template <class A, class T>
+ inline batch<T, A> batch_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept
+ {
+ return self;
+ }
+
+ namespace detail
+ {
+ template <class A, class T_out, class T_in>
+ inline batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const& out, requires_arch<generic>, with_fast_conversion) noexcept
+ {
+ return fast_cast(self, out, A {});
+ }
+ template <class A, class T_out, class T_in>
+ inline batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<generic>, with_slow_conversion) noexcept
+ {
+ static_assert(!std::is_same<T_in, T_out>::value, "there should be no conversion for this type combination");
+ using batch_type_in = batch<T_in, A>;
+ using batch_type_out = batch<T_out, A>;
+ static_assert(batch_type_in::size == batch_type_out::size, "compatible sizes");
+ alignas(A::alignment()) T_in buffer_in[batch_type_in::size];
+ alignas(A::alignment()) T_out buffer_out[batch_type_out::size];
+ self.store_aligned(&buffer_in[0]);
+ std::copy(std::begin(buffer_in), std::end(buffer_in), std::begin(buffer_out));
+ return batch_type_out::load_aligned(buffer_out);
+ }
+
+ }
+
+ template <class A, class T_out, class T_in>
+ inline batch<T_out, A> batch_cast(batch<T_in, A> const& self, batch<T_out, A> const& out, requires_arch<generic>) noexcept
+ {
+ return detail::batch_cast(self, out, A {}, detail::conversion_type<A, T_in, T_out> {});
+ }
+
+ // bitofsign
+ template <class A, class T>
+ inline batch<T, A> bitofsign(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ static_assert(std::is_integral<T>::value, "int type implementation");
+ if (std::is_unsigned<T>::value)
+ return batch<T, A>(0);
+ else
+ return self >> (T)(8 * sizeof(T) - 1);
+ }
+
+ template <class A>
+ inline batch<float, A> bitofsign(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ return self & constants::minuszero<batch<float, A>>();
+ }
+ template <class A>
+ inline batch<double, A> bitofsign(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ return self & constants::minuszero<batch<double, A>>();
+ }
+
+ // bitwise_cast
+ template <class A, class T>
+ inline batch<T, A> bitwise_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept
+ {
+ return self;
+ }
+
+ // cbrt
+ /* origin: boost/simd/arch/common/simd/function/cbrt.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A>
+ inline batch<float, A> cbrt(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type z = abs(self);
+#ifndef XSIMD_NO_DENORMALS
+ auto denormal = z < constants::smallestposval<batch_type>();
+ z = select(denormal, z * constants::twotonmb<batch_type>(), z);
+ batch_type f = select(denormal, constants::twotonmbo3<batch_type>(), batch_type(1.));
+#endif
+ const batch_type CBRT2(bit_cast<float>(0x3fa14518));
+ const batch_type CBRT4(bit_cast<float>(0x3fcb2ff5));
+ const batch_type CBRT2I(bit_cast<float>(0x3f4b2ff5));
+ const batch_type CBRT4I(bit_cast<float>(0x3f214518));
+ using i_type = as_integer_t<batch_type>;
+ i_type e;
+ batch_type x = frexp(z, e);
+ x = detail::horner<batch_type,
+ 0x3ece0609,
+ 0x3f91eb77,
+ 0xbf745265,
+ 0x3f0bf0fe,
+ 0xbe09e49a>(x);
+ auto flag = e >= i_type(0);
+ i_type e1 = abs(e);
+ i_type rem = e1;
+ e1 /= i_type(3);
+ rem -= e1 * i_type(3);
+ e = e1 * sign(e);
+ const batch_type cbrt2 = select(batch_bool_cast<float>(flag), CBRT2, CBRT2I);
+ const batch_type cbrt4 = select(batch_bool_cast<float>(flag), CBRT4, CBRT4I);
+ batch_type fact = select(batch_bool_cast<float>(rem == i_type(1)), cbrt2, batch_type(1.));
+ fact = select(batch_bool_cast<float>(rem == i_type(2)), cbrt4, fact);
+ x = ldexp(x * fact, e);
+ x -= (x - z / (x * x)) * batch_type(1.f / 3.f);
+#ifndef XSIMD_NO_DENORMALS
+ x = (x | bitofsign(self)) * f;
+#else
+ x = x | bitofsign(self);
+#endif
+#ifndef XSIMD_NO_INFINITIES
+ return select(self == batch_type(0.) || isinf(self), self, x);
+#else
+ return select(self == batch_type(0.), self, x);
+#endif
+ }
+
+ template <class A>
+ inline batch<double, A> cbrt(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type z = abs(self);
+#ifndef XSIMD_NO_DENORMALS
+ auto denormal = z < constants::smallestposval<batch_type>();
+ z = select(denormal, z * constants::twotonmb<batch_type>(), z);
+ batch_type f = select(denormal, constants::twotonmbo3<batch_type>(), batch_type(1.));
+#endif
+ const batch_type CBRT2(bit_cast<double>(int64_t(0x3ff428a2f98d728b)));
+ const batch_type CBRT4(bit_cast<double>(int64_t(0x3ff965fea53d6e3d)));
+ const batch_type CBRT2I(bit_cast<double>(int64_t(0x3fe965fea53d6e3d)));
+ const batch_type CBRT4I(bit_cast<double>(int64_t(0x3fe428a2f98d728b)));
+ using i_type = as_integer_t<batch_type>;
+ i_type e;
+ batch_type x = frexp(z, e);
+ x = detail::horner<batch_type,
+ 0x3fd9c0c12122a4feull,
+ 0x3ff23d6ee505873aull,
+ 0xbfee8a4ca3ba37b8ull,
+ 0x3fe17e1fc7e59d58ull,
+ 0xbfc13c93386fdff6ull>(x);
+ auto flag = e >= typename i_type::value_type(0);
+ i_type e1 = abs(e);
+ i_type rem = e1;
+ e1 /= i_type(3);
+ rem -= e1 * i_type(3);
+ e = e1 * sign(e);
+ const batch_type cbrt2 = select(batch_bool_cast<double>(flag), CBRT2, CBRT2I);
+ const batch_type cbrt4 = select(batch_bool_cast<double>(flag), CBRT4, CBRT4I);
+ batch_type fact = select(batch_bool_cast<double>(rem == i_type(1)), cbrt2, batch_type(1.));
+ fact = select(batch_bool_cast<double>(rem == i_type(2)), cbrt4, fact);
+ x = ldexp(x * fact, e);
+ x -= (x - z / (x * x)) * batch_type(1. / 3.);
+ x -= (x - z / (x * x)) * batch_type(1. / 3.);
+#ifndef XSIMD_NO_DENORMALS
+ x = (x | bitofsign(self)) * f;
+#else
+ x = x | bitofsign(self);
+#endif
+#ifndef XSIMD_NO_INFINITIES
+ return select(self == batch_type(0.) || isinf(self), self, x);
+#else
+ return select(self == batch_type(0.), self, x);
+#endif
+ }
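+ // Both cbrt kernels follow the same plan: reduce z = x * 2^e to [0.5, 1)
+ // with frexp, evaluate a polynomial first guess, divide the exponent by 3
+ // (the CBRT2/CBRT4 factors absorb e mod 3), rescale with ldexp, then
+ // polish with Newton-Raphson steps x -= (x - z/x^2) / 3 (one step for
+ // float, two for double).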
+
+ // clip
+ template <class A, class T>
+ inline batch<T, A> clip(batch<T, A> const& self, batch<T, A> const& lo, batch<T, A> const& hi, requires_arch<generic>) noexcept
+ {
+ return min(hi, max(self, lo));
+ }
+
+ // copysign
+ template <class A, class T, class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+ inline batch<T, A> copysign(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return abs(self) | bitofsign(other);
+ }
+
+ // erf
+
+ namespace detail
+ {
+ /* origin: boost/simd/arch/common/detail/generic/erf_kernel.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class B>
+ struct erf_kernel;
+
+ template <class A>
+ struct erf_kernel<batch<float, A>>
+ {
+ using batch_type = batch<float, A>;
+ // computes erf(a0)/a0
+ // x is sqr(a0) and 0 <= abs(a0) <= 2/3
+ static inline batch_type erf1(const batch_type& x) noexcept
+ {
+ return detail::horner<batch_type,
+ 0x3f906eba, // 1.128379154774254e+00
+ 0xbec0937e, // -3.761252839094832e-01
+ 0x3de70f22, // 1.128218315189123e-01
+ 0xbcdb61f4, // -2.678010670585737e-02
+ 0x3ba4468d, // 5.013293006147870e-03
+ 0xba1fc83b // -6.095205117313012e-04
+ >(x);
+ }
+
+ // computes erfc(x)*exp(sqr(x))
+ // x >= 2/3
+ static inline batch_type erfc2(const batch_type& x) noexcept
+ {
+ return detail::horner<batch_type,
+ 0x3f0a0e8b, // 5.392844046572836e-01
+ 0xbf918a62, // -1.137035586823118e+00
+ 0x3e243828, // 1.603704761054187e-01
+ 0x3ec4ca6e, // 3.843569094305250e-01
+ 0x3e1175c7, // 1.420508523645926e-01
+ 0x3e2006f0, // 1.562764709849380e-01
+ 0xbfaea865, // -1.364514006347145e+00
+ 0x4050b063, // 3.260765682222576e+00
+ 0xc0cd1a85, // -6.409487379234005e+00
+ 0x40d67e3b, // 6.702908785399893e+00
+ 0xc0283611 // -2.628299919293280e+00
+ >(x);
+ }
+
+ static inline batch_type erfc3(const batch_type& x) noexcept
+ {
+ return (batch_type(1.) - x) * detail::horner<batch_type,
+ 0x3f7ffffe, // 9.9999988e-01
+ 0xbe036d7e, // -1.2834737e-01
+ 0xbfa11698, // -1.2585020e+00
+ 0xbffc9284, // -1.9732213e+00
+ 0xc016c985, // -2.3560498e+00
+ 0x3f2cff3b, // 6.7576951e-01
+ 0xc010d956, // -2.2632651e+00
+ 0x401b5680, // 2.4271545e+00
+ 0x41aa8e55 // 2.1319498e+01
+ >(x);
+ }
+ };
+
+ template <class A>
+ struct erf_kernel<batch<double, A>>
+ {
+ using batch_type = batch<double, A>;
+ // computes erf(a0)/a0
+ // x is sqr(a0) and 0 <= abs(a0) <= 0.65
+ static inline batch_type erf1(const batch_type& x) noexcept
+ {
+ return detail::horner<batch_type,
+ 0x3ff20dd750429b61ull, // 1.12837916709551
+ 0x3fc16500f106c0a5ull, // 0.135894887627278
+ 0x3fa4a59a4f02579cull, // 4.03259488531795E-02
+ 0x3f53b7664358865aull, // 1.20339380863079E-03
+ 0x3f110512d5b20332ull // 6.49254556481904E-05
+ >(x)
+ / detail::horner<batch_type,
+ 0x3ff0000000000000ull, // 1
+ 0x3fdd0a84eb1ca867ull, // 0.453767041780003
+ 0x3fb64536ca92ea2full, // 8.69936222615386E-02
+ 0x3f8166f75999dbd1ull, // 8.49717371168693E-03
+ 0x3f37ea4332348252ull // 3.64915280629351E-04
+ >(x);
+ }
+
+ // computes erfc(x)*exp(x*x)
+ // 0.65 <= abs(x) <= 2.2
+ static inline batch_type erfc2(const batch_type& x) noexcept
+ {
+ return detail::horner<batch_type,
+ 0x3feffffffbbb552bull, // 0.999999992049799
+ 0x3ff54dfe9b258a60ull, // 1.33154163936765
+ 0x3fec1986509e687bull, // 0.878115804155882
+ 0x3fd53dd7a67c7e9full, // 0.331899559578213
+ 0x3fb2488a6b5cb5e5ull, // 7.14193832506776E-02
+ 0x3f7cf4cfe0aacbb4ull, // 7.06940843763253E-03
+ 0x0ull // 0
+ >(x)
+ / detail::horner<batch_type,
+ 0x3ff0000000000000ull, // 1
+ 0x4003adeae79b9708ull, // 2.45992070144246
+ 0x40053b1052dca8bdull, // 2.65383972869776
+ 0x3ff9e677c2777c3cull, // 1.61876655543871
+ 0x3fe307622fcff772ull, // 0.594651311286482
+ 0x3fc033c113a7deeeull, // 0.126579413030178
+ 0x3f89a996639b0d00ull // 1.25304936549413E-02
+ >(x);
+ }
+
+ // computes erfc(x)*exp(x*x)
+ // 2.2 <= abs(x) <= 6
+ static inline batch_type erfc3(const batch_type& x) noexcept
+ {
+ return detail::horner<batch_type,
+ 0x3fefff5a9e697ae2ull, // 0.99992114009714
+ 0x3ff9fa202deb88e5ull, // 1.62356584489367
+ 0x3ff44744306832aeull, // 1.26739901455873
+ 0x3fe29be1cff90d94ull, // 0.581528574177741
+ 0x3fc42210f88b9d43ull, // 0.157289620742839
+ 0x3f971d0907ea7a92ull, // 2.25716982919218E-02
+ 0x0ll // 0
+ >(x)
+ / detail::horner<batch_type,
+ 0x3ff0000000000000ull, // 1
+ 0x400602f24bf3fdb6ull, // 2.75143870676376
+ 0x400afd487397568full, // 3.37367334657285
+ 0x400315ffdfd5ce91ull, // 2.38574194785344
+ 0x3ff0cfd4cb6cde9full, // 1.05074004614827
+ 0x3fd1d7ab774bb837ull, // 0.278788439273629
+ 0x3fa47bd61bbb3843ull // 4.00072964526861E-02
+ >(x);
+ }
+
+ // computes erfc(rx)*exp(rx*rx)
+ // x >= 6 rx = 1/x
+ static inline batch_type erfc4(const batch_type& x) noexcept
+ {
+ return detail::horner<batch_type,
+ 0xbc7e4ad1ec7d0000ll, // -2.627435221016534e-17
+ 0x3fe20dd750429a16ll, // 5.641895835477182e-01
+ 0x3db60000e984b501ll, // 2.000889609806154e-11
+ 0xbfd20dd753ae5dfdll, // -2.820947949598745e-01
+ 0x3e907e71e046a820ll, // 2.457786367990903e-07
+ 0x3fdb1494cac06d39ll, // 4.231311779019112e-01
+ 0x3f34a451701654f1ll, // 3.149699042180451e-04
+ 0xbff105e6b8ef1a63ll, // -1.063940737150596e+00
+ 0x3fb505a857e9ccc8ll, // 8.211757799454056e-02
+ 0x40074fbabc514212ll, // 2.913930388669777e+00
+ 0x4015ac7631f7ac4fll, // 5.418419628850713e+00
+ 0xc0457e03041e9d8bll, // -4.298446704382794e+01
+ 0x4055803d26c4ec4fll, // 8.600373238783617e+01
+ 0xc0505fce04ec4ec5ll // -6.549694941594051e+01
+ >(x);
+ }
+ };
+ }
+ /* origin: boost/simd/arch/common/simd/function/erf.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+
+ template <class A>
+ inline batch<float, A> erf(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type x = abs(self);
+ batch_type r1(0.);
+ auto test1 = x < batch_type(2.f / 3.f);
+ if (any(test1))
+ {
+ r1 = self * detail::erf_kernel<batch_type>::erf1(x * x);
+ if (all(test1))
+ return r1;
+ }
+ batch_type z = x / (batch_type(1.) + x);
+ z -= batch_type(0.4f);
+ batch_type r2 = batch_type(1.) - exp(-x * x) * detail::erf_kernel<batch_type>::erfc2(z);
+ r2 = select(self < batch_type(0.), -r2, r2);
+ r1 = select(test1, r1, r2);
+#ifndef XSIMD_NO_INFINITIES
+ r1 = select(xsimd::isinf(self), sign(self), r1);
+#endif
+ return r1;
+ }
+
+ template <class A>
+ inline batch<double, A> erf(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type x = abs(self);
+ batch_type xx = x * x;
+ batch_type lim1(0.65);
+ batch_type lim2(2.2);
+ auto test1 = x < lim1;
+ batch_type r1(0.);
+ if (any(test1))
+ {
+ r1 = self * detail::erf_kernel<batch_type>::erf1(xx);
+ if (all(test1))
+ return r1;
+ }
+ auto test2 = x < lim2;
+ auto test3 = test2 && !test1;
+ batch_type ex = exp(-xx);
+ if (any(test3))
+ {
+ batch_type z = batch_type(1.) - ex * detail::erf_kernel<batch_type>::erfc2(x);
+ batch_type r2 = select(self < batch_type(0.), -z, z);
+ r1 = select(test1, r1, r2);
+ if (all(test1 || test3))
+ return r1;
+ }
+ batch_type z = batch_type(1.) - ex * detail::erf_kernel<batch_type>::erfc3(x);
+ z = select(self < batch_type(0.), -z, z);
+#ifndef XSIMD_NO_INFINITIES
+ z = select(xsimd::isinf(self), sign(self), z);
+#endif
+ return select(test2, r1, z);
+ }
+
+ // erfc
+ template <class A>
+ inline batch<float, A> erfc(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type x = abs(self);
+ auto test0 = self < batch_type(0.);
+ batch_type r1(0.);
+ auto test1 = 3.f * x < 2.f;
+ batch_type z = x / (batch_type(1.) + x);
+ if (any(test1))
+ {
+ r1 = detail::erf_kernel<batch_type>::erfc3(z);
+ if (all(test1))
+ return select(test0, batch_type(2.) - r1, r1);
+ }
+ z -= batch_type(0.4f);
+ batch_type r2 = exp(-x * x) * detail::erf_kernel<batch_type>::erfc2(z);
+ r1 = select(test1, r1, r2);
+#ifndef XSIMD_NO_INFINITIES
+ r1 = select(x == constants::infinity<batch_type>(), batch_type(0.), r1);
+#endif
+ return select(test0, batch_type(2.) - r1, r1);
+ }
+
+ template <class A>
+ inline batch<double, A> erfc(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type x = abs(self);
+ batch_type xx = x * x;
+ batch_type lim1(0.65);
+ batch_type lim2(2.2);
+ auto test0 = self < batch_type(0.);
+ auto test1 = x < lim1;
+ batch_type r1(0.);
+ if (any(test1))
+ {
+ r1 = batch_type(1.) - x * detail::erf_kernel<batch_type>::erf1(xx);
+ if (all(test1))
+ return select(test0, batch_type(2.) - r1, r1);
+ }
+ auto test2 = x < lim2;
+ auto test3 = test2 && !test1;
+ batch_type ex = exp(-xx);
+ if (any(test3))
+ {
+ batch_type z = ex * detail::erf_kernel<batch_type>::erfc2(x);
+ r1 = select(test1, r1, z);
+ if (all(test1 || test3))
+ return select(test0, batch_type(2.) - r1, r1);
+ }
+ batch_type z = ex * detail::erf_kernel<batch_type>::erfc3(x);
+ r1 = select(test2, r1, z);
+#ifndef XSIMD_NO_INFINITIES
+ r1 = select(x == constants::infinity<batch_type>(), batch_type(0.), r1);
+#endif
+ return select(test0, batch_type(2.) - r1, r1);
+ }
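+ // erf and erfc are evaluated piecewise over |x| regions (for double:
+ // below 0.65, between 0.65 and 2.2, and above 2.2), with the any/all
+ // tests short-circuiting whole batches that fall into a single region.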
+
+ // estrin
+ namespace detail
+ {
+
+ template <class B>
+ struct estrin
+ {
+ B x;
+
+ template <typename... Ts>
+ inline B operator()(const Ts&... coefs) noexcept
+ {
+ return eval(coefs...);
+ }
+
+ private:
+ inline B eval(const B& c0) noexcept
+ {
+ return c0;
+ }
+
+ inline B eval(const B& c0, const B& c1) noexcept
+ {
+ return fma(x, c1, c0);
+ }
+
+ template <size_t... Is, class Tuple>
+ inline B eval(::xsimd::detail::index_sequence<Is...>, const Tuple& tuple)
+ {
+ return estrin { x * x }(std::get<Is>(tuple)...);
+ }
+
+ template <class... Args>
+ inline B eval(const std::tuple<Args...>& tuple) noexcept
+ {
+ return eval(::xsimd::detail::make_index_sequence<sizeof...(Args)>(), tuple);
+ }
+
+ template <class... Args>
+ inline B eval(const std::tuple<Args...>& tuple, const B& c0) noexcept
+ {
+ return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0))));
+ }
+
+ template <class... Args>
+ inline B eval(const std::tuple<Args...>& tuple, const B& c0, const B& c1) noexcept
+ {
+ return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))));
+ }
+
+ template <class... Args, class... Ts>
+ inline B eval(const std::tuple<Args...>& tuple, const B& c0, const B& c1, const Ts&... coefs) noexcept
+ {
+ return eval(std::tuple_cat(tuple, std::make_tuple(eval(c0, c1))), coefs...);
+ }
+
+ template <class... Ts>
+ inline B eval(const B& c0, const B& c1, const Ts&... coefs) noexcept
+ {
+ return eval(std::make_tuple(eval(c0, c1)), coefs...);
+ }
+ };
+ }
+
+ template <class T, class A, uint64_t... Coefs>
+ inline batch<T, A> estrin(const batch<T, A>& self) noexcept
+ {
+ using batch_type = batch<T, A>;
+ return detail::estrin<batch_type> { self }(detail::coef<batch_type, Coefs>()...);
+ }
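+ // Estrin's scheme pairs adjacent coefficients and recurses on x*x:
+ // c0 + c1*x + c2*x^2 + c3*x^3 = (c0 + c1*x) + (x*x)*(c2 + c3*x),
+ // which exposes more instruction-level parallelism than Horner's rule
+ // at the cost of an extra multiply.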
+
+ // exp
+ /* origin: boost/simd/arch/common/detail/simd/expo_base.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ namespace detail
+ {
+ enum exp_reduction_tag
+ {
+ exp_tag,
+ exp2_tag,
+ exp10_tag
+ };
+
+ template <class B, exp_reduction_tag Tag>
+ struct exp_reduction_base;
+
+ template <class B>
+ struct exp_reduction_base<B, exp_tag>
+ {
+ static constexpr B maxlog() noexcept
+ {
+ return constants::maxlog<B>();
+ }
+
+ static constexpr B minlog() noexcept
+ {
+ return constants::minlog<B>();
+ }
+ };
+
+ template <class B>
+ struct exp_reduction_base<B, exp10_tag>
+ {
+ static constexpr B maxlog() noexcept
+ {
+ return constants::maxlog10<B>();
+ }
+
+ static constexpr B minlog() noexcept
+ {
+ return constants::minlog10<B>();
+ }
+ };
+
+ template <class B>
+ struct exp_reduction_base<B, exp2_tag>
+ {
+ static constexpr B maxlog() noexcept
+ {
+ return constants::maxlog2<B>();
+ }
+
+ static constexpr B minlog() noexcept
+ {
+ return constants::minlog2<B>();
+ }
+ };
+
+ template <class T, class A, exp_reduction_tag Tag>
+ struct exp_reduction;
+
+ template <class A>
+ struct exp_reduction<float, A, exp_tag> : exp_reduction_base<batch<float, A>, exp_tag>
+ {
+ using batch_type = batch<float, A>;
+ static inline batch_type approx(const batch_type& x) noexcept
+ {
+ batch_type y = detail::horner<batch_type,
+ 0x3f000000, // 5.0000000e-01
+ 0x3e2aa9a5, // 1.6666277e-01
+ 0x3d2aa957, // 4.1665401e-02
+ 0x3c098d8b, // 8.3955629e-03
+ 0x3ab778cf // 1.3997796e-03
+ >(x);
+ return ++fma(y, x * x, x);
+ }
+
+ static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept
+ {
+ batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+ x = fnma(k, constants::log_2hi<batch_type>(), a);
+ x = fnma(k, constants::log_2lo<batch_type>(), x);
+ return k;
+ }
+ };
+
+ template <class A>
+ struct exp_reduction<float, A, exp10_tag> : exp_reduction_base<batch<float, A>, exp10_tag>
+ {
+ using batch_type = batch<float, A>;
+ static inline batch_type approx(const batch_type& x) noexcept
+ {
+ return ++(detail::horner<batch_type,
+ 0x40135d8e, // 2.3025851e+00
+ 0x4029a926, // 2.6509490e+00
+ 0x400237da, // 2.0346589e+00
+ 0x3f95eb4c, // 1.1712432e+00
+ 0x3f0aacef, // 5.4170126e-01
+ 0x3e54dff1 // 2.0788552e-01
+ >(x)
+ * x);
+ }
+
+ static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept
+ {
+ batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
+ x = fnma(k, constants::log10_2hi<batch_type>(), a);
+ x -= k * constants::log10_2lo<batch_type>();
+ return k;
+ }
+ };
+
+ template <class A>
+ struct exp_reduction<float, A, exp2_tag> : exp_reduction_base<batch<float, A>, exp2_tag>
+ {
+ using batch_type = batch<float, A>;
+ static inline batch_type approx(const batch_type& x) noexcept
+ {
+ batch_type y = detail::horner<batch_type,
+ 0x3e75fdf1, // 2.4022652e-01
+ 0x3d6356eb, // 5.5502813e-02
+ 0x3c1d9422, // 9.6178371e-03
+ 0x3ab01218, // 1.3433127e-03
+ 0x3922c8c4 // 1.5524315e-04
+ >(x);
+ return ++fma(y, x * x, x * constants::log_2<batch_type>());
+ }
+
+ static inline batch_type reduce(const batch_type& a, batch_type& x) noexcept
+ {
+ batch_type k = nearbyint(a);
+ x = (a - k);
+ return k;
+ }
+ };
+
+ template <class A>
+ struct exp_reduction<double, A, exp_tag> : exp_reduction_base<batch<double, A>, exp_tag>
+ {
+ using batch_type = batch<double, A>;
+ static inline batch_type approx(const batch_type& x) noexcept
+ {
+ batch_type t = x * x;
+ return fnma(t,
+ detail::horner<batch_type,
+ 0x3fc555555555553eull,
+ 0xbf66c16c16bebd93ull,
+ 0x3f11566aaf25de2cull,
+ 0xbebbbd41c5d26bf1ull,
+ 0x3e66376972bea4d0ull>(t),
+ x);
+ }
+
+ static inline batch_type reduce(const batch_type& a, batch_type& hi, batch_type& lo, batch_type& x) noexcept
+ {
+ batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+ hi = fnma(k, constants::log_2hi<batch_type>(), a);
+ lo = k * constants::log_2lo<batch_type>();
+ x = hi - lo;
+ return k;
+ }
+
+ static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type& hi, const batch_type& lo) noexcept
+ {
+ return batch_type(1.) - (((lo - (x * c) / (batch_type(2.) - c)) - hi));
+ }
+ };
+
+ template <class A>
+ struct exp_reduction<double, A, exp10_tag> : exp_reduction_base<batch<double, A>, exp10_tag>
+ {
+ using batch_type = batch<double, A>;
+ static inline batch_type approx(const batch_type& x) noexcept
+ {
+ batch_type xx = x * x;
+ batch_type px = x * detail::horner<batch_type, 0x40a2b4798e134a01ull, 0x40796b7a050349e4ull, 0x40277d9474c55934ull, 0x3fa4fd75f3062dd4ull>(xx);
+ batch_type x2 = px / (detail::horner1<batch_type, 0x40a03f37650df6e2ull, 0x4093e05eefd67782ull, 0x405545fdce51ca08ull>(xx) - px);
+ return ++(x2 + x2);
+ }
+
+ static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
+ {
+ batch_type k = nearbyint(constants::invlog10_2<batch_type>() * a);
+ x = fnma(k, constants::log10_2hi<batch_type>(), a);
+ x = fnma(k, constants::log10_2lo<batch_type>(), x);
+ return k;
+ }
+
+ static inline batch_type finalize(const batch_type&, const batch_type& c, const batch_type&, const batch_type&) noexcept
+ {
+ return c;
+ }
+ };
+
+ template <class A>
+ struct exp_reduction<double, A, exp2_tag> : exp_reduction_base<batch<double, A>, exp2_tag>
+ {
+ using batch_type = batch<double, A>;
+ static inline batch_type approx(const batch_type& x) noexcept
+ {
+ batch_type t = x * x;
+ return fnma(t,
+ detail::horner<batch_type,
+ 0x3fc555555555553eull,
+ 0xbf66c16c16bebd93ull,
+ 0x3f11566aaf25de2cull,
+ 0xbebbbd41c5d26bf1ull,
+ 0x3e66376972bea4d0ull>(t),
+ x);
+ }
+
+ static inline batch_type reduce(const batch_type& a, batch_type&, batch_type&, batch_type& x) noexcept
+ {
+ batch_type k = nearbyint(a);
+ x = (a - k) * constants::log_2<batch_type>();
+ return k;
+ }
+
+ static inline batch_type finalize(const batch_type& x, const batch_type& c, const batch_type&, const batch_type&) noexcept
+ {
+ return batch_type(1.) + x + x * c / (batch_type(2.) - c);
+ }
+ };
+
+ template <exp_reduction_tag Tag, class A>
+ inline batch<float, A> exp(batch<float, A> const& self) noexcept
+ {
+ using batch_type = batch<float, A>;
+ using reducer_t = exp_reduction<float, A, Tag>;
+ batch_type x;
+ batch_type k = reducer_t::reduce(self, x);
+ x = reducer_t::approx(x);
+ x = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(x, to_int(k)));
+ x = select(self >= reducer_t::maxlog(), constants::infinity<batch_type>(), x);
+ return x;
+ }
+
+ template <exp_reduction_tag Tag, class A>
+ inline batch<double, A> exp(batch<double, A> const& self) noexcept
+ {
+ using batch_type = batch<double, A>;
+ using reducer_t = exp_reduction<double, A, Tag>;
+ batch_type hi, lo, x;
+ batch_type k = reducer_t::reduce(self, hi, lo, x);
+ batch_type c = reducer_t::approx(x);
+ c = reducer_t::finalize(x, c, hi, lo);
+ c = select(self <= reducer_t::minlog(), batch_type(0.), ldexp(c, to_int(k)));
+ c = select(self >= reducer_t::maxlog(), constants::infinity<batch_type>(), c);
+ return c;
+ }
+ }
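+ // The reduction writes e^a = 2^k * e^x with k = nearbyint(a / log(2)) and
+ // x = a - k*log(2), log(2) being split into hi/lo parts for accuracy, so
+ // the polynomial only has to approximate e^x on a small interval around 0;
+ // ldexp then applies the 2^k scaling. exp2 and exp10 reduce analogously
+ // with their respective logarithm constants.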
+
+ template <class A, class T>
+ inline batch<T, A> exp(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::exp<detail::exp_tag>(self);
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> exp(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ auto isincos = sincos(self.imag());
+ return exp(self.real()) * batch_type(std::get<1>(isincos), std::get<0>(isincos));
+ }
+
+ // exp10
+ template <class A, class T>
+ inline batch<T, A> exp10(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::exp<detail::exp10_tag>(self);
+ }
+
+ // exp2
+ template <class A, class T>
+ inline batch<T, A> exp2(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::exp<detail::exp2_tag>(self);
+ }
+
+ // expm1
+ namespace detail
+ {
+ /* origin: boost/simd/arch/common/detail/generic/expm1_kernel.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A>
+ static inline batch<float, A> expm1(const batch<float, A>& a) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+ batch_type x = fnma(k, constants::log_2hi<batch_type>(), a);
+ x = fnma(k, constants::log_2lo<batch_type>(), x);
+ batch_type hx = x * batch_type(0.5);
+ batch_type hxs = x * hx;
+ batch_type r = detail::horner<batch_type,
+ 0X3F800000UL, // 1
+ 0XBD08887FUL, // -3.3333298E-02
+ 0X3ACF6DB4UL // 1.582554E-03
+ >(hxs);
+ batch_type t = fnma(r, hx, batch_type(3.));
+ batch_type e = hxs * ((r - t) / (batch_type(6.) - x * t));
+ e = fms(x, e, hxs);
+ using i_type = as_integer_t<batch_type>;
+ i_type ik = to_int(k);
+ batch_type two2mk = ::xsimd::bitwise_cast<float>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
+ batch_type y = batch_type(1.) - two2mk - (e - x);
+ return ldexp(y, ik);
+ }
+
+ template <class A>
+ static inline batch<double, A> expm1(const batch<double, A>& a) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type k = nearbyint(constants::invlog_2<batch_type>() * a);
+ batch_type hi = fnma(k, constants::log_2hi<batch_type>(), a);
+ batch_type lo = k * constants::log_2lo<batch_type>();
+ batch_type x = hi - lo;
+ batch_type hxs = x * x * batch_type(0.5);
+ batch_type r = detail::horner<batch_type,
+ 0X3FF0000000000000ULL,
+ 0XBFA11111111110F4ULL,
+ 0X3F5A01A019FE5585ULL,
+ 0XBF14CE199EAADBB7ULL,
+ 0X3ED0CFCA86E65239ULL,
+ 0XBE8AFDB76E09C32DULL>(hxs);
+ batch_type t = batch_type(3.) - r * batch_type(0.5) * x;
+ batch_type e = hxs * ((r - t) / (batch_type(6) - x * t));
+ batch_type c = (hi - x) - lo;
+ e = (x * (e - c) - c) - hxs;
+ using i_type = as_integer_t<batch_type>;
+ i_type ik = to_int(k);
+ batch_type two2mk = ::xsimd::bitwise_cast<double>((constants::maxexponent<batch_type>() - ik) << constants::nmb<batch_type>());
+ batch_type ct1 = batch_type(1.) - two2mk - (e - x);
+ batch_type ct2 = ++(x - (e + two2mk));
+ batch_type y = select(k < batch_type(20.), ct1, ct2);
+ return ldexp(y, ik);
+ }
+
+ }
+
+ template <class A, class T>
+ inline batch<T, A> expm1(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ return select(self < constants::logeps<batch_type>(),
+ batch_type(-1.),
+ select(self > constants::maxlog<batch_type>(),
+ constants::infinity<batch_type>(),
+ detail::expm1(self)));
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> expm1(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ using real_batch = typename batch_type::real_batch;
+ real_batch isin = sin(z.imag());
+ real_batch rem1 = expm1(z.real());
+ real_batch re = rem1 + 1.;
+ real_batch si = sin(z.imag() * 0.5);
+ return { rem1 - 2. * re * si * si, re * isin };
+ }
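+ // Real part: exp(x)*cos(y) - 1 = expm1(x) - 2*exp(x)*sin^2(y/2), using
+ // the identity 1 - cos(y) = 2*sin^2(y/2); this avoids the cancellation
+ // of computing exp(x)*cos(y) - 1 directly near z = 0.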
+
+ // polar
+ template <class A, class T>
+ inline batch<std::complex<T>, A> polar(const batch<T, A>& r, const batch<T, A>& theta, requires_arch<generic>) noexcept
+ {
+ auto sincosTheta = sincos(theta);
+ return { r * sincosTheta.second, r * sincosTheta.first };
+ }
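+ // sincos returns the pair (sin(theta), cos(theta)), hence .second feeds
+ // the real part: polar(r, theta) = r*cos(theta) + i*r*sin(theta).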
+
+ // fdim
+ template <class A, class T>
+ inline batch<T, A> fdim(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return fmax(batch<T, A>(0), self - other);
+ }
+
+ // fmod
+ template <class A, class T>
+ inline batch<T, A> fmod(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return fnma(trunc(self / other), other, self);
+ }
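+ // Computes self - trunc(self / other) * other in a single fnma, matching
+ // std::fmod: the result carries the sign of self, e.g. fmod(5.5, 2.) is
+ // 1.5 and fmod(-5.5, 2.) is -1.5.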
+
+ // frexp
+ /* origin: boost/simd/arch/common/simd/function/ifrexp.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A, class T>
+ inline batch<T, A> frexp(const batch<T, A>& self, batch<as_integer_t<T>, A>& exp, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ using int_type = as_integer_t<T>;
+ using i_type = batch<int_type, A>;
+ i_type m1f = constants::mask1frexp<batch_type>();
+ i_type r1 = m1f & ::xsimd::bitwise_cast<int_type>(self);
+ batch_type x = self & ::xsimd::bitwise_cast<T>(~m1f);
+ exp = (r1 >> constants::nmb<batch_type>()) - constants::maxexponentm1<batch_type>();
+            exp = select(batch_bool_cast<int_type>(self != batch_type(0.)), exp, i_type(int_type(0)));
+ return select((self != batch_type(0.)), x | ::xsimd::bitwise_cast<T>(constants::mask2frexp<batch_type>()), batch_type(0.));
+ }
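+
+        // Decomposes each lane as self = m * 2^exp with |m| in [0.5, 1): the
+        // biased exponent field is extracted into `exp`, then replaced by the
+        // exponent bits of 0.5. E.g. frexp(8.0) stores 4 in exp and returns
+        // 0.5, since 8 = 0.5 * 2^4; zero lanes produce (0., 0).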
+
+ // from bool
+ template <class A, class T>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return batch<T, A>(self.data) & batch<T, A>(1);
+ }
+
+ // horner
+ template <class T, class A, uint64_t... Coefs>
+ inline batch<T, A> horner(const batch<T, A>& self) noexcept
+ {
+ return detail::horner<batch<T, A>, Coefs...>(self);
+ }
+
+ // hypot
+ template <class A, class T>
+ inline batch<T, A> hypot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return sqrt(fma(self, self, other * other));
+ }
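+
+        // Note: unlike std::hypot, this evaluates sqrt(x*x + y*y) directly, so
+        // the squared intermediates can overflow (or underflow) for lanes near
+        // the extremes of the type's range; within it, fma tightens the sum.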
+
+ // ipow
+ template <class A, class T, class ITy>
+ inline batch<T, A> ipow(batch<T, A> const& self, ITy other, requires_arch<generic>) noexcept
+ {
+ return ::xsimd::detail::ipow(self, other);
+ }
+
+ // ldexp
+ /* origin: boost/simd/arch/common/simd/function/ldexp.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A, class T>
+ inline batch<T, A> ldexp(const batch<T, A>& self, const batch<as_integer_t<T>, A>& other, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ using itype = as_integer_t<batch_type>;
+ itype ik = other + constants::maxexponent<T>();
+ ik = ik << constants::nmb<T>();
+ return self * ::xsimd::bitwise_cast<T>(ik);
+ }
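+
+        // Builds 2^other by re-biasing the integer exponent and shifting it
+        // into the exponent field, then multiplies: ldexp(1.5, 3) forms the
+        // bit pattern of 8.0 and returns 12.0. Exponents are not clamped, so
+        // values outside the representable range overflow the bit field.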
+
+ // lgamma
+ template <class A, class T>
+ inline batch<T, A> lgamma(batch<T, A> const& self, requires_arch<generic>) noexcept;
+
+ namespace detail
+ {
+ /* origin: boost/simd/arch/common/detail/generic/gammaln_kernel.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A>
+ static inline batch<float, A> gammalnB(const batch<float, A>& x) noexcept
+ {
+ return horner<batch<float, A>,
+ 0x3ed87730, // 4.227843421859038E-001
+ 0x3ea51a64, // 3.224669577325661E-001,
+ 0xbd89f07e, // -6.735323259371034E-002,
+ 0x3ca89ed8, // 2.058355474821512E-002,
+ 0xbbf164fd, // -7.366775108654962E-003,
+ 0x3b3ba883, // 2.863437556468661E-003,
+ 0xbaabeab1, // -1.311620815545743E-003,
+ 0x3a1ebb94 // 6.055172732649237E-004
+ >(x);
+ }
+
+ template <class A>
+ static inline batch<float, A> gammalnC(const batch<float, A>& x) noexcept
+ {
+ return horner<batch<float, A>,
+ 0xbf13c468, // -5.772156501719101E-001
+ 0x3f528d34, // 8.224670749082976E-001,
+ 0xbecd27a8, // -4.006931650563372E-001,
+ 0x3e8a898b, // 2.705806208275915E-001,
+ 0xbe53c04f, // -2.067882815621965E-001,
+ 0x3e2d4dab, // 1.692415923504637E-001,
+ 0xbe22d329, // -1.590086327657347E-001,
+ 0x3e0c3c4f // 1.369488127325832E-001
+ >(x);
+ }
+
+ template <class A>
+ static inline batch<float, A> gammaln2(const batch<float, A>& x) noexcept
+ {
+ return horner<batch<float, A>,
+ 0x3daaaa94, // 8.333316229807355E-002f
+ 0xbb358701, // -2.769887652139868E-003f,
+ 0x3a31fd69 // 6.789774945028216E-004f
+ >(x);
+ }
+
+ template <class A>
+ static inline batch<double, A> gammaln1(const batch<double, A>& x) noexcept
+ {
+ return horner<batch<double, A>,
+ 0xc12a0c675418055eull, // -8.53555664245765465627E5
+ 0xc13a45890219f20bull, // -1.72173700820839662146E6,
+ 0xc131bc82f994db51ull, // -1.16237097492762307383E6,
+ 0xc1143d73f89089e5ull, // -3.31612992738871184744E5,
+ 0xc0e2f234355bb93eull, // -3.88016315134637840924E4,
+ 0xc09589018ff36761ull // -1.37825152569120859100E3
+ >(x)
+ / horner<batch<double, A>,
+ 0xc13ece4b6a11e14aull, // -2.01889141433532773231E6
+ 0xc1435255892ff34cull, // -2.53252307177582951285E6,
+ 0xc131628671950043ull, // -1.13933444367982507207E6,
+ 0xc10aeb84b9744c9bull, // -2.20528590553854454839E5,
+ 0xc0d0aa0d7b89d757ull, // -1.70642106651881159223E4,
+ 0xc075fd0d1cf312b2ull, // -3.51815701436523470549E2,
+ 0x3ff0000000000000ull // 1.00000000000000000000E0
+ >(x);
+ }
+
+ template <class A>
+ static inline batch<double, A> gammalnA(const batch<double, A>& x) noexcept
+ {
+ return horner<batch<double, A>,
+ 0x3fb555555555554bull, // 8.33333333333331927722E-2
+ 0xbf66c16c16b0a5a1ull, // -2.77777777730099687205E-3,
+ 0x3f4a019f20dc5ebbull, // 7.93650340457716943945E-4,
+ 0xbf437fbdb580e943ull, // -5.95061904284301438324E-4,
+ 0x3f4a985027336661ull // 8.11614167470508450300E-4
+ >(x);
+ }
+
+ /* origin: boost/simd/arch/common/simd/function/gammaln.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class B>
+ struct lgamma_impl;
+
+ template <class A>
+ struct lgamma_impl<batch<float, A>>
+ {
+ using batch_type = batch<float, A>;
+ static inline batch_type compute(const batch_type& a) noexcept
+ {
+ auto inf_result = (a <= batch_type(0.)) && is_flint(a);
+ batch_type x = select(inf_result, constants::nan<batch_type>(), a);
+ batch_type q = abs(x);
+#ifndef XSIMD_NO_INFINITIES
+ inf_result = (x == constants::infinity<batch_type>()) || inf_result;
+#endif
+ auto ltza = a < batch_type(0.);
+ batch_type r;
+ batch_type r1 = other(q);
+ if (any(ltza))
+ {
+ r = select(inf_result, constants::infinity<batch_type>(), negative(q, r1));
+ if (all(ltza))
+ return r;
+ }
+ batch_type r2 = select(ltza, r, r1);
+ return select(a == constants::minusinfinity<batch_type>(), constants::nan<batch_type>(), select(inf_result, constants::infinity<batch_type>(), r2));
+ }
+
+ private:
+ static inline batch_type negative(const batch_type& q, const batch_type& w) noexcept
+ {
+ batch_type p = floor(q);
+ batch_type z = q - p;
+ auto test2 = z < batch_type(0.5);
+ z = select(test2, z - batch_type(1.), z);
+ z = q * sin(z, trigo_pi_tag());
+ return -log(constants::invpi<batch_type>() * abs(z)) - w;
+ }
+
+ static inline batch_type other(const batch_type& x) noexcept
+ {
+ auto xlt650 = (x < batch_type(6.5));
+ batch_type r0x = x;
+ batch_type r0z = x;
+ batch_type r0s = batch_type(1.);
+ batch_type r1 = batch_type(0.);
+ batch_type p = constants::nan<batch_type>();
+ if (any(xlt650))
+ {
+ batch_type z = batch_type(1.);
+ batch_type tx = select(xlt650, x, batch_type(0.));
+ batch_type nx = batch_type(0.);
+ const batch_type _075 = batch_type(0.75);
+ const batch_type _150 = batch_type(1.50);
+ const batch_type _125 = batch_type(1.25);
+ const batch_type _250 = batch_type(2.50);
+ auto xge150 = (x >= _150);
+ auto txgt250 = (tx > _250);
+
+ // x >= 1.5
+ while (any(xge150 && txgt250))
+ {
+ nx = select(txgt250, nx - batch_type(1.), nx);
+ tx = select(txgt250, x + nx, tx);
+ z = select(txgt250, z * tx, z);
+ txgt250 = (tx > _250);
+ }
+ r0x = select(xge150, x + nx - batch_type(2.), x);
+ r0z = select(xge150, z, r0z);
+ r0s = select(xge150, batch_type(1.), r0s);
+
+ // x >= 1.25 && x < 1.5
+ auto xge125 = (x >= _125);
+ auto xge125t = xge125 && !xge150;
+ if (any(xge125))
+ {
+ r0x = select(xge125t, x - batch_type(1.), r0x);
+ r0z = select(xge125t, z * x, r0z);
+ r0s = select(xge125t, batch_type(-1.), r0s);
+ }
+
+ // x >= 0.75 && x < 1.5
+ batch_bool<float, A> kernelC(false);
+ auto xge075 = (x >= _075);
+ auto xge075t = xge075 && !xge125;
+ if (any(xge075t))
+ {
+ kernelC = xge075t;
+ r0x = select(xge075t, x - batch_type(1.), x);
+ r0z = select(xge075t, batch_type(1.), r0z);
+ r0s = select(xge075t, batch_type(-1.), r0s);
+ p = gammalnC(r0x);
+ }
+
+ // tx < 1.5 && x < 0.75
+ auto txlt150 = (tx < _150) && !xge075;
+ if (any(txlt150))
+ {
+ auto orig = txlt150;
+ while (any(txlt150))
+ {
+ z = select(txlt150, z * tx, z);
+ nx = select(txlt150, nx + batch_type(1.), nx);
+ tx = select(txlt150, x + nx, tx);
+ txlt150 = (tx < _150) && !xge075;
+ }
+ r0x = select(orig, r0x + nx - batch_type(2.), r0x);
+ r0z = select(orig, z, r0z);
+ r0s = select(orig, batch_type(-1.), r0s);
+ }
+ p = select(kernelC, p, gammalnB(r0x));
+ if (all(xlt650))
+ return fma(r0x, p, r0s * log(abs(r0z)));
+ }
+ r0z = select(xlt650, abs(r0z), x);
+ batch_type m = log(r0z);
+ r1 = fma(r0x, p, r0s * m);
+ batch_type r2 = fma(x - batch_type(0.5), m, constants::logsqrt2pi<batch_type>() - x);
+ r2 += gammaln2(batch_type(1.) / (x * x)) / x;
+ return select(xlt650, r1, r2);
+ }
+ };
+
+ template <class A>
+ struct lgamma_impl<batch<double, A>>
+ {
+ using batch_type = batch<double, A>;
+
+ static inline batch_type compute(const batch_type& a) noexcept
+ {
+ auto inf_result = (a <= batch_type(0.)) && is_flint(a);
+ batch_type x = select(inf_result, constants::nan<batch_type>(), a);
+ batch_type q = abs(x);
+#ifndef XSIMD_NO_INFINITIES
+ inf_result = (q == constants::infinity<batch_type>());
+#endif
+ auto test = (a < batch_type(-34.));
+ batch_type r = constants::nan<batch_type>();
+ if (any(test))
+ {
+ r = large_negative(q);
+ if (all(test))
+ return select(inf_result, constants::nan<batch_type>(), r);
+ }
+ batch_type r1 = other(a);
+ batch_type r2 = select(test, r, r1);
+ return select(a == constants::minusinfinity<batch_type>(), constants::nan<batch_type>(), select(inf_result, constants::infinity<batch_type>(), r2));
+ }
+
+ private:
+ static inline batch_type large_negative(const batch_type& q) noexcept
+ {
+ batch_type w = lgamma(q);
+ batch_type p = floor(q);
+ batch_type z = q - p;
+ auto test2 = (z < batch_type(0.5));
+ z = select(test2, z - batch_type(1.), z);
+ z = q * sin(z, trigo_pi_tag());
+ z = abs(z);
+ return constants::logpi<batch_type>() - log(z) - w;
+ }
+
+ static inline batch_type other(const batch_type& xx) noexcept
+ {
+ batch_type x = xx;
+ auto test = (x < batch_type(13.));
+ batch_type r1 = batch_type(0.);
+ if (any(test))
+ {
+ batch_type z = batch_type(1.);
+ batch_type p = batch_type(0.);
+ batch_type u = select(test, x, batch_type(0.));
+ auto test1 = (u >= batch_type(3.));
+ while (any(test1))
+ {
+ p = select(test1, p - batch_type(1.), p);
+ u = select(test1, x + p, u);
+ z = select(test1, z * u, z);
+ test1 = (u >= batch_type(3.));
+ }
+
+ auto test2 = (u < batch_type(2.));
+ while (any(test2))
+ {
+ z = select(test2, z / u, z);
+ p = select(test2, p + batch_type(1.), p);
+ u = select(test2, x + p, u);
+ test2 = (u < batch_type(2.));
+ }
+
+ z = abs(z);
+ x += p - batch_type(2.);
+ r1 = x * gammaln1(x) + log(z);
+ if (all(test))
+ return r1;
+ }
+ batch_type r2 = fma(xx - batch_type(0.5), log(xx), constants::logsqrt2pi<batch_type>() - xx);
+ batch_type p = batch_type(1.) / (xx * xx);
+ r2 += gammalnA(p) / xx;
+ return select(test, r1, r2);
+ }
+ };
+ }
+
+ template <class A, class T>
+ inline batch<T, A> lgamma(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::lgamma_impl<batch<T, A>>::compute(self);
+ }
+
+ // log
+ /* origin: boost/simd/arch/common/simd/function/log.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A>
+ inline batch<float, A> log(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ using int_type = as_integer_t<float>;
+ using i_type = batch<int_type, A>;
+ batch_type x = self;
+ i_type k(0);
+ auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+ auto test = (self < constants::smallestposval<batch_type>()) && isnez;
+ if (any(test))
+ {
+ k = select(batch_bool_cast<int_type>(test), k - i_type(23), k);
+ x = select(test, x * batch_type(8388608ul), x);
+ }
+#endif
+ i_type ix = ::xsimd::bitwise_cast<int_type>(x);
+ ix += 0x3f800000 - 0x3f3504f3;
+ k += (ix >> 23) - 0x7f;
+ ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
+ x = ::xsimd::bitwise_cast<float>(ix);
+            batch_type f = --x; // batch pre-decrement: f = x - 1.
+ batch_type s = f / (batch_type(2.) + f);
+ batch_type z = s * s;
+ batch_type w = z * z;
+ batch_type t1 = w * detail::horner<batch_type, 0x3eccce13, 0x3e789e26>(w);
+ batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
+ batch_type R = t2 + t1;
+ batch_type hfsq = batch_type(0.5) * f * f;
+ batch_type dk = to_float(k);
+ batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
+#ifndef XSIMD_NO_INFINITIES
+ batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>());
+#else
+ batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>());
+#endif
+ return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz);
+ }
+
+ template <class A>
+ inline batch<double, A> log(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ using int_type = as_integer_t<double>;
+ using i_type = batch<int_type, A>;
+
+ batch_type x = self;
+ i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32;
+ i_type k(0);
+ auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+ auto test = (self < constants::smallestposval<batch_type>()) && isnez;
+ if (any(test))
+ {
+ k = select(batch_bool_cast<int_type>(test), k - i_type(54), k);
+ x = select(test, x * batch_type(18014398509481984ull), x);
+ }
+#endif
+ hx += 0x3ff00000 - 0x3fe6a09e;
+ k += (hx >> 20) - 0x3ff;
+ batch_type dk = to_float(k);
+ hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
+ x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
+
+ batch_type f = --x;
+ batch_type hfsq = batch_type(0.5) * f * f;
+ batch_type s = f / (batch_type(2.) + f);
+ batch_type z = s * s;
+ batch_type w = z * z;
+
+ batch_type t1 = w * detail::horner<batch_type, 0x3fd999999997fa04ll, 0x3fcc71c51d8e78afll, 0x3fc39a09d078c69fll>(w);
+ batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
+ batch_type R = t2 + t1;
+ batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>()) - hfsq + f);
+#ifndef XSIMD_NO_INFINITIES
+ batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>());
+#else
+ batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>());
+#endif
+ return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz);
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> log(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ return batch<std::complex<T>, A>(log(abs(z)), atan2(z.imag(), z.real()));
+ }
+
+ // log2
+ template <class A>
+ inline batch<float, A> log2(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ using int_type = as_integer_t<float>;
+ using i_type = batch<int_type, A>;
+ batch_type x = self;
+ i_type k(0);
+ auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+ auto test = (self < constants::smallestposval<batch_type>()) && isnez;
+ if (any(test))
+ {
+ k = select(batch_bool_cast<int_type>(test), k - i_type(25), k);
+ x = select(test, x * batch_type(33554432ul), x);
+ }
+#endif
+ i_type ix = ::xsimd::bitwise_cast<int_type>(x);
+ ix += 0x3f800000 - 0x3f3504f3;
+ k += (ix >> 23) - 0x7f;
+ ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
+ x = ::xsimd::bitwise_cast<float>(ix);
+ batch_type f = --x;
+ batch_type s = f / (batch_type(2.) + f);
+ batch_type z = s * s;
+ batch_type w = z * z;
+ batch_type t1 = w * detail::horner<batch_type, 0x3eccce13, 0x3e789e26>(w);
+ batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
+ batch_type R = t1 + t2;
+ batch_type hfsq = batch_type(0.5) * f * f;
+ batch_type dk = to_float(k);
+ batch_type r = fma(fms(s, hfsq + R, hfsq) + f, constants::invlog_2<batch_type>(), dk);
+#ifndef XSIMD_NO_INFINITIES
+ batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>());
+#else
+ batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>());
+#endif
+ return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz);
+ }
+
+ template <class A>
+ inline batch<double, A> log2(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ using int_type = as_integer_t<double>;
+ using i_type = batch<int_type, A>;
+ batch_type x = self;
+ i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32;
+ i_type k(0);
+ auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+ auto test = (self < constants::smallestposval<batch_type>()) && isnez;
+ if (any(test))
+ {
+                k = select(batch_bool_cast<int_type>(test), k - i_type(54), k);
+ x = select(test, x * batch_type(18014398509481984ull), x);
+ }
+#endif
+ hx += 0x3ff00000 - 0x3fe6a09e;
+ k += (hx >> 20) - 0x3ff;
+ hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
+ x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
+ batch_type f = --x;
+ batch_type s = f / (batch_type(2.) + f);
+ batch_type z = s * s;
+ batch_type w = z * z;
+ batch_type t1 = w * detail::horner<batch_type, 0x3fd999999997fa04ll, 0x3fcc71c51d8e78afll, 0x3fc39a09d078c69fll>(w);
+ batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
+ batch_type R = t2 + t1;
+ batch_type hfsq = batch_type(0.5) * f * f;
+ batch_type hi = f - hfsq;
+ hi = hi & ::xsimd::bitwise_cast<double>((constants::allbits<i_type>() << 32));
+ batch_type lo = fma(s, hfsq + R, f - hi - hfsq);
+ batch_type val_hi = hi * constants::invlog_2hi<batch_type>();
+ batch_type val_lo = fma(lo + hi, constants::invlog_2lo<batch_type>(), lo * constants::invlog_2hi<batch_type>());
+ batch_type dk = to_float(k);
+ batch_type w1 = dk + val_hi;
+ val_lo += (dk - w1) + val_hi;
+ val_hi = w1;
+ batch_type r = val_lo + val_hi;
+#ifndef XSIMD_NO_INFINITIES
+ batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>());
+#else
+ batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>());
+#endif
+ return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz);
+ }
+
+ namespace detail
+ {
+ template <class T, class A>
+ inline batch<T, A> logN_complex_impl(const batch<T, A>& z, typename batch<T, A>::value_type base) noexcept
+ {
+ using batch_type = batch<T, A>;
+ using rv_type = typename batch_type::value_type;
+ return log(z) / batch_type(rv_type(base));
+ }
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> log2(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::logN_complex_impl(self, std::log(2));
+ }
+
+ // log10
+ /* origin: FreeBSD /usr/src/lib/msun/src/e_log10f.c */
+ /*
+ * ====================================================
+ * Copyright (C) 1993 by Sun Microsystems, Inc. All rights reserved.
+ *
+ * Developed at SunPro, a Sun Microsystems, Inc. business.
+ * Permission to use, copy, modify, and distribute this
+ * software is freely granted, provided that this notice
+ * is preserved.
+ * ====================================================
+ */
+ template <class A>
+ inline batch<float, A> log10(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ const batch_type
+ ivln10hi(4.3432617188e-01f),
+ ivln10lo(-3.1689971365e-05f),
+ log10_2hi(3.0102920532e-01f),
+ log10_2lo(7.9034151668e-07f);
+ using int_type = as_integer_t<float>;
+ using i_type = batch<int_type, A>;
+ batch_type x = self;
+ i_type k(0);
+ auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+ auto test = (self < constants::smallestposval<batch_type>()) && isnez;
+ if (any(test))
+ {
+ k = select(batch_bool_cast<int_type>(test), k - i_type(25), k);
+ x = select(test, x * batch_type(33554432ul), x);
+ }
+#endif
+ i_type ix = ::xsimd::bitwise_cast<int_type>(x);
+ ix += 0x3f800000 - 0x3f3504f3;
+ k += (ix >> 23) - 0x7f;
+ ix = (ix & i_type(0x007fffff)) + 0x3f3504f3;
+ x = ::xsimd::bitwise_cast<float>(ix);
+ batch_type f = --x;
+ batch_type s = f / (batch_type(2.) + f);
+ batch_type z = s * s;
+ batch_type w = z * z;
+ batch_type t1 = w * detail::horner<batch_type, 0x3eccce13, 0x3e789e26>(w);
+ batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
+ batch_type R = t2 + t1;
+ batch_type dk = to_float(k);
+ batch_type hfsq = batch_type(0.5) * f * f;
+ batch_type hibits = f - hfsq;
+ hibits &= ::xsimd::bitwise_cast<float>(i_type(0xfffff000));
+ batch_type lobits = fma(s, hfsq + R, f - hibits - hfsq);
+ batch_type r = fma(dk, log10_2hi,
+ fma(hibits, ivln10hi,
+ fma(lobits, ivln10hi,
+ fma(lobits + hibits, ivln10lo, dk * log10_2lo))));
+#ifndef XSIMD_NO_INFINITIES
+ batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>());
+#else
+ batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>());
+#endif
+ return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz);
+ }
+
+ template <class A>
+ inline batch<double, A> log10(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ const batch_type
+ ivln10hi(4.34294481878168880939e-01),
+ ivln10lo(2.50829467116452752298e-11),
+ log10_2hi(3.01029995663611771306e-01),
+ log10_2lo(3.69423907715893078616e-13);
+ using int_type = as_integer_t<double>;
+ using i_type = batch<int_type, A>;
+ batch_type x = self;
+ i_type hx = ::xsimd::bitwise_cast<int_type>(x) >> 32;
+ i_type k(0);
+ auto isnez = (self != batch_type(0.));
+#ifndef XSIMD_NO_DENORMALS
+ auto test = (self < constants::smallestposval<batch_type>()) && isnez;
+ if (any(test))
+ {
+ k = select(batch_bool_cast<int_type>(test), k - i_type(54), k);
+ x = select(test, x * batch_type(18014398509481984ull), x);
+ }
+#endif
+ hx += 0x3ff00000 - 0x3fe6a09e;
+ k += (hx >> 20) - 0x3ff;
+ hx = (hx & i_type(0x000fffff)) + 0x3fe6a09e;
+ x = ::xsimd::bitwise_cast<double>(hx << 32 | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(x)));
+ batch_type f = --x;
+ batch_type dk = to_float(k);
+ batch_type s = f / (batch_type(2.) + f);
+ batch_type z = s * s;
+ batch_type w = z * z;
+ batch_type t1 = w * detail::horner<batch_type, 0x3fd999999997fa04ll, 0x3fcc71c51d8e78afll, 0x3fc39a09d078c69fll>(w);
+ batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
+ batch_type R = t2 + t1;
+ batch_type hfsq = batch_type(0.5) * f * f;
+ batch_type hi = f - hfsq;
+ hi = hi & ::xsimd::bitwise_cast<double>(constants::allbits<i_type>() << 32);
+ batch_type lo = f - hi - hfsq + s * (hfsq + R);
+ batch_type val_hi = hi * ivln10hi;
+ batch_type y = dk * log10_2hi;
+ batch_type val_lo = dk * log10_2lo + (lo + hi) * ivln10lo + lo * ivln10hi;
+ batch_type w1 = y + val_hi;
+ val_lo += (y - w1) + val_hi;
+ val_hi = w1;
+ batch_type r = val_lo + val_hi;
+#ifndef XSIMD_NO_INFINITIES
+ batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>());
+#else
+ batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>());
+#endif
+ return select(!(self >= batch_type(0.)), constants::nan<batch_type>(), zz);
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> log10(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ return detail::logN_complex_impl(z, std::log(10));
+ }
+
+ // log1p
+ /* origin: boost/simd/arch/common/simd/function/log1p.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A>
+ inline batch<float, A> log1p(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ using int_type = as_integer_t<float>;
+ using i_type = batch<int_type, A>;
+ const batch_type uf = self + batch_type(1.);
+ auto isnez = (uf != batch_type(0.));
+ i_type iu = ::xsimd::bitwise_cast<int_type>(uf);
+ iu += 0x3f800000 - 0x3f3504f3;
+ i_type k = (iu >> 23) - 0x7f;
+ iu = (iu & i_type(0x007fffff)) + 0x3f3504f3;
+ batch_type f = --(::xsimd::bitwise_cast<float>(iu));
+ batch_type s = f / (batch_type(2.) + f);
+ batch_type z = s * s;
+ batch_type w = z * z;
+ batch_type t1 = w * detail::horner<batch_type, 0x3eccce13, 0x3e789e26>(w);
+ batch_type t2 = z * detail::horner<batch_type, 0x3f2aaaaa, 0x3e91e9ee>(w);
+ batch_type R = t2 + t1;
+ batch_type hfsq = batch_type(0.5) * f * f;
+ batch_type dk = to_float(k);
+ /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
+ batch_type c = select(batch_bool_cast<float>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
+ batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, (hfsq + R), dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
+#ifndef XSIMD_NO_INFINITIES
+ batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>());
+#else
+ batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>());
+#endif
+ return select(!(uf >= batch_type(0.)), constants::nan<batch_type>(), zz);
+ }
+
+ template <class A>
+ inline batch<double, A> log1p(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ using int_type = as_integer_t<double>;
+ using i_type = batch<int_type, A>;
+ const batch_type uf = self + batch_type(1.);
+ auto isnez = (uf != batch_type(0.));
+ i_type hu = ::xsimd::bitwise_cast<int_type>(uf) >> 32;
+ hu += 0x3ff00000 - 0x3fe6a09e;
+ i_type k = (hu >> 20) - 0x3ff;
+ /* correction term ~ log(1+x)-log(u), avoid underflow in c/u */
+ batch_type c = select(batch_bool_cast<double>(k >= i_type(2)), batch_type(1.) - (uf - self), self - (uf - batch_type(1.))) / uf;
+ hu = (hu & i_type(0x000fffff)) + 0x3fe6a09e;
+ batch_type f = ::xsimd::bitwise_cast<double>((hu << 32) | (i_type(0xffffffff) & ::xsimd::bitwise_cast<int_type>(uf)));
+ f = --f;
+ batch_type hfsq = batch_type(0.5) * f * f;
+ batch_type s = f / (batch_type(2.) + f);
+ batch_type z = s * s;
+ batch_type w = z * z;
+ batch_type t1 = w * detail::horner<batch_type, 0x3fd999999997fa04ll, 0x3fcc71c51d8e78afll, 0x3fc39a09d078c69fll>(w);
+ batch_type t2 = z * detail::horner<batch_type, 0x3fe5555555555593ll, 0x3fd2492494229359ll, 0x3fc7466496cb03dell, 0x3fc2f112df3e5244ll>(w);
+ batch_type R = t2 + t1;
+ batch_type dk = to_float(k);
+ batch_type r = fma(dk, constants::log_2hi<batch_type>(), fma(s, hfsq + R, dk * constants::log_2lo<batch_type>() + c) - hfsq + f);
+#ifndef XSIMD_NO_INFINITIES
+ batch_type zz = select(isnez, select(self == constants::infinity<batch_type>(), constants::infinity<batch_type>(), r), constants::minusinfinity<batch_type>());
+#else
+ batch_type zz = select(isnez, r, constants::minusinfinity<batch_type>());
+#endif
+ return select(!(uf >= batch_type(0.)), constants::nan<batch_type>(), zz);
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> log1p(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ using real_batch = typename batch_type::real_batch;
+ batch_type u = 1 + self;
+ batch_type logu = log(u);
+ return select(u == batch_type(1.),
+ self,
+ select(u.real() <= real_batch(0.),
+ logu,
+ logu * self / (u - batch_type(1.))));
+ }
+
+ // mod
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> mod(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ return detail::apply([](T x, T y) noexcept -> T
+ { return x % y; },
+ self, other);
+ }
+
+ // nearbyint
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> nearbyint(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return self;
+ }
+ namespace detail
+ {
+ template <class A, class T>
+ inline batch<T, A> nearbyintf(batch<T, A> const& self) noexcept
+ {
+ using batch_type = batch<T, A>;
+ batch_type s = bitofsign(self);
+ batch_type v = self ^ s;
+ batch_type t2n = constants::twotonmb<batch_type>();
+                // Under fast-math the compiler may reassociate (v + t2n) - t2n
+                // into plain v, defeating the round-to-nearest trick, so the
+                // intermediate is routed through a volatile to block that
+                // optimization. FIXME: a compiler barrier might be cleaner.
+#ifdef __FAST_MATH__
+ volatile batch_type d0 = v + t2n;
+ batch_type d = *(batch_type*)(void*)(&d0) - t2n;
+#else
+ batch_type d0 = v + t2n;
+ batch_type d = d0 - t2n;
+#endif
+ return s ^ select(v < t2n, d, v);
+ }
+ }
+ template <class A>
+ inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::nearbyintf(self);
+ }
+ template <class A>
+ inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::nearbyintf(self);
+ }
+
+ // nearbyint_as_int
+ template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> nearbyint_as_int(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return self;
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ inline batch<as_integer_t<float>, A>
+ nearbyint_as_int(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using U = as_integer_t<float>;
+ return kernel::detail::apply_transform<U>([](float x) noexcept -> U
+ { return std::lroundf(x); },
+ self);
+ }
+
+ template <class A>
+ inline batch<as_integer_t<double>, A>
+ nearbyint_as_int(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using U = as_integer_t<double>;
+ return kernel::detail::apply_transform<U>([](double x) noexcept -> U
+ { return std::llround(x); },
+ self);
+ }
+
+ // nextafter
+ namespace detail
+ {
+ template <class T, class A, bool is_int = std::is_integral<T>::value>
+ struct nextafter_kernel
+ {
+ using batch_type = batch<T, A>;
+
+ static inline batch_type next(batch_type const& b) noexcept
+ {
+ return b;
+ }
+
+ static inline batch_type prev(batch_type const& b) noexcept
+ {
+ return b;
+ }
+ };
+
+ template <class T, class A>
+ struct bitwise_cast_batch;
+
+ template <class A>
+ struct bitwise_cast_batch<float, A>
+ {
+ using type = batch<int32_t, A>;
+ };
+
+ template <class A>
+ struct bitwise_cast_batch<double, A>
+ {
+ using type = batch<int64_t, A>;
+ };
+
+ template <class T, class A>
+ struct nextafter_kernel<T, A, false>
+ {
+ using batch_type = batch<T, A>;
+ using int_batch = typename bitwise_cast_batch<T, A>::type;
+ using int_type = typename int_batch::value_type;
+
+ static inline batch_type next(const batch_type& b) noexcept
+ {
+ batch_type n = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) + int_type(1));
+ return select(b == constants::infinity<batch_type>(), b, n);
+ }
+
+ static inline batch_type prev(const batch_type& b) noexcept
+ {
+ batch_type p = ::xsimd::bitwise_cast<T>(::xsimd::bitwise_cast<int_type>(b) - int_type(1));
+ return select(b == constants::minusinfinity<batch_type>(), b, p);
+ }
+ };
+ }
+ template <class A, class T>
+ inline batch<T, A> nextafter(batch<T, A> const& from, batch<T, A> const& to, requires_arch<generic>) noexcept
+ {
+ using kernel = detail::nextafter_kernel<T, A>;
+ return select(from == to, from,
+ select(to > from, kernel::next(from), kernel::prev(from)));
+ }
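+
+        // For floating-point lanes this steps the raw bit pattern by one,
+        // which yields the adjacent value for non-negative finite inputs, e.g.
+        // nextafter(1.0f, 2.0f) == 1.0f + 2^-23. Note (observation): IEEE
+        // floats order like sign-magnitude integers, so on negative lanes the
+        // two's-complement step moves away from `to` rather than toward it.
+        // For integral T the kernel is the identity and `from` is returned.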
+
+ // pow
+ /* origin: boost/simd/arch/common/simd/function/pow.hpp*/
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A, class T>
+ inline batch<T, A> pow(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ const auto zero = batch_type(0.);
+ auto negx = self < zero;
+ auto iszero = self == zero;
+ constexpr T e = static_cast<T>(2.718281828459045);
+ auto adj_self = select(iszero, batch_type(e), abs(self));
+ batch_type z = exp(other * log(adj_self));
+ z = select(iszero, zero, z);
+ z = select(is_odd(other) && negx, -z, z);
+ auto invalid = negx && !(is_flint(other) || isinf(other));
+ return select(invalid, constants::nan<batch_type>(), z);
+ }
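+
+        // Computes |x|^y as exp(y * log|x|), then patches the edge cases: zero
+        // bases (the dummy base e keeps the log finite before the lane is
+        // forced to 0), negative bases with odd integral exponents (sign
+        // flip), and negative bases with non-integral finite exponents (NaN).
+        // E.g. pow(-2., 3.) == -8. while pow(-2., 0.5) is NaN.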
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> pow(const batch<std::complex<T>, A>& a, const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using cplx_batch = batch<std::complex<T>, A>;
+ using real_batch = typename cplx_batch::real_batch;
+ real_batch absa = abs(a);
+ real_batch arga = arg(a);
+ real_batch x = z.real();
+ real_batch y = z.imag();
+ real_batch r = pow(absa, x);
+ real_batch theta = x * arga;
+ real_batch ze(0);
+ auto cond = (y == ze);
+ r = select(cond, r, r * exp(-y * arga));
+ theta = select(cond, theta, theta + y * log(absa));
+ return select(absa == ze, cplx_batch(ze), cplx_batch(r * cos(theta), r * sin(theta)));
+ }
+
+ // reciprocal
+ template <class T, class A, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+ inline batch<T, A> reciprocal(batch<T, A> const& self,
+ requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ return div(batch_type(1), self);
+ }
+
+ // reduce_add
+ template <class A, class T>
+ inline std::complex<T> reduce_add(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ {
+ return { reduce_add(self.real()), reduce_add(self.imag()) };
+ }
+
+ namespace detail
+ {
+ template <class T, T N>
+ struct split_high
+ {
+ static constexpr T get(T i, T)
+ {
+ return i >= N ? (i % 2) : i + N;
+ }
+ };
+
+ template <class Op, class A, class T>
+ inline T reduce(Op, batch<T, A> const& self, std::integral_constant<unsigned, 1>) noexcept
+ {
+ return self.get(0);
+ }
+
+ template <class Op, class A, class T, unsigned Lvl>
+ inline T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept
+ {
+ using index_type = as_unsigned_integer_t<T>;
+ batch<T, A> split = swizzle(self, make_batch_constant<batch<index_type, A>, split_high<index_type, Lvl / 2>>());
+ return reduce(op, op(split, self), std::integral_constant<unsigned, Lvl / 2>());
+ }
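+
+            // Log2-depth tree reduction: split_high<_, N> swizzles the upper
+            // N lanes onto the lower ones, op combines the halves, and the
+            // recursion halves the active width. For 4 lanes [a, b, c, d] with
+            // op = +: the swizzle gives [c, d, a, b], the sum leaves a+c and
+            // b+d in lanes 0 and 1, the next level folds lane 1 into lane 0,
+            // and get(0) returns a+b+c+d. Lanes with i >= N map to i % 2 and
+            // are don't-care values.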
+ }
+
+ // reduce_max
+ template <class A, class T>
+ inline T reduce_max(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::reduce([](batch<T, A> const& x, batch<T, A> const& y)
+ { return max(x, y); },
+ self, std::integral_constant<unsigned, batch<T, A>::size>());
+ }
+
+ // reduce_min
+ template <class A, class T>
+ inline T reduce_min(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::reduce([](batch<T, A> const& x, batch<T, A> const& y)
+ { return min(x, y); },
+ self, std::integral_constant<unsigned, batch<T, A>::size>());
+ }
+
+ // remainder
+ template <class A>
+ inline batch<float, A> remainder(batch<float, A> const& self, batch<float, A> const& other, requires_arch<generic>) noexcept
+ {
+ return fnma(nearbyint(self / other), other, self);
+ }
+ template <class A>
+ inline batch<double, A> remainder(batch<double, A> const& self, batch<double, A> const& other, requires_arch<generic>) noexcept
+ {
+ return fnma(nearbyint(self / other), other, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> remainder(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ auto mod = self % other;
+ return select(mod <= other / 2, mod, mod - other);
+ }
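+
+        // Integral lanes follow a round-to-nearest convention: for positive
+        // operands the residue is shifted into (-other/2, other/2] when it
+        // exceeds half the divisor, e.g. remainder(7, 4): 7 % 4 == 3 > 2, so
+        // the result is 3 - 4 == -1.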
+
+ // select
+ template <class A, class T>
+ inline batch<std::complex<T>, A> select(batch_bool<T, A> const& cond, batch<std::complex<T>, A> const& true_br, batch<std::complex<T>, A> const& false_br, requires_arch<generic>) noexcept
+ {
+ return { select(cond, true_br.real(), false_br.real()), select(cond, true_br.imag(), false_br.imag()) };
+ }
+
+ // sign
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sign(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ batch_type res = select(self > batch_type(0), batch_type(1), batch_type(0)) - select(self < batch_type(0), batch_type(1), batch_type(0));
+ return res;
+ }
+
+ namespace detail
+ {
+ template <class T, class A>
+ inline batch<T, A> signf(batch<T, A> const& self) noexcept
+ {
+ using batch_type = batch<T, A>;
+ batch_type res = select(self > batch_type(0.f), batch_type(1.f), batch_type(0.f)) - select(self < batch_type(0.f), batch_type(1.f), batch_type(0.f));
+#ifdef XSIMD_NO_NANS
+ return res;
+#else
+ return select(isnan(self), constants::nan<batch_type>(), res);
+#endif
+ }
+ }
+
+ template <class A>
+ inline batch<float, A> sign(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::signf(self);
+ }
+ template <class A>
+ inline batch<double, A> sign(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::signf(self);
+ }
+ template <class A, class T>
+ inline batch<std::complex<T>, A> sign(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ using real_batch = typename batch_type::real_batch;
+ auto rz = z.real();
+ auto iz = z.imag();
+ return select(rz != real_batch(0.),
+ batch_type(sign(rz)),
+ batch_type(sign(iz)));
+ }
+
+ // signnz
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> signnz(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ return (self >> (sizeof(T) * 8 - 1)) | batch_type(1.);
+ }
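+
+        // The arithmetic right shift smears the sign bit across the lane (0
+        // for non-negative, all ones for negative); OR-ing in 1 then yields
+        // +1 or -1, so unlike sign(), signnz(0) == 1.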
+
+ namespace detail
+ {
+ template <class T, class A>
+ inline batch<T, A> signnzf(batch<T, A> const& self) noexcept
+ {
+ using batch_type = batch<T, A>;
+#ifndef XSIMD_NO_NANS
+ return select(isnan(self), constants::nan<batch_type>(), batch_type(1.) | (constants::signmask<batch_type>() & self));
+#else
+ return batch_type(1.) | (constants::signmask<batch_type>() & self);
+#endif
+ }
+ }
+
+ template <class A>
+ inline batch<float, A> signnz(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::signnzf(self);
+ }
+ template <class A>
+ inline batch<double, A> signnz(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::signnzf(self);
+ }
+
+ // sqrt
+ template <class A, class T>
+ inline batch<std::complex<T>, A> sqrt(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+
+ constexpr T csqrt_scale_factor = std::is_same<T, float>::value ? 6.7108864e7f : 1.8014398509481984e16;
+ constexpr T csqrt_scale = std::is_same<T, float>::value ? 1.220703125e-4f : 7.450580596923828125e-9;
+ using batch_type = batch<std::complex<T>, A>;
+ using real_batch = batch<T, A>;
+ real_batch x = z.real();
+ real_batch y = z.imag();
+ real_batch sqrt_x = sqrt(fabs(x));
+ real_batch sqrt_hy = sqrt(0.5 * fabs(y));
+ auto cond = (fabs(x) > real_batch(4.) || fabs(y) > real_batch(4.));
+ x = select(cond, x * 0.25, x * csqrt_scale_factor);
+ y = select(cond, y * 0.25, y * csqrt_scale_factor);
+ real_batch scale = select(cond, real_batch(2.), real_batch(csqrt_scale));
+ real_batch r = abs(batch_type(x, y));
+
+ auto condxp = x > real_batch(0.);
+ real_batch t0 = select(condxp, xsimd::sqrt(0.5 * (r + x)), xsimd::sqrt(0.5 * (r - x)));
+ real_batch r0 = scale * fabs((0.5 * y) / t0);
+ t0 *= scale;
+ real_batch t = select(condxp, t0, r0);
+ r = select(condxp, r0, t0);
+ batch_type resg = select(y < real_batch(0.), batch_type(t, -r), batch_type(t, r));
+ real_batch ze(0.);
+
+ return select(y == ze,
+ select(x == ze,
+ batch_type(ze, ze),
+ select(x < ze, batch_type(ze, sqrt_x), batch_type(sqrt_x, ze))),
+ select(x == ze,
+ select(y > ze, batch_type(sqrt_hy, sqrt_hy), batch_type(sqrt_hy, -sqrt_hy)),
+ resg));
+ }
+
+ // tgamma
+
+ namespace detail
+ {
+ /* origin: boost/simd/arch/common/detail/generic/stirling_kernel.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class B>
+ struct stirling_kernel;
+
+ template <class A>
+ struct stirling_kernel<batch<float, A>>
+ {
+ using batch_type = batch<float, A>;
+ static inline batch_type compute(const batch_type& x) noexcept
+ {
+ return horner<batch_type,
+ 0x3daaaaab,
+ 0x3b638e39,
+ 0xbb2fb930,
+ 0xb970b359>(x);
+ }
+
+ static inline batch_type split_limit() noexcept
+ {
+ return batch_type(bit_cast<float>(uint32_t(0x41d628f6)));
+ }
+
+ static inline batch_type large_limit() noexcept
+ {
+ return batch_type(bit_cast<float>(uint32_t(0x420c28f3)));
+ }
+ };
+
+ template <class A>
+ struct stirling_kernel<batch<double, A>>
+ {
+ using batch_type = batch<double, A>;
+ static inline batch_type compute(const batch_type& x) noexcept
+ {
+ return horner<batch_type,
+ 0x3fb5555555555986ull, // 8.33333333333482257126E-2
+ 0x3f6c71c71b98c5fdull, // 3.47222221605458667310E-3
+ 0xbf65f72607d44fd7ull, // -2.68132617805781232825E-3
+ 0xbf2e166b27e61d7cull, // -2.29549961613378126380E-4
+ 0x3f49cc72592d7293ull // 7.87311395793093628397E-4
+ >(x);
+ }
+
+ static inline batch_type split_limit() noexcept
+ {
+ return batch_type(bit_cast<double>(uint64_t(0x4061e083ba3443d4)));
+ }
+
+ static inline batch_type large_limit() noexcept
+ {
+ return batch_type(bit_cast<double>(uint64_t(0x4065800000000000)));
+ }
+ };
+
+ /* origin: boost/simd/arch/common/simd/function/stirling.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class T, class A>
+ inline batch<T, A> stirling(const batch<T, A>& a) noexcept
+ {
+ using batch_type = batch<T, A>;
+ const batch_type stirlingsplitlim = stirling_kernel<batch_type>::split_limit();
+ const batch_type stirlinglargelim = stirling_kernel<batch_type>::large_limit();
+ batch_type x = select(a >= batch_type(0.), a, constants::nan<batch_type>());
+ batch_type w = batch_type(1.) / x;
+ w = fma(w, stirling_kernel<batch_type>::compute(w), batch_type(1.));
+ batch_type y = exp(-x);
+ auto test = (x < stirlingsplitlim);
+ batch_type z = x - batch_type(0.5);
+ z = select(test, z, batch_type(0.5) * z);
+ batch_type v = exp(z * log(abs(x)));
+ y *= v;
+ y = select(test, y, y * v);
+ y *= constants::sqrt_2pi<batch_type>() * w;
+#ifndef XSIMD_NO_INFINITIES
+ y = select(isinf(x), x, y);
+#endif
+ return select(x > stirlinglargelim, constants::infinity<batch_type>(), y);
+ }
+
+ /* origin: boost/simd/arch/common/detail/generic/gamma_kernel.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class B>
+ struct tgamma_kernel;
+
+ template <class A>
+ struct tgamma_kernel<batch<float, A>>
+ {
+ using batch_type = batch<float, A>;
+ static inline batch_type compute(const batch_type& x) noexcept
+ {
+ return horner<batch_type,
+ 0x3f800000UL, // 9.999999757445841E-01
+ 0x3ed87799UL, // 4.227874605370421E-01
+ 0x3ed2d411UL, // 4.117741948434743E-01
+ 0x3da82a34UL, // 8.211174403261340E-02
+ 0x3d93ae7cUL, // 7.211014349068177E-02
+ 0x3b91db14UL, // 4.451165155708328E-03
+ 0x3ba90c99UL, // 5.158972571345137E-03
+ 0x3ad28b22UL // 1.606319369134976E-03
+ >(x);
+ }
+ };
+
+ template <class A>
+ struct tgamma_kernel<batch<double, A>>
+ {
+ using batch_type = batch<double, A>;
+ static inline batch_type compute(const batch_type& x) noexcept
+ {
+ return horner<batch_type,
+ 0x3ff0000000000000ULL, // 9.99999999999999996796E-1
+ 0x3fdfa1373993e312ULL, // 4.94214826801497100753E-1
+ 0x3fca8da9dcae7d31ULL, // 2.07448227648435975150E-1
+ 0x3fa863d918c423d3ULL, // 4.76367800457137231464E-2
+ 0x3f8557cde9db14b0ULL, // 1.04213797561761569935E-2
+ 0x3f5384e3e686bfabULL, // 1.19135147006586384913E-3
+ 0x3f24fcb839982153ULL // 1.60119522476751861407E-4
+ >(x)
+ / horner<batch_type,
+ 0x3ff0000000000000ULL, // 1.00000000000000000320E00
+ 0x3fb24944c9cd3c51ULL, // 7.14304917030273074085E-2
+ 0xbfce071a9d4287c2ULL, // -2.34591795718243348568E-1
+ 0x3fa25779e33fde67ULL, // 3.58236398605498653373E-2
+ 0x3f8831ed5b1bb117ULL, // 1.18139785222060435552E-2
+                                 0xbf7240e4e750b44aULL, // -4.45641913851797240494E-3
+ 0x3f41ae8a29152573ULL, // 5.39605580493303397842E-4
+                                 0xbef8487a8400d3afULL // -2.31581873324120129819E-5
+ >(x);
+ }
+ };
+
+ /* origin: boost/simd/arch/common/simd/function/gamma.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class B>
+ inline B tgamma_large_negative(const B& a) noexcept
+ {
+ B st = stirling(a);
+ B p = floor(a);
+ B sgngam = select(is_even(p), -B(1.), B(1.));
+ B z = a - p;
+ auto test2 = z < B(0.5);
+ z = select(test2, z - B(1.), z);
+ z = a * sin(z, trigo_pi_tag());
+ z = abs(z);
+ return sgngam * constants::pi<B>() / (z * st);
+ }
+
+ template <class B, class BB>
+ inline B tgamma_other(const B& a, const BB& test) noexcept
+ {
+ B x = select(test, B(2.), a);
+#ifndef XSIMD_NO_INFINITIES
+ auto inf_result = (a == constants::infinity<B>());
+ x = select(inf_result, B(2.), x);
+#endif
+ B z = B(1.);
+ auto test1 = (x >= B(3.));
+ while (any(test1))
+ {
+ x = select(test1, x - B(1.), x);
+ z = select(test1, z * x, z);
+ test1 = (x >= B(3.));
+ }
+ test1 = (x < B(0.));
+ while (any(test1))
+ {
+ z = select(test1, z / x, z);
+ x = select(test1, x + B(1.), x);
+ test1 = (x < B(0.));
+ }
+ auto test2 = (x < B(2.));
+ while (any(test2))
+ {
+ z = select(test2, z / x, z);
+ x = select(test2, x + B(1.), x);
+ test2 = (x < B(2.));
+ }
+ x = z * tgamma_kernel<B>::compute(x - B(2.));
+#ifndef XSIMD_NO_INFINITIES
+ return select(inf_result, a, x);
+#else
+ return x;
+#endif
+ }
+ }
+
+ template <class A, class T>
+ inline batch<T, A> tgamma(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ auto nan_result = (self < batch_type(0.) && is_flint(self));
+#ifndef XSIMD_NO_INVALIDS
+ nan_result = isnan(self) || nan_result;
+#endif
+ batch_type q = abs(self);
+ auto test = (self < batch_type(-33.));
+ batch_type r = constants::nan<batch_type>();
+ if (any(test))
+ {
+ r = detail::tgamma_large_negative(q);
+ if (all(test))
+ return select(nan_result, constants::nan<batch_type>(), r);
+ }
+ batch_type r1 = detail::tgamma_other(self, test);
+ batch_type r2 = select(test, r, r1);
+ return select(self == batch_type(0.), copysign(constants::infinity<batch_type>(), self), select(nan_result, constants::nan<batch_type>(), r2));
+ }
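+
+        // Strategy: lanes below -33 go through the reflection formula
+        // gamma(-x) = -pi / (x * sin(pi * x) * gamma(x)), with Stirling's
+        // series approximating gamma(x) (tgamma_large_negative); the remaining
+        // lanes are reduced into [2, 3) and evaluated with a rational kernel
+        // (tgamma_other). Zeros map to signed infinity; negative integers and
+        // NaNs map to NaN.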
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
new file mode 100644
index 0000000000..bb40ddffc6
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -0,0 +1,397 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_MEMORY_HPP
+#define XSIMD_GENERIC_MEMORY_HPP
+
+#include <algorithm>
+#include <complex>
+#include <stdexcept>
+
+#include "../../types/xsimd_batch_constant.hpp"
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+ template <class batch_type, typename batch_type::value_type... Values>
+ struct batch_constant;
+
+ namespace kernel
+ {
+
+ using namespace types;
+
+ // extract_pair
+ template <class A, class T>
+ inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
+ {
+ constexpr std::size_t size = batch<T, A>::size;
+ assert(i < size && "index in bounds");
+
+ alignas(A::alignment()) T self_buffer[size];
+ self.store_aligned(self_buffer);
+
+ alignas(A::alignment()) T other_buffer[size];
+ other.store_aligned(other_buffer);
+
+ alignas(A::alignment()) T concat_buffer[size];
+
+ for (std::size_t j = 0; j < (size - i); ++j)
+ {
+ concat_buffer[j] = other_buffer[i + j];
+ if (j < i)
+ {
+ concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
+ }
+ }
+ return batch<T, A>::load_aligned(concat_buffer);
+ }
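+
+        // Concatenates the two registers through aligned scratch buffers: the
+        // result is [ other[i..size-1], self[0..i-1] ]. For size == 4 and
+        // i == 1 that is { other[1], other[2], other[3], self[0] }; i == 0
+        // returns `other` unchanged.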
+
+ // gather
+ namespace detail
+ {
+ template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
+ inline batch<T, A> gather(U const* src, batch<V, A> const& index,
+ ::xsimd::index<N> I) noexcept
+ {
+ return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
+ }
+
+ template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
+ inline batch<T, A>
+ gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
+ {
+ static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");
+
+ const auto test = gather<N - 1, T, A>(src, index, {});
+ return insert(test, static_cast<T>(src[index.get(I)]), I);
+ }
+ } // namespace detail
+
+ template <typename T, typename A, typename V>
+ inline batch<T, A>
+ gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Index and destination sizes must match");
+
+ return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
+ }
+
+ // Gather with runtime indexes and mismatched strides.
+ template <typename T, typename A, typename U, typename V>
+ inline detail::sizes_mismatch_t<T, U, batch<T, A>>
+ gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Index and destination sizes must match");
+
+ return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
+ }
+
+ // Gather with runtime indexes and matching strides.
+ template <typename T, typename A, typename U, typename V>
+ inline detail::stride_match_t<T, U, batch<T, A>>
+ gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Index and destination sizes must match");
+
+ return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
+ }
+
+ // insert
+ template <class A, class T, size_t I>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
+ {
+ struct index_mask
+ {
+ static constexpr bool get(size_t index, size_t /* size*/)
+ {
+ return index != I;
+ }
+ };
+ batch<T, A> tmp(val);
+ return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
+ }
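+
+        // Builds a compile-time mask that is true everywhere except lane I,
+        // then blends: every lane keeps `self` while lane I takes the
+        // broadcast `val`, e.g. insert(b, T(42), index<2> {}) replaces only
+        // lane 2 of b.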
+
+ // get
+ template <class A, size_t I, class T>
+ inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) T buffer[batch<T, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[I];
+ }
+
+ template <class A, size_t I, class T>
+ inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[I];
+ }
+
+ template <class A, size_t I, class T>
+ inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
+ {
+ alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[I];
+ }
+
+ template <class A, class T>
+ inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) T buffer[batch<T, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[i];
+ }
+
+ template <class A, class T>
+ inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[i];
+ }
+
+ template <class A, class T>
+ inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
+ {
+ using T2 = typename batch<std::complex<T>, A>::value_type;
+ alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[i];
+ }
+
+ // load_aligned
+ namespace detail
+ {
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
+ {
+ using batch_type_in = batch<T_in, A>;
+ using batch_type_out = batch<T_out, A>;
+ return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
+ }
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
+ {
+ static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
+ using batch_type_out = batch<T_out, A>;
+ alignas(A::alignment()) T_out buffer[batch_type_out::size];
+ std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
+ return batch_type_out::load_aligned(buffer);
+ }
+ }
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
+ {
+ return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
+ }
+
+ // load_unaligned
+ namespace detail
+ {
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
+ {
+ using batch_type_in = batch<T_in, A>;
+ using batch_type_out = batch<T_out, A>;
+ return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
+ }
+
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
+ {
+ static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
+ return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
+ }
+ }
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
+ {
+ return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
+ }
+
+ namespace detail
+ {
+ // Scatter with runtime indexes.
+ template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
+ inline void scatter(batch<T, A> const& src, U* dst,
+ batch<V, A> const& index,
+ ::xsimd::index<N> I) noexcept
+ {
+ dst[index.get(I)] = static_cast<U>(src.get(I));
+ }
+
+ template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
+ inline void
+ scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
+ ::xsimd::index<N> I) noexcept
+ {
+ static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");
+
+ kernel::detail::scatter<N - 1, T, A, U, V>(
+ src, dst, index, {});
+ dst[index.get(I)] = static_cast<U>(src.get(I));
+ }
+ } // namespace detail
+
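+ // The public scatter overloads below dispatch to the compile-time
+ // recursion above; element I is stored as
+ //   dst[index.get(I)] = static_cast<U>(src.get(I));
+ // in ascending lane order, so if two lanes target the same address the
+ // highest lane wins.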
+ template <typename A, typename T, typename V>
+ inline void
+ scatter(batch<T, A> const& src, T* dst,
+ batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Source and index sizes must match");
+ kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
+ src, dst, index, {});
+ }
+
+ template <typename A, typename T, typename U, typename V>
+ inline detail::sizes_mismatch_t<T, U, void>
+ scatter(batch<T, A> const& src, U* dst,
+ batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Source and index sizes must match");
+ kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
+ src, dst, index, {});
+ }
+
+ template <typename A, typename T, typename U, typename V>
+ inline detail::stride_match_t<T, U, void>
+ scatter(batch<T, A> const& src, U* dst,
+ batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Source and index sizes must match");
+ const auto tmp = batch_cast<U>(src);
+ kernel::scatter<A>(tmp, dst, index, A {});
+ }
+
+ // store
+ template <class T, class A>
+ inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ constexpr auto size = batch_bool<T, A>::size;
+ alignas(A::alignment()) T buffer[size];
+ kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
+ for (std::size_t i = 0; i < size; ++i)
+ mem[i] = bool(buffer[i]);
+ }
+
+ // store_aligned
+ template <class A, class T_in, class T_out>
+ inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
+ {
+ static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
+ alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
+ store_aligned(&buffer[0], self);
+ std::copy(std::begin(buffer), std::end(buffer), mem);
+ }
+
+ // store_unaligned
+ template <class A, class T_in, class T_out>
+ inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
+ {
+ static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
+ return store_aligned<A>(mem, self, generic {});
+ }
+
+ // swizzle
+ template <class A, class T, class ITy, ITy... Vs>
+ inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
+ {
+ return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
+ }
+
+ namespace detail
+ {
+ template <class A, class T>
+ inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
+ {
+ static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
+ }
+
+ template <class A, class T>
+ inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
+ {
+ static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
+ }
+
+ template <class A, class T>
+ inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
+ {
+ static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
+ }
+ }
+
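+ // Complex batches keep the real and imaginary parts as two separate
+ // real batches. A std::complex<T>* stream is therefore loaded as two
+ // contiguous real batches (hi = first half, lo = second half of the
+ // interleaved data), which the architecture-specific
+ // detail::load_complex deinterleaves into real/imag planes.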
+ // load_complex_aligned
+ template <class A, class T_out, class T_in>
+ inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
+ {
+ using real_batch = batch<T_out, A>;
+ T_in const* buffer = reinterpret_cast<T_in const*>(mem);
+ real_batch hi = real_batch::load_aligned(buffer),
+ lo = real_batch::load_aligned(buffer + real_batch::size);
+ return detail::load_complex(hi, lo, A {});
+ }
+
+ // load_complex_unaligned
+ template <class A, class T_out, class T_in>
+ inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
+ {
+ using real_batch = batch<T_out, A>;
+ T_in const* buffer = reinterpret_cast<T_in const*>(mem);
+ real_batch hi = real_batch::load_unaligned(buffer),
+ lo = real_batch::load_unaligned(buffer + real_batch::size);
+ return detail::load_complex(hi, lo, A {});
+ }
+
+ // store_complex_aligned
+ template <class A, class T_out, class T_in>
+ inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
+ {
+ using real_batch = batch<T_in, A>;
+ real_batch hi = detail::complex_high(src, A {});
+ real_batch lo = detail::complex_low(src, A {});
+ T_out* buffer = reinterpret_cast<T_out*>(dst);
+ lo.store_aligned(buffer);
+ hi.store_aligned(buffer + real_batch::size);
+ }
+
+ // store_complex_unaligned
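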
+ template <class A, class T_out, class T_in>
+ inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
+ {
+ using real_batch = batch<T_in, A>;
+ real_batch hi = detail::complex_high(src, A {});
+ real_batch lo = detail::complex_low(src, A {});
+ T_out* buffer = reinterpret_cast<T_out*>(dst);
+ lo.store_unaligned(buffer);
+ hi.store_unaligned(buffer + real_batch::size);
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp
new file mode 100644
index 0000000000..b6a79a4515
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_rounding.hpp
@@ -0,0 +1,72 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_ROUNDING_HPP
+#define XSIMD_GENERIC_ROUNDING_HPP
+
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+
+ using namespace types;
+
+ // ceil
+ template <class A, class T>
+ inline batch<T, A> ceil(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ batch<T, A> truncated_self = trunc(self);
+ return select(truncated_self < self, truncated_self + 1, truncated_self);
+ }
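+ // Worked example: ceil(1.3): trunc = 1.0 and 1.0 < 1.3, so the result
+ // is 2.0; ceil(-1.3): trunc = -1.0, the test fails, and -1.0 is kept.
+ // floor below is symmetric: floor(-1.3) gives trunc = -1.0 > -1.3,
+ // hence -2.0.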
+
+ // floor
+ template <class A, class T>
+ inline batch<T, A> floor(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ batch<T, A> truncated_self = trunc(self);
+ return select(truncated_self > self, truncated_self - 1, truncated_self);
+ }
+
+ // round
+ template <class A, class T>
+ inline batch<T, A> round(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ auto v = abs(self);
+ auto c = ceil(v);
+ auto cp = select(c - 0.5 > v, c - 1, c);
+ return select(v > constants::maxflint<batch<T, A>>(), self, copysign(cp, self));
+ }
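+ // Rounds half away from zero: for round(2.5), v = 2.5, c = 3.0, and
+ // c - 0.5 = 2.5 is not > 2.5, so cp = 3.0 and copysign gives 3.0.
+ // Values beyond maxflint are already integral and returned unchanged.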
+
+ // trunc
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> trunc(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return self;
+ }
+ template <class A>
+ inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ return select(abs(self) < constants::maxflint<batch<float, A>>(), to_float(to_int(self)), self);
+ }
+ template <class A>
+ inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ return select(abs(self) < constants::maxflint<batch<double, A>>(), to_float(to_int(self)), self);
+ }
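+ // The maxflint guard keeps values too large for a safe round-trip
+ // through to_int unchanged; such values have no fractional part anyway.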
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp
new file mode 100644
index 0000000000..2568a7253f
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp
@@ -0,0 +1,969 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_TRIGO_HPP
+#define XSIMD_GENERIC_TRIGO_HPP
+
+#include "./xsimd_generic_details.hpp"
+
+#include <array>
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+
+ using namespace types;
+
+ // acos
+ template <class A, class T>
+ inline batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ batch_type x = abs(self);
+ auto x_larger_05 = x > batch_type(0.5);
+ x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self);
+ x = asin(x);
+ x = select(x_larger_05, x + x, x);
+ x = select(self < batch_type(-0.5), constants::pi<batch_type>() - x, x);
+ return select(x_larger_05, x, constants::pio2<batch_type>() - x);
+ }
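+ // Identities used above: for |x| <= 0.5, acos(x) = pi/2 - asin(x);
+ // for x > 0.5, acos(x) = 2 * asin(sqrt((1 - x) / 2)); and the
+ // x < -0.5 branch applies acos(x) = pi - acos(-x).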
+ template <class A, class T>
+ inline batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ using real_batch = typename batch_type::real_batch;
+ batch_type tmp = asin(z);
+ return { constants::pio2<real_batch>() - tmp.real(), -tmp.imag() };
+ }
+
+ // acosh
+ /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A, class T>
+ inline batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ batch_type x = self - batch_type(1.);
+ auto test = x > constants::oneotwoeps<batch_type>();
+ batch_type z = select(test, self, x + sqrt(x + x + x * x));
+ batch_type l1pz = log1p(z);
+ return select(test, l1pz + constants::log_2<batch_type>(), l1pz);
+ }
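+ // With t = x - 1 this uses acosh(1 + t) = log1p(t + sqrt(2t + t*t));
+ // once t exceeds 1/(2*eps) it switches to acosh(x) ~= log(x) + log(2),
+ // i.e. log(2x), which is exact to working precision for large x.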
+ template <class A, class T>
+ inline batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ batch_type w = acos(z);
+ w = batch_type(-w.imag(), w.real());
+ return w;
+ }
+
+ // asin
+ template <class A>
+ inline batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type x = abs(self);
+ batch_type sign = bitofsign(self);
+ auto x_larger_05 = x > batch_type(0.5);
+ batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x);
+ x = select(x_larger_05, sqrt(z), x);
+ batch_type z1 = detail::horner<batch_type,
+ 0x3e2aaae4,
+ 0x3d9980f6,
+ 0x3d3a3ec7,
+ 0x3cc617e3,
+ 0x3d2cb352>(z);
+ z1 = fma(z1, z * x, x);
+ z = select(x_larger_05, constants::pio2<batch_type>() - (z1 + z1), z1);
+ return z ^ sign;
+ }
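+ // The hexadecimal template arguments are the bit patterns of the
+ // polynomial coefficients consumed by detail::horner; the final
+ // `z ^ sign` restores the sign of the input with a bitwise xor.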
+ template <class A>
+ inline batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type x = abs(self);
+ auto small_cond = x < constants::sqrteps<batch_type>();
+ batch_type ct1 = batch_type(bit_cast<double>(int64_t(0x3fe4000000000000)));
+ batch_type zz1 = batch_type(1.) - x;
+ batch_type vp = zz1 * detail::horner<batch_type, 0x403c896240f3081dull, 0xc03991aaac01ab68ull, 0x401bdff5baf33e6aull, 0xbfe2079259f9290full, 0x3f684fc3988e9f08ull>(zz1) / detail::horner1<batch_type, 0x40756709b0b644beull, 0xc077fe08959063eeull, 0x40626219af6a7f42ull, 0xc035f2a2b6bf5d8cull>(zz1);
+ zz1 = sqrt(zz1 + zz1);
+ batch_type z = constants::pio4<batch_type>() - zz1;
+ zz1 = fms(zz1, vp, constants::pio_2lo<batch_type>());
+ z = z - zz1;
+ zz1 = z + constants::pio4<batch_type>();
+ batch_type zz2 = self * self;
+ z = zz2 * detail::horner<batch_type, 0xc020656c06ceafd5ull, 0x40339007da779259ull, 0xc0304331de27907bull, 0x4015c74b178a2dd9ull, 0xbfe34341333e5c16ull, 0x3f716b9b0bd48ad3ull>(zz2) / detail::horner1<batch_type, 0xc04898220a3607acull, 0x4061705684ffbf9dull, 0xc06265bb6d3576d7ull, 0x40519fc025fe9054ull, 0xc02d7b590b5e0eabull>(zz2);
+ zz2 = fma(x, z, x);
+ return select(x > batch_type(1.), constants::nan<batch_type>(),
+ select(small_cond, x,
+ select(x > ct1, zz1, zz2))
+ ^ bitofsign(self));
+ }
+ template <class A, class T>
+ inline batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ using real_batch = typename batch_type::real_batch;
+ real_batch x = z.real();
+ real_batch y = z.imag();
+
+ batch_type ct(-y, x);
+ batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y);
+ zz = log(ct + sqrt(zz));
+ batch_type resg(zz.imag(), -zz.real());
+
+ return select(y == real_batch(0.),
+ select(fabs(x) > real_batch(1.),
+ batch_type(constants::pio2<real_batch>(), real_batch(0.)),
+ batch_type(asin(x), real_batch(0.))),
+ resg);
+ }
+
+ // asinh
+ /* origin: boost/simd/arch/common/simd/function/asinh.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ namespace detail
+ {
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A>
+ average(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
+ {
+ return (x1 & x2) + ((x1 ^ x2) >> 1);
+ }
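+ // Overflow-free integer average: (x1 & x2) keeps the common bits,
+ // (x1 ^ x2) >> 1 halves the differing bits. E.g. average(6, 10):
+ // (6 & 10) = 2, (6 ^ 10) >> 1 = 12 >> 1 = 6, so the result is 8.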
+
+ template <class A, class T>
+ inline batch<T, A>
+ averagef(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
+ {
+ using batch_type = batch<T, A>;
+ return fma(x1, batch_type(0.5), x2 * batch_type(0.5));
+ }
+ template <class A>
+ inline batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
+ {
+ return averagef(x1, x2);
+ }
+ template <class A>
+ inline batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
+ {
+ return averagef(x1, x2);
+ }
+ }
+ template <class A>
+ inline batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type x = abs(self);
+ auto lthalf = x < batch_type(0.5);
+ batch_type x2 = x * x;
+ batch_type bts = bitofsign(self);
+ batch_type z(0.);
+ if (any(lthalf))
+ {
+ z = detail::horner<batch_type,
+ 0x3f800000,
+ 0xbe2aa9ad,
+ 0x3d9949b1,
+ 0xbd2ee581,
+ 0x3ca4d6e6>(x2)
+ * x;
+ if (all(lthalf))
+ return z ^ bts;
+ }
+ batch_type tmp = select(x > constants::oneosqrteps<batch_type>(), x, detail::average(x, hypot(batch_type(1.), x)));
+#ifndef XSIMD_NO_NANS
+ return select(isnan(self), constants::nan<batch_type>(), select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts);
+#else
+ return select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts;
+#endif
+ }
+ template <class A>
+ inline batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type x = abs(self);
+ auto test = x > constants::oneosqrteps<batch_type>();
+ batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x)));
+#ifndef XSIMD_NO_INFINITIES
+ z = select(x == constants::infinity<batch_type>(), x, z);
+#endif
+ batch_type l1pz = log1p(z);
+ z = select(test, l1pz + constants::log_2<batch_type>(), l1pz);
+ return bitofsign(self) ^ z;
+ }
+ template <class A, class T>
+ inline batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ batch_type w = asin(batch_type(-z.imag(), z.real()));
+ w = batch_type(w.imag(), -w.real());
+ return w;
+ }
+
+ // atan
+ namespace detail
+ {
+ template <class A>
+ static inline batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept
+ {
+ using batch_type = batch<float, A>;
+ const auto flag1 = x < constants::tan3pio8<batch_type>();
+ const auto flag2 = (x >= batch_type(bit_cast<float>((uint32_t)0x3ed413cd))) && flag1;
+ batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
+ yy = select(flag2, constants::pio4<batch_type>(), yy);
+ batch_type xx = select(flag1, x, -recx);
+ xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
+ const batch_type z = xx * xx;
+ batch_type z1 = detail::horner<batch_type,
+ 0xbeaaaa2aul,
+ 0x3e4c925ful,
+ 0xbe0e1b85ul,
+ 0x3da4f0d1ul>(z);
+ z1 = fma(xx, z1 * z, xx);
+ z1 = select(flag2, z1 + constants::pio_4lo<batch_type>(), z1);
+ z1 = select(!flag1, z1 + constants::pio_2lo<batch_type>(), z1);
+ return yy + z1;
+ }
+ template <class A>
+ static inline batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept
+ {
+ using batch_type = batch<double, A>;
+ const auto flag1 = x < constants::tan3pio8<batch_type>();
+ const auto flag2 = (x >= constants::tanpio8<batch_type>()) && flag1;
+ batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
+ yy = select(flag2, constants::pio4<batch_type>(), yy);
+ batch_type xx = select(flag1, x, -recx);
+ xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
+ batch_type z = xx * xx;
+ z *= detail::horner<batch_type,
+ 0xc0503669fd28ec8eull,
+ 0xc05eb8bf2d05ba25ull,
+ 0xc052c08c36880273ull,
+ 0xc03028545b6b807aull,
+ 0xbfec007fa1f72594ull>(z)
+ / detail::horner1<batch_type,
+ 0x4068519efbbd62ecull,
+ 0x407e563f13b049eaull,
+ 0x407b0e18d2e2be3bull,
+ 0x4064a0dd43b8fa25ull,
+ 0x4038dbc45b14603cull>(z);
+ z = fma(xx, z, xx);
+ z = select(flag2, z + constants::pio_4lo<batch_type>(), z);
+ z = z + select(flag1, batch_type(0.), constants::pio_2lo<batch_type>());
+ return yy + z;
+ }
+ }
+ template <class A, class T>
+ inline batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ const batch_type absa = abs(self);
+ const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa);
+ return x ^ bitofsign(self);
+ }
+ template <class A, class T>
+ inline batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ using real_batch = typename batch_type::real_batch;
+ real_batch x = z.real();
+ real_batch y = z.imag();
+ real_batch x2 = x * x;
+ real_batch one(1.);
+ real_batch a = one - x2 - (y * y);
+ real_batch w = 0.5 * atan2(2. * x, a);
+ real_batch num = y + one;
+ num = x2 + num * num;
+ real_batch den = y - one;
+ den = x2 + den * den;
+ batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)),
+ batch_type(real_batch(0.), constants::infinity<real_batch>()),
+ batch_type(w, 0.25 * log(num / den)));
+ return res;
+ }
+
+ // atanh
+ /* origin: boost/simd/arch/common/simd/function/atanh.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A, class T>
+ inline batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ batch_type x = abs(self);
+ batch_type t = x + x;
+ batch_type z = batch_type(1.) - x;
+ auto test = x < batch_type(0.5);
+ batch_type tmp = select(test, x, t) / z;
+ return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp)));
+ }
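+ // Implements atanh(x) = 0.5 * log1p(2x / (1 - x)). For x < 0.5 the
+ // argument is built as fma(2x, x/(1-x), 2x), which equals 2x/(1-x)
+ // algebraically but preserves accuracy near zero.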
+ template <class A, class T>
+ inline batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ batch_type w = atan(batch_type(-z.imag(), z.real()));
+ w = batch_type(w.imag(), -w.real());
+ return w;
+ }
+
+ // atan2
+ template <class A, class T>
+ inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ const batch_type q = abs(self / other);
+ const batch_type z = detail::kernel_atan(q, batch_type(1.) / q);
+ return select(other > batch_type(0.), z, constants::pi<batch_type>() - z) * signnz(self);
+ }
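+ // Worked example: atan2(1., -1.): q = 1, kernel_atan gives pi/4;
+ // other < 0 selects pi - pi/4 = 3*pi/4, and signnz(self) keeps the
+ // result positive, matching atan2(1, -1) = 3*pi/4.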
+
+ // cos
+ namespace detail
+ {
+ template <class T, class A>
+ inline batch<T, A> quadrant(const batch<T, A>& x) noexcept
+ {
+ return x & batch<T, A>(3);
+ }
+
+ template <class A>
+ inline batch<float, A> quadrant(const batch<float, A>& x) noexcept
+ {
+ return to_float(quadrant(to_int(x)));
+ }
+
+ template <class A>
+ inline batch<double, A> quadrant(const batch<double, A>& x) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type a = x * batch_type(0.25);
+ return (a - floor(a)) * batch_type(4.);
+ }
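+ // quadrant(x) computes x mod 4 for non-negative integral-valued x.
+ // The double version stays in floating point to avoid overflowing an
+ // integer conversion, e.g. quadrant(7.): a = 1.75, floor(a) = 1.0,
+ // (1.75 - 1.0) * 4.0 = 3.0.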
+ /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+
+ template <class A>
+ inline batch<float, A> cos_eval(const batch<float, A>& z) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type y = detail::horner<batch_type,
+ 0x3d2aaaa5,
+ 0xbab60619,
+ 0x37ccf5ce>(z);
+ return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z);
+ }
+
+ template <class A>
+ inline batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type y = detail::horner<batch_type,
+ 0xbe2aaaa2,
+ 0x3c08839d,
+ 0xb94ca1f9>(z);
+ return fma(y * z, x, x);
+ }
+
+ template <class A>
+ static inline batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type zz = z * z;
+ batch_type y = detail::horner<batch_type,
+ 0x3eaaaa6f,
+ 0x3e0896dd,
+ 0x3d5ac5c9,
+ 0x3cc821b5,
+ 0x3b4c779c,
+ 0x3c19c53b>(zz);
+ return fma(y, zz * z, z);
+ }
+
+ template <class A, class BB>
+ static inline batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type y = base_tancot_eval(z);
+ return select(test, y, -batch_type(1.) / y);
+ }
+
+ template <class A, class BB>
+ static inline batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type y = base_tancot_eval(z);
+ return select(test, batch_type(1.) / y, -y);
+ }
+
+ /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A>
+ static inline batch<double, A> cos_eval(const batch<double, A>& z) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type y = detail::horner<batch_type,
+ 0x3fe0000000000000ull,
+ 0xbfa5555555555551ull,
+ 0x3f56c16c16c15d47ull,
+ 0xbefa01a019ddbcd9ull,
+ 0x3e927e4f8e06d9a5ull,
+ 0xbe21eea7c1e514d4ull,
+ 0x3da8ff831ad9b219ull>(z);
+ return batch_type(1.) - y * z;
+ }
+
+ template <class A>
+ static inline batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type y = detail::horner<batch_type,
+ 0xbfc5555555555548ull,
+ 0x3f8111111110f7d0ull,
+ 0xbf2a01a019bfdf03ull,
+ 0x3ec71de3567d4896ull,
+ 0xbe5ae5e5a9291691ull,
+ 0x3de5d8fd1fcf0ec1ull>(z);
+ return fma(y * z, x, x);
+ }
+
+ template <class A>
+ static inline batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type zz = z * z;
+ batch_type num = detail::horner<batch_type,
+ 0xc1711fead3299176ull,
+ 0x413199eca5fc9dddull,
+ 0xc0c992d8d24f3f38ull>(zz);
+ batch_type den = detail::horner1<batch_type,
+ 0xc189afe03cbe5a31ull,
+ 0x4177d98fc2ead8efull,
+ 0xc13427bc582abc96ull,
+ 0x40cab8a5eeb36572ull>(zz);
+ return fma(z, (zz * (num / den)), z);
+ }
+
+ template <class A, class BB>
+ static inline batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type y = base_tancot_eval(z);
+ return select(test, y, -batch_type(1.) / y);
+ }
+
+ template <class A, class BB>
+ static inline batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type y = base_tancot_eval(z);
+ return select(test, batch_type(1.) / y, -y);
+ }
+ /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+
+ struct trigo_radian_tag
+ {
+ };
+ struct trigo_pi_tag
+ {
+ };
+
+ template <class B, class Tag = trigo_radian_tag>
+ struct trigo_reducer
+ {
+ static inline B reduce(const B& x, B& xr) noexcept
+ {
+ if (all(x <= constants::pio4<B>()))
+ {
+ xr = x;
+ return B(0.);
+ }
+ else if (all(x <= constants::pio2<B>()))
+ {
+ auto test = x > constants::pio4<B>();
+ xr = x - constants::pio2_1<B>();
+ xr -= constants::pio2_2<B>();
+ xr -= constants::pio2_3<B>();
+ xr = select(test, xr, x);
+ return select(test, B(1.), B(0.));
+ }
+ else if (all(x <= constants::twentypi<B>()))
+ {
+ B xi = nearbyint(x * constants::twoopi<B>());
+ xr = fnma(xi, constants::pio2_1<B>(), x);
+ xr -= xi * constants::pio2_2<B>();
+ xr -= xi * constants::pio2_3<B>();
+ return quadrant(xi);
+ }
+ else if (all(x <= constants::mediumpi<B>()))
+ {
+ B fn = nearbyint(x * constants::twoopi<B>());
+ B r = x - fn * constants::pio2_1<B>();
+ B w = fn * constants::pio2_1t<B>();
+ B t = r;
+ w = fn * constants::pio2_2<B>();
+ r = t - w;
+ w = fn * constants::pio2_2t<B>() - ((t - r) - w);
+ t = r;
+ w = fn * constants::pio2_3<B>();
+ r = t - w;
+ w = fn * constants::pio2_3t<B>() - ((t - r) - w);
+ xr = r - w;
+ return quadrant(fn);
+ }
+ else
+ {
+ static constexpr std::size_t size = B::size;
+ using value_type = typename B::value_type;
+ alignas(B) std::array<value_type, size> tmp;
+ alignas(B) std::array<value_type, size> txr;
+ alignas(B) std::array<value_type, size> args;
+ x.store_aligned(args.data());
+
+ for (std::size_t i = 0; i < size; ++i)
+ {
+ double arg = args[i];
+ if (arg == std::numeric_limits<value_type>::infinity())
+ {
+ tmp[i] = 0.;
+ txr[i] = std::numeric_limits<value_type>::quiet_NaN();
+ }
+ else
+ {
+ double y[2];
+ std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y);
+ tmp[i] = value_type(n & 3);
+ txr[i] = value_type(y[0]);
+ }
+ }
+ xr = B::load_aligned(&txr[0]);
+ B res = B::load_aligned(&tmp[0]);
+ return res;
+ }
+ }
+ };
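+ // The reducer picks the cheapest reduction that is exact for the whole
+ // batch: none for x <= pi/4, a single Cody-Waite step up to pi/2, a
+ // multiply-by-2/pi Cody-Waite reduction up to 20*pi and "medium"
+ // magnitudes, and finally a per-lane scalar __ieee754_rem_pio2
+ // (Payne-Hanek style) fallback for huge or infinite inputs.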
+
+ template <class B>
+ struct trigo_reducer<B, trigo_pi_tag>
+ {
+ static inline B reduce(const B& x, B& xr) noexcept
+ {
+ B xi = nearbyint(x * B(2.));
+ B x2 = x - xi * B(0.5);
+ xr = x2 * constants::pi<B>();
+ return quadrant(xi);
+ }
+ };
+
+ }
+ template <class A, class T>
+ inline batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ const batch_type x = abs(self);
+ batch_type xr = constants::nan<batch_type>();
+ const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
+ auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
+ auto swap_bit = fma(batch_type(-2.), tmp, n);
+ auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+ const batch_type z = xr * xr;
+ const batch_type se = detail::sin_eval(z, xr);
+ const batch_type ce = detail::cos_eval(z);
+ const batch_type z1 = select(swap_bit != batch_type(0.), se, ce);
+ return z1 ^ sign_bit;
+ }
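+ // Quadrant mapping: with n = quadrant, cos(xr + n*pi/2) is
+ // n == 0 -> +cos(xr), 1 -> -sin(xr), 2 -> -cos(xr), 3 -> +sin(xr);
+ // swap_bit selects sin vs cos and sign_bit flips the sign via xor.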
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+ return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) };
+ }
+
+ // cosh
+
+ /* origin: boost/simd/arch/common/simd/function/cosh.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+
+ template <class A, class T>
+ inline batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ batch_type x = abs(self);
+ auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
+ batch_type fac = select(test1, batch_type(0.5), batch_type(1.));
+ batch_type tmp = exp(x * fac);
+ batch_type tmp1 = batch_type(0.5) * tmp;
+ return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp));
+ }
+ template <class A, class T>
+ inline batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ auto x = z.real();
+ auto y = z.imag();
+ return { cosh(x) * cos(y), sinh(x) * sin(y) };
+ }
+
+ // sin
+ namespace detail
+ {
+ template <class A, class T, class Tag = trigo_radian_tag>
+ inline batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
+ {
+ using batch_type = batch<T, A>;
+ const batch_type x = abs(self);
+ batch_type xr = constants::nan<batch_type>();
+ const batch_type n = detail::trigo_reducer<batch_type, Tag>::reduce(x, xr);
+ auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
+ auto swap_bit = fma(batch_type(-2.), tmp, n);
+ auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+ const batch_type z = xr * xr;
+ const batch_type se = detail::sin_eval(z, xr);
+ const batch_type ce = detail::cos_eval(z);
+ const batch_type z1 = select(swap_bit == batch_type(0.), se, ce);
+ return z1 ^ sign_bit;
+ }
+ }
+
+ template <class A, class T>
+ inline batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ return detail::sin(self);
+ }
+
+ template <class A, class T>
+ inline batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+ return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) };
+ }
+
+ // sincos
+ template <class A, class T>
+ inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ const batch_type x = abs(self);
+ batch_type xr = constants::nan<batch_type>();
+ const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
+ auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
+ auto swap_bit = fma(batch_type(-2.), tmp, n);
+ const batch_type z = xr * xr;
+ const batch_type se = detail::sin_eval(z, xr);
+ const batch_type ce = detail::cos_eval(z);
+ auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+ const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce);
+ auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+ const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce);
+ return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit);
+ }
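+ // sincos performs the argument reduction and both polynomial
+ // evaluations once and derives the sine and cosine selections from the
+ // same quadrant, so it is cheaper than separate sin and cos calls.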
+
+ template <class A, class T>
+ inline std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
+ sincos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ using real_batch = typename batch_type::real_batch;
+ real_batch rcos = cos(z.real());
+ real_batch rsin = sin(z.real());
+ real_batch icosh = cosh(z.imag());
+ real_batch isinh = sinh(z.imag());
+ return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh));
+ }
+
+ // sinh
+ namespace detail
+ {
+ /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A>
+ inline batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept
+ {
+ using batch_type = batch<float, A>;
+ batch_type sqr_self = self * self;
+ return detail::horner<batch_type,
+ 0x3f800000, // 1.0f
+ 0x3e2aaacc, // 1.66667160211E-1f
+ 0x3c087bbe, // 8.33028376239E-3f
+ 0x39559e2f // 2.03721912945E-4f
+ >(sqr_self)
+ * self;
+ }
+
+ template <class A>
+ inline batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type sqrself = self * self;
+ return fma(self, (detail::horner<batch_type,
+ 0xc115782bdbf6ab05ull, // -3.51754964808151394800E5
+ 0xc0c694b8c71d6182ull, // -1.15614435765005216044E4,
+ 0xc064773a398ff4feull, // -1.63725857525983828727E2,
+ 0xbfe9435fe8bb3cd6ull // -7.89474443963537015605E-1
+ >(sqrself)
+ / detail::horner1<batch_type,
+ 0xc1401a20e4f90044ull, // -2.11052978884890840399E6
+ 0x40e1a7ba7ed72245ull, // 3.61578279834431989373E4,
+ 0xc0715b6096e96484ull // -2.77711081420602794433E2,
+ >(sqrself))
+ * sqrself,
+ self);
+ }
+ }
+ /* origin: boost/simd/arch/common/simd/function/sinh.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A, class T>
+ inline batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ batch_type half(0.5);
+ batch_type x = abs(a);
+ auto lt1 = x < batch_type(1.);
+ batch_type bts = bitofsign(a);
+ batch_type z(0.);
+ if (any(lt1))
+ {
+ z = detail::sinh_kernel(x);
+ if (all(lt1))
+ return z ^ bts;
+ }
+ auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
+ batch_type fac = select(test1, half, batch_type(1.));
+ batch_type tmp = exp(x * fac);
+ batch_type tmp1 = half * tmp;
+ batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp);
+ return select(lt1, z, r) ^ bts;
+ }
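+ // For |x| >= 1 this evaluates sinh(x) = (e^x - e^-x) / 2 as
+ // tmp1 - half / tmp with tmp = e^x; near maxlog, e^x would overflow,
+ // so exp(x * 0.5) is computed instead and squared:
+ // 0.5 * e^(x/2) * e^(x/2).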
+ template <class A, class T>
+ inline batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ auto x = z.real();
+ auto y = z.imag();
+ return { sinh(x) * cos(y), cosh(x) * sin(y) };
+ }
+
+ // tan
+ template <class A, class T>
+ inline batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ const batch_type x = abs(self);
+ batch_type xr = constants::nan<batch_type>();
+ const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
+ auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
+ auto swap_bit = fma(batch_type(-2.), tmp, n);
+ auto test = (swap_bit == batch_type(0.));
+ const batch_type y = detail::tan_eval(xr, test);
+ return y ^ bitofsign(self);
+ }
+ template <class A, class T>
+ inline batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<std::complex<T>, A>;
+ using real_batch = typename batch_type::real_batch;
+ real_batch d = cos(2 * z.real()) + cosh(2 * z.imag());
+ batch_type winf(constants::infinity<real_batch>(), constants::infinity<real_batch>());
+ real_batch wreal = sin(2 * z.real()) / d;
+ real_batch wimag = sinh(2 * z.imag());
+ batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d));
+ return select(d == real_batch(0.), winf, wres);
+ }
+
+ // tanh
+ namespace detail
+ {
+ /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class B>
+ struct tanh_kernel;
+
+ template <class A>
+ struct tanh_kernel<batch<float, A>>
+ {
+ using batch_type = batch<float, A>;
+ static inline batch_type tanh(const batch_type& x) noexcept
+ {
+ batch_type sqrx = x * x;
+ return fma(detail::horner<batch_type,
+ 0xbeaaaa99, // -3.33332819422E-1F
+ 0x3e088393, // +1.33314422036E-1F
+ 0xbd5c1e2d, // -5.37397155531E-2F
+ 0x3ca9134e, // +2.06390887954E-2F
+ 0xbbbaf0ea // -5.70498872745E-3F
+ >(sqrx)
+ * sqrx,
+ x, x);
+ }
+
+ static inline batch_type cotanh(const batch_type& x) noexcept
+ {
+ return batch_type(1.) / tanh(x);
+ }
+ };
+
+ template <class A>
+ struct tanh_kernel<batch<double, A>>
+ {
+ using batch_type = batch<double, A>;
+ static inline batch_type tanh(const batch_type& x) noexcept
+ {
+ batch_type sqrx = x * x;
+ return fma(sqrx * p(sqrx) / q(sqrx), x, x);
+ }
+
+ static inline batch_type cotanh(const batch_type& x) noexcept
+ {
+ batch_type sqrx = x * x;
+ batch_type qval = q(sqrx);
+ return qval / (x * fma(p(sqrx), sqrx, qval));
+ }
+
+ static inline batch_type p(const batch_type& x) noexcept
+ {
+ return detail::horner<batch_type,
+ 0xc0993ac030580563, // -1.61468768441708447952E3
+ 0xc058d26a0e26682d, // -9.92877231001918586564E1,
+ 0xbfeedc5baafd6f4b // -9.64399179425052238628E-1
+ >(x);
+ }
+
+ static inline batch_type q(const batch_type& x) noexcept
+ {
+ return detail::horner1<batch_type,
+ 0x40b2ec102442040c, // 4.84406305325125486048E3
+ 0x40a176fa0e5535fa, // 2.23548839060100448583E3,
+ 0x405c33f28a581b86 // 1.12811678491632931402E2,
+ >(x);
+ }
+ };
+
+ }
+ /* origin: boost/simd/arch/common/simd/function/tanh.hpp */
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ template <class A, class T>
+ inline batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ batch_type one(1.);
+ batch_type x = abs(self);
+ auto test = x < (batch_type(5.) / batch_type(8.));
+ batch_type bts = bitofsign(self);
+ batch_type z = one;
+ if (any(test))
+ {
+ z = detail::tanh_kernel<batch_type>::tanh(x);
+ if (all(test))
+ return z ^ bts;
+ }
+ batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one);
+ return select(test, z, r) ^ bts;
+ }
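+ // For |x| >= 5/8 this uses tanh(x) = 1 - 2 / (1 + e^(2x)); the
+ // polynomial kernel handles small |x|, where that identity loses
+ // precision.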
+ template <class A, class T>
+ inline batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+ {
+ using real_batch = typename batch<std::complex<T>, A>::real_batch;
+ auto x = z.real();
+ auto y = z.imag();
+ real_batch two(2);
+ auto d = cosh(two * x) + cos(two * y);
+ return { sinh(two * x) / d, sin(two * y) / d };
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp
new file mode 100644
index 0000000000..be1da61358
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp
@@ -0,0 +1,1657 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX_HPP
+#define XSIMD_AVX_HPP
+
+#include <complex>
+#include <limits>
+#include <type_traits>
+
+#include "../types/xsimd_avx_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // fwd
+ template <class A, class T, size_t I>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+
+ namespace detail
+ {
+ inline void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
+ {
+ low = _mm256_castsi256_si128(val);
+ high = _mm256_extractf128_si256(val, 1);
+ }
+ inline void split_avx(__m256 val, __m128& low, __m128& high) noexcept
+ {
+ low = _mm256_castps256_ps128(val);
+ high = _mm256_extractf128_ps(val, 1);
+ }
+ inline void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
+ {
+ low = _mm256_castpd256_pd128(val);
+ high = _mm256_extractf128_pd(val, 1);
+ }
+ inline __m256i merge_sse(__m128i low, __m128i high) noexcept
+ {
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1);
+ }
+ inline __m256 merge_sse(__m128 low, __m128 high) noexcept
+ {
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1);
+ }
+ inline __m256d merge_sse(__m128d low, __m128d high) noexcept
+ {
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
+ }
+ template <class F>
+ inline __m256i fwd_to_sse(F f, __m256i self) noexcept
+ {
+ __m128i self_low, self_high;
+ split_avx(self, self_low, self_high);
+ __m128i res_low = f(self_low);
+ __m128i res_high = f(self_high);
+ return merge_sse(res_low, res_high);
+ }
+ template <class F>
+ inline __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
+ {
+ __m128i self_low, self_high, other_low, other_high;
+ split_avx(self, self_low, self_high);
+ split_avx(other, other_low, other_high);
+ __m128i res_low = f(self_low, other_low);
+ __m128i res_high = f(self_high, other_high);
+ return merge_sse(res_low, res_high);
+ }
+ template <class F>
+ inline __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
+ {
+ __m128i self_low, self_high;
+ split_avx(self, self_low, self_high);
+ __m128i res_low = f(self_low, other);
+ __m128i res_high = f(self_high, other);
+ return merge_sse(res_low, res_high);
+ }
+ }
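+ // AVX (without AVX2) has no 256-bit integer arithmetic, so the integer
+ // kernels below split each 256-bit register into two SSE halves, apply
+ // the 128-bit kernel to each, and merge the results back.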
+
+ // abs
+ template <class A>
+ inline batch<float, A> abs(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
+ return _mm256_andnot_ps(sign_mask, self);
+ }
+ template <class A>
+ inline batch<double, A> abs(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
+ return _mm256_andnot_pd(sign_mask, self);
+ }
+
+ // add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return add(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A>
+ inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_add_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_add_pd(self, other);
+ }
+
+ // all
+ template <class A>
+ inline bool all(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_testc_ps(self, batch_bool<float, A>(true)) != 0;
+ }
+ template <class A>
+ inline bool all(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_testc_pd(self, batch_bool<double, A>(true)) != 0;
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline bool all(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_testc_si256(self, batch_bool<T, A>(true)) != 0;
+ }
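+ // _mm256_testc_* sets CF = ((~self & mask) == 0); with an all-true
+ // mask this is 1 exactly when every lane of self is set (the ps/pd
+ // variants inspect only the per-lane sign bits).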
+
+ // any
+ template <class A>
+ inline bool any(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return !_mm256_testz_ps(self, self);
+ }
+ template <class A>
+ inline bool any(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return !_mm256_testz_pd(self, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline bool any(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return !_mm256_testz_si256(self, self);
+ }
+
+ // batch_bool_cast
+ template <class A, class T_out, class T_in>
+ inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
+ {
+ return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
+ }
+
+ // bitwise_and
+ template <class A>
+ inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_pd(self, other);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_pd(self, other);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // bitwise_andnot
+ template <class A>
+ inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_ps(other, self);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_pd(other, self);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_ps(other, self);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_pd(other, self);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // bitwise_lshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
+ { return bitwise_lshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
+ self, other);
+ }
+
+ // bitwise_not
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s) noexcept
+ { return bitwise_not(batch<T, sse4_2>(s), sse4_2 {}); },
+ self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s) noexcept
+ { return bitwise_not(batch_bool<T, sse4_2>(s), sse4_2 {}); },
+ self);
+ }
+
+ // bitwise_or
+ template <class A>
+ inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_pd(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_or(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_or(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // bitwise_rshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
+ { return bitwise_rshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
+ self, other);
+ }
+
+ // bitwise_xor
+ template <class A>
+ inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_xor(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_xor(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2 {}); },
+ self, other);
+ }
+
+ // bitwise_cast
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castsi256_ps(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castsi256_pd(self);
+ }
+ template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
+ inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx>) noexcept
+ {
+ return batch<Tp, A>(self.data);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_pd(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_si256(self);
+ }
+ template <class A>
+ inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castpd_ps(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castpd_si256(self);
+ }
+
+ // bitwise_not
+ template <class A>
+ inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
+ }
+
+ // broadcast
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> broadcast(T val, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_set1_epi8(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_set1_epi16(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_set1_epi32(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_set1_epi64x(val);
+ }
+ else
+ {
+ assert(false && "unsupported");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<float, A> broadcast(float val, requires_arch<avx>) noexcept
+ {
+ return _mm256_set1_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> broadcast(double val, requires_arch<avx>) noexcept
+ {
+ return _mm256_set1_pd(val);
+ }
+
+ // ceil
+ template <class A>
+ inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_ceil_ps(self);
+ }
+ template <class A>
+ inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_ceil_pd(self);
+ }
+
+ namespace detail
+ {
+            // On clang, _mm256_extractf128_ps is built upon __builtin_shufflevector,
+            // which requires the index parameter to be a constant
+ template <int index, class B>
+ inline B get_half_complex_f(const B& real, const B& imag) noexcept
+ {
+ __m128 tmp0 = _mm256_extractf128_ps(real, index);
+ __m128 tmp1 = _mm256_extractf128_ps(imag, index);
+ __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1);
+ tmp0 = _mm_unpacklo_ps(tmp0, tmp1);
+ __m256 res = real;
+ res = _mm256_insertf128_ps(res, tmp0, 0);
+ res = _mm256_insertf128_ps(res, tmp2, 1);
+ return res;
+ }
+ template <int index, class B>
+ inline B get_half_complex_d(const B& real, const B& imag) noexcept
+ {
+ __m128d tmp0 = _mm256_extractf128_pd(real, index);
+ __m128d tmp1 = _mm256_extractf128_pd(imag, index);
+ __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1);
+ tmp0 = _mm_unpacklo_pd(tmp0, tmp1);
+ __m256d res = real;
+ res = _mm256_insertf128_pd(res, tmp0, 0);
+ res = _mm256_insertf128_pd(res, tmp2, 1);
+ return res;
+ }
+
+ // complex_low
+ template <class A>
+ inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_f<0>(self.real(), self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_d<0>(self.real(), self.imag());
+ }
+
+ // complex_high
+ template <class A>
+ inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_f<1>(self.real(), self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_d<1>(self.real(), self.imag());
+ }
+ }
+
+ // fast_cast
+ namespace detail
+ {
+ template <class A>
+ inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtepi32_ps(self);
+ }
+
+ template <class A>
+ inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+ // adapted to avx
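+                // The split satisfies v = (v >> 16) * 65536 + (v & 0xFFFF); both
+                // halves are below 2^31, so the signed epi32 -> ps conversions are
+                // exact, e.g. v = 0x80000001 -> 32768.0f * 65536.0f + 1.0f.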
+ __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
+ __m256 cnst65536f = _mm256_set1_ps(65536.0f);
+
+                __m256i v_lo = bitwise_and(batch<uint32_t, A>(v), batch<uint32_t, A>(msk_lo)); /* extract the 16 least significant bits of v */
+ __m256i v_hi = bitwise_rshift(batch<uint32_t, A>(v), 16, avx {}); /* 16 most significant bits of v */
+ __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
+ __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
+ v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
+ return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
+ }
+
+ template <class A>
+ inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvttps_epi32(self);
+ }
+
+ template <class A>
+ inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_si256(
+ _mm256_blendv_ps(_mm256_castsi256_ps(_mm256_cvttps_epi32(self)),
+ _mm256_xor_ps(
+ _mm256_castsi256_ps(_mm256_cvttps_epi32(_mm256_sub_ps(self, _mm256_set1_ps(1u << 31)))),
+ _mm256_castsi256_ps(_mm256_set1_epi32(1u << 31))),
+ _mm256_cmp_ps(self, _mm256_set1_ps(1u << 31), _CMP_GE_OQ)));
+ }
+ }
+
+ // div
+ template <class A>
+ inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_div_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_div_pd(self, other);
+ }
+
+ // eq
+ template <class A>
+ inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_EQ_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_EQ_OQ);
+ }
+ template <class A>
+ inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self != other);
+ }
+ template <class A>
+ inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self != other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return eq(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+ self, other);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self != other);
+ }
+
+ // floor
+ template <class A>
+ inline batch<float, A> floor(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_floor_ps(self);
+ }
+ template <class A>
+ inline batch<double, A> floor(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_floor_pd(self);
+ }
+
+ // from_mask
+ template <class A>
+ inline batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut32[] = {
+ 0x0000000000000000ul,
+ 0x00000000FFFFFFFFul,
+ 0xFFFFFFFF00000000ul,
+ 0xFFFFFFFFFFFFFFFFul,
+ };
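+            // Each pair of mask bits indexes a 64-bit pattern covering two 32-bit
+            // lanes; e.g. mask bits 0b01 select 0x00000000FFFFFFFF, setting lane 0
+            // and clearing lane 1.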
+ assert(!(mask & ~0xFFul) && "inbound mask");
+ return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6]));
+ }
+ template <class A>
+ inline batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut64[][4] = {
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ };
+ assert(!(mask & ~0xFul) && "inbound mask");
+ return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask]));
+ }
+ template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ {
+ alignas(A::alignment()) static const uint32_t lut32[] = {
+ 0x00000000,
+ 0x000000FF,
+ 0x0000FF00,
+ 0x0000FFFF,
+ 0x00FF0000,
+ 0x00FF00FF,
+ 0x00FFFF00,
+ 0x00FFFFFF,
+ 0xFF000000,
+ 0xFF0000FF,
+ 0xFF00FF00,
+ 0xFF00FFFF,
+ 0xFFFF0000,
+ 0xFFFF00FF,
+ 0xFFFFFF00,
+ 0xFFFFFFFF,
+ };
+ alignas(A::alignment()) static const uint64_t lut64[] = {
+ 0x0000000000000000ul,
+ 0x000000000000FFFFul,
+ 0x00000000FFFF0000ul,
+ 0x00000000FFFFFFFFul,
+ 0x0000FFFF00000000ul,
+ 0x0000FFFF0000FFFFul,
+ 0x0000FFFFFFFF0000ul,
+ 0x0000FFFFFFFFFFFFul,
+ 0xFFFF000000000000ul,
+ 0xFFFF00000000FFFFul,
+ 0xFFFF0000FFFF0000ul,
+ 0xFFFF0000FFFFFFFFul,
+ 0xFFFFFFFF00000000ul,
+ 0xFFFFFFFF0000FFFFul,
+ 0xFFFFFFFFFFFF0000ul,
+ 0xFFFFFFFFFFFFFFFFul,
+ };
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ assert(!(mask & ~0xFFFFFFFFul) && "inbound mask");
+ return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF],
+ lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF],
+ lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF],
+ lut32[(mask >> 24) & 0xF], lut32[mask >> 28]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ assert(!(mask & ~0xFFFFul) && "inbound mask");
+ return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_castps_si256(from_mask(batch_bool<float, A> {}, mask, avx {}));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_castpd_si256(from_mask(batch_bool<double, A> {}, mask, avx {}));
+ }
+ }
+
+ // haddp
+ template <class A>
+ inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
+ {
+ // row = (a,b,c,d,e,f,g,h)
+ // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7)
+ __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]);
+            // tmp1 = (c0+c1, c2+c3, d0+d1, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7)
+ __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]);
+ // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
+ // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7)
+ tmp1 = _mm256_hadd_ps(tmp0, tmp1);
+ // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7)
+ tmp0 = _mm256_hadd_ps(row[4], row[5]);
+ // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7)
+ __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]);
+ // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3,
+ // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
+ tmp2 = _mm256_hadd_ps(tmp0, tmp2);
+ // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
+ // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
+ tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000);
+ // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7,
+ // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3)
+ tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21);
+ return _mm256_add_ps(tmp0, tmp1);
+ }
+ template <class A>
+ inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx>) noexcept
+ {
+ // row = (a,b,c,d)
+ // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3)
+ __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]);
+ // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3)
+ __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]);
+ // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3)
+ __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100);
+            // tmp1 = (a2+a3, b2+b3, c0+c1, d0+d1)
+ tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21);
+ return _mm256_add_pd(tmp1, tmp2);
+ }
+
+ // insert
+ template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
+ {
+#if !defined(_MSC_VER) || _MSC_VER > 1900
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_insert_epi8(self, val, I);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_insert_epi16(self, val, I);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_insert_epi32(self, val, I);
+ }
+ else
+ {
+ return insert(self, val, pos, generic {});
+ }
+#endif
+ return insert(self, val, pos, generic {});
+ }
+
+ // isnan
+ template <class A>
+ inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, self, _CMP_UNORD_Q);
+ }
+ template <class A>
+ inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, self, _CMP_UNORD_Q);
+ }
+
+ // le
+ template <class A>
+ inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_LE_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_LE_OQ);
+ }
+
+ // load_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_si256((__m256i const*)mem);
+ }
+ template <class A>
+ inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_ps(mem);
+ }
+ template <class A>
+ inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_pd(mem);
+ }
+
+ namespace detail
+ {
+ // load_complex
+ template <class A>
+ inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx>) noexcept
+ {
+ using batch_type = batch<float, A>;
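+                // hi = (r0, i0, r1, i1, r2, i2, r3, i3) and lo holds the next four
+                // complex values; each shuffle picks the even (real) or odd (imag)
+                // elements of a pair of 128-bit lanes.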
+ __m128 tmp0 = _mm256_extractf128_ps(hi, 0);
+ __m128 tmp1 = _mm256_extractf128_ps(hi, 1);
+ __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
+ __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
+ batch_type real = _mm256_castps128_ps256(tmp_real);
+ batch_type imag = _mm256_castps128_ps256(tmp_imag);
+
+ tmp0 = _mm256_extractf128_ps(lo, 0);
+ tmp1 = _mm256_extractf128_ps(lo, 1);
+ tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
+ tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
+ real = _mm256_insertf128_ps(real, tmp_real, 1);
+ imag = _mm256_insertf128_ps(imag, tmp_imag, 1);
+ return { real, imag };
+ }
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ __m128d tmp0 = _mm256_extractf128_pd(hi, 0);
+ __m128d tmp1 = _mm256_extractf128_pd(hi, 1);
+ batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1));
+ batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1));
+
+ tmp0 = _mm256_extractf128_pd(lo, 0);
+ tmp1 = _mm256_extractf128_pd(lo, 1);
+ __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1);
+ __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1);
+ real = _mm256_blend_pd(real, re_tmp1, 12);
+ imag = _mm256_blend_pd(imag, im_tmp1, 12);
+ return { real, imag };
+ }
+ }
+
+ // load_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_si256((__m256i const*)mem);
+ }
+ template <class A>
+ inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_ps(mem);
+ }
+ template <class A>
+ inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_pd(mem);
+ }
+
+ // lt
+ template <class A>
+ inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_LT_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_LT_OQ);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return lt(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // mask
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
+ __m128i self_low, self_high;
+ detail::split_avx(self, self_low, self_high);
+ return mask(batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask(batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof(T))));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_movemask_ps(_mm256_castsi256_ps(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_movemask_pd(_mm256_castsi256_pd(self));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline uint64_t mask(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_movemask_ps(self);
+ }
+
+ template <class A>
+ inline uint64_t mask(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_movemask_pd(self);
+ }
+
+ // max
+ template <class A>
+ inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_max_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_max_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return select(self > other, self, other);
+ }
+
+ // min
+ template <class A>
+ inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_min_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_min_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return select(self <= other, self, other);
+ }
+
+ // mul
+ template <class A>
+ inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_mul_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_mul_pd(self, other);
+ }
+
+ // nearbyint
+ template <class A>
+ inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+ template <class A>
+ inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtps_epi32(self);
+ }
+
+ // neg
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> neg(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return 0 - self;
+ }
+ template <class A>
+        inline batch<float, A> neg(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
+ }
+ template <class A>
+ inline batch<double, A> neg(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000)));
+ }
+
+ // neq
+ template <class A>
+ inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self == other);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data)));
+ }
+
+ // reciprocal
+ template <class A>
+ inline batch<float, A> reciprocal(batch<float, A> const& self,
+ kernel::requires_arch<avx>) noexcept
+ {
+ return _mm256_rcp_ps(self);
+ }
+
+ // reduce_add
+ template <class A>
+ inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
+ {
+ // Warning about _mm256_hadd_ps:
+ // _mm256_hadd_ps(a,b) gives
+            // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7), i.e. it operates
+            // within each 128-bit lane, so we cannot naively chain it for a full reduction
+ // rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
+ // tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
+ __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
+ // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
+ tmp = _mm256_add_ps(rhs, tmp);
+ // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
+ tmp = _mm256_hadd_ps(tmp, tmp);
+ // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
+ tmp = _mm256_hadd_ps(tmp, tmp);
+ return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
+ }
+ template <class A>
+ inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
+ {
+ // rhs = (x0, x1, x2, x3)
+ // tmp = (x2, x3, x0, x1)
+ __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
+ // tmp = (x2+x0, x3+x1, -, -)
+ tmp = _mm256_add_pd(rhs, tmp);
+ // tmp = (x2+x0+x3+x1, -, -, -)
+ tmp = _mm256_hadd_pd(tmp, tmp);
+ return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ __m128i low, high;
+ detail::split_avx(self, low, high);
+ batch<T, sse4_2> blow(low), bhigh(high);
+ return reduce_add(blow) + reduce_add(bhigh);
+ }
+
+ // reduce_max
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ inline T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
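+            // Fold the high 128-bit lane onto the low one, then finish the
+            // reduction on a single 128-bit register (reduce_min below follows
+            // the same pattern).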
+ constexpr auto mask = detail::shuffle(1, 0);
+ batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
+ batch<T, A> acc = max(self, step);
+ __m128i low = _mm256_castsi256_si128(acc);
+ return reduce_max(batch<T, sse4_2>(low));
+ }
+
+ // reduce_min
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ inline T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(1, 0);
+ batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
+ batch<T, A> acc = min(self, step);
+ __m128i low = _mm256_castsi256_si128(acc);
+ return reduce_min(batch<T, sse4_2>(low));
+ }
+
+ // rsqrt
+ template <class A>
+ inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_rsqrt_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val)));
+ }
+
+ // sadd
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
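+                // Clamp self into [min - other, max - other] before adding, picking
+                // the bound from the sign of other; e.g. for int8_t,
+                // sadd(100, 100) clamps self to 127 - 100 = 27 and yields 127.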
+ auto mask = (other >> (8 * sizeof(T) - 1));
+ auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
+ auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
+ return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
+ }
+ else
+ {
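+                // other is capped at the remaining headroom max - self;
+                // e.g. for uint8_t, sadd(200, 100) adds only min(55, 100) = 55 -> 255.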
+ const auto diffmax = std::numeric_limits<T>::max() - self;
+ const auto mindiff = min(diffmax, other);
+ return self + mindiff;
+ }
+ }
+
+ // select
+ template <class A>
+ inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return _mm256_blendv_ps(false_br, true_br, cond);
+ }
+ template <class A>
+ inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return _mm256_blendv_pd(false_br, true_br, cond);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ __m128i cond_low, cond_hi;
+ detail::split_avx(cond, cond_low, cond_hi);
+
+ __m128i true_low, true_hi;
+ detail::split_avx(true_br, true_low, true_hi);
+
+ __m128i false_low, false_hi;
+ detail::split_avx(false_br, false_low, false_hi);
+
+ __m128i res_low = select(batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
+ __m128i res_hi = select(batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});
+ return detail::merge_sse(res_low, res_hi);
+ }
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
+ }
+
+ template <class A, bool... Values>
+ inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+ return _mm256_blend_ps(false_br, true_br, mask);
+ }
+
+ template <class A, bool... Values>
+ inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+ return _mm256_blend_pd(false_br, true_br, mask);
+ }
+
+ // set
+ template <class A, class... Values>
+ inline batch<float, A> set(batch<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
+ return _mm256_setr_ps(values...);
+ }
+
+ template <class A, class... Values>
+ inline batch<double, A> set(batch<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
+ return _mm256_setr_pd(values...);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3) noexcept
+ {
+ return _mm256_set_epi64x(v3, v2, v1, v0);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ {
+ return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ {
+ return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+ {
+ return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+ }
+
+ template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
+ }
+
+ template <class A, class... Values>
+ inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
+ return _mm256_castsi256_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ template <class A, class... Values>
+ inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
+ return _mm256_castsi256_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ // slide_left
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ __m128i low = _mm256_castsi256_si128(x);
+ auto y = _mm_slli_si128(low, M);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, y, 1);
+ }
+ if (BitCount == 128)
+ {
+ __m128i low = _mm256_castsi256_si128(x);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, low, 1);
+ }
+            // shifting by [0, 128) bits
+ constexpr unsigned M = BitCount / 8;
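+            // The top M bytes of the low lane must carry into the high lane:
+            // zlow extracts them by shifting right by 16 - M bytes, and they are
+            // OR-ed into the shifted high lane below.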
+
+ __m128i low = _mm256_castsi256_si128(x);
+ auto ylow = _mm_slli_si128(low, M);
+ auto zlow = _mm_srli_si128(low, 16 - M);
+
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ auto yhigh = _mm_slli_si128(high, M);
+
+ __m256i res = _mm256_castsi128_si256(ylow);
+ return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1);
+ }
+
+ // slide_right
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ __m128i y = _mm_srli_si128(high, M);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, y, 0);
+ }
+ if (BitCount == 128)
+ {
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ return _mm256_castsi128_si256(high);
+ }
+            // shifting by [0, 128) bits
+ constexpr unsigned M = BitCount / 8;
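+            // The bottom M bytes of the high lane must carry into the low lane:
+            // zhigh moves them to the top by shifting left by 16 - M bytes before
+            // being OR-ed with the shifted low lane.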
+
+ __m128i low = _mm256_castsi256_si128(x);
+ auto ylow = _mm_srli_si128(low, M);
+
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ auto yhigh = _mm_srli_si128(high, M);
+ auto zhigh = _mm_slli_si128(high, 16 - M);
+
+ __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh));
+ return _mm256_insertf128_si256(res, yhigh, 1);
+ }
+
+ // sqrt
+ template <class A>
+ inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_sqrt_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_sqrt_pd(val);
+ }
+
+ // ssub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ return sadd(self, -other);
+ }
+ else
+ {
+ const auto diff = min(self, other);
+ return self - diff;
+ }
+ }
+
+ // store_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_si256((__m256i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_si256((__m256i*)mem, self);
+ }
+ template <class A>
+ inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_ps(mem, self);
+ }
+ template <class A>
+ inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_pd(mem, self);
+ }
+
+ // store_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_si256((__m256i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_si256((__m256i*)mem, self);
+ }
+ template <class A>
+ inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_ps(mem, self);
+ }
+ template <class A>
+ inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_pd(mem, self);
+ }
+
+ // sub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return sub(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A>
+ inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_sub_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_sub_pd(self, other);
+ }
+
+ // swizzle
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+ {
+            // duplicate the low and high parts of the input
+ __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
+ __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
+
+ __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
+ __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+
+ // normalize mask
+ batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+
+ // permute within each lane
+ __m256 r0 = _mm256_permutevar_ps(low_low, (batch<uint32_t, A>)half_mask);
+ __m256 r1 = _mm256_permutevar_ps(hi_hi, (batch<uint32_t, A>)half_mask);
+
+ // mask to choose the right lane
+ batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
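+            // e.g. V = (4, 5, 6, 7, 0, 1, 2, 3): half_mask = (0, 1, 2, 3, 0, 1, 2, 3)
+            // and blend_mask selects the hi-lane permute for the first four elements
+            // and the low-lane permute for the last four, swapping the halves.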
+
+ // blend the two permutes
+ constexpr auto mask = blend_mask.mask();
+ return _mm256_blend_ps(r0, r1, mask);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+ {
+            // duplicate the low and high parts of the input
+ __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
+ __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
+
+ __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
+ __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
+
+ // normalize mask
+ batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
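+            // _mm256_permutevar_pd selects on bit 1 of each 64-bit control element,
+            // hence the * -1 to widen 0/1 into all-zeros/all-ones.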
+
+ // permute within each lane
+ __m256d r0 = _mm256_permutevar_pd(low_low, (batch<uint64_t, A>)half_mask);
+ __m256d r1 = _mm256_permutevar_pd(hi_hi, (batch<uint64_t, A>)half_mask);
+
+ // mask to choose the right lane
+ batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+
+ // blend the two permutes
+ constexpr auto mask = blend_mask.mask();
+ return _mm256_blend_pd(r0, r1, mask);
+ }
+ template <class A,
+ typename T,
+ uint32_t V0,
+ uint32_t V1,
+ uint32_t V2,
+ uint32_t V3,
+ uint32_t V4,
+ uint32_t V5,
+ uint32_t V6,
+ uint32_t V7,
+ detail::enable_sized_integral_t<T, 4> = 0>
+ inline batch<T, A> swizzle(batch<T, A> const& self,
+ batch_constant<batch<uint32_t, A>,
+ V0,
+ V1,
+ V2,
+ V3,
+ V4,
+ V5,
+ V6,
+ V7> const& mask,
+ requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<float>(self), mask));
+ }
+
+ template <class A,
+ typename T,
+ uint64_t V0,
+ uint64_t V1,
+ uint64_t V2,
+ uint64_t V3,
+ detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch<T, A>
+ swizzle(batch<T, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
+ requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<double>(self), mask));
+ }
+
+ // trunc
+ template <class A>
+ inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_ps(self, _MM_FROUND_TO_ZERO);
+ }
+ template <class A>
+ inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_pd(self, _MM_FROUND_TO_ZERO);
+ }
+
+ // zip_hi
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
+ // extract high word
+ __m128i self_hi = _mm256_extractf128_si256(self, 1);
+ __m128i other_hi = _mm256_extractf128_si256(other, 1);
+
+ // interleave
+ __m128i res_lo, res_hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ res_lo = _mm_unpacklo_epi8(self_hi, other_hi);
+ res_hi = _mm_unpackhi_epi8(self_hi, other_hi);
+ }
+ else
+ {
+ res_lo = _mm_unpacklo_epi16(self_hi, other_hi);
+ res_hi = _mm_unpackhi_epi16(self_hi, other_hi);
+ }
+
+ // fuse
+ return _mm256_castps_si256(
+ _mm256_insertf128_ps(
+ _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
+ _mm_castsi128_ps(res_hi),
+ 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_ps(self, other);
+ auto hi = _mm256_unpackhi_ps(self, other);
+ return _mm256_permute2f128_ps(lo, hi, 0x31);
+ }
+ template <class A>
+ inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_pd(self, other);
+ auto hi = _mm256_unpackhi_pd(self, other);
+ return _mm256_permute2f128_pd(lo, hi, 0x31);
+ }
+
+ // zip_lo
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
+ // extract low word
+ __m128i self_lo = _mm256_extractf128_si256(self, 0);
+ __m128i other_lo = _mm256_extractf128_si256(other, 0);
+
+ // interleave
+ __m128i res_lo, res_hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ res_lo = _mm_unpacklo_epi8(self_lo, other_lo);
+ res_hi = _mm_unpackhi_epi8(self_lo, other_lo);
+ }
+ else
+ {
+ res_lo = _mm_unpacklo_epi16(self_lo, other_lo);
+ res_hi = _mm_unpackhi_epi16(self_lo, other_lo);
+ }
+
+ // fuse
+ return _mm256_castps_si256(
+ _mm256_insertf128_ps(
+ _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
+ _mm_castsi128_ps(res_hi),
+ 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ template <class A>
+ inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_ps(self, other);
+ auto hi = _mm256_unpackhi_ps(self, other);
+ return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1);
+ }
+ template <class A>
+ inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_pd(self, other);
+ auto hi = _mm256_unpackhi_pd(self, other);
+ return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
+ }
+ }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
new file mode 100644
index 0000000000..8d0fcc27a4
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
@@ -0,0 +1,950 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX2_HPP
+#define XSIMD_AVX2_HPP
+
+#include <complex>
+#include <type_traits>
+
+#include "../types/xsimd_avx2_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // abs
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_abs_epi8(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_abs_epi16(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_abs_epi32(self);
+ }
+ else
+ {
+ return abs(self, avx {});
+ }
+ }
+ return self;
+ }
+
+ // add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_add_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_add_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_add_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_add_epi64(self, other);
+ }
+ else
+ {
+ return add(self, other, avx {});
+ }
+ }
+
+ // bitwise_and
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_and_si256(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_and_si256(self, other);
+ }
+
+ // bitwise_andnot
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_andnot_si256(other, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_andnot_si256(other, self);
+ }
+
+ // bitwise_not
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx2>) noexcept
+ {
+ return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+ {
+ return _mm256_xor_si256(self, _mm256_set1_epi32(-1));
+ }
+
+ // bitwise_lshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_slli_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_slli_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_slli_epi64(self, other);
+ }
+ else
+ {
+ return bitwise_lshift(self, other, avx {});
+ }
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_sllv_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_sllv_epi64(self, other);
+ }
+ else
+ {
+ return bitwise_lshift(self, other, avx {});
+ }
+ }
+
+ // bitwise_or
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_or_si256(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_or_si256(self, other);
+ }
+
+ // bitwise_rshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
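+                    // There is no _mm256_srai_epi8: shift 16-bit lanes instead; the
+                    // high byte of each pair is already correct, and sign_mask
+                    // rewrites the top `other` bits of each low byte with that
+                    // byte's own sign taken from cmp_is_negative.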
+ __m256i sign_mask = _mm256_set1_epi16((0xFF00 >> other) & 0x00FF);
+ __m256i cmp_is_negative = _mm256_cmpgt_epi8(_mm256_setzero_si256(), self);
+ __m256i res = _mm256_srai_epi16(self, other);
+ return _mm256_or_si256(
+ detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+ sign_mask, cmp_is_negative),
+ _mm256_andnot_si256(sign_mask, res));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_srai_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_srai_epi32(self, other);
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_srli_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_srli_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_srli_epi64(self, other);
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx {});
+ }
+ }
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_srav_epi32(self, other);
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_srlv_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_srlv_epi64(self, other);
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx {});
+ }
+ }
+ }
+
+ // bitwise_xor
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_xor_si256(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ return _mm256_xor_si256(self, other);
+ }
+
+ // complex_low
+ template <class A>
+ inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+ {
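+            // real = (r0, r1, r2, r3), imag = (i0, i1, i2, i3):
+            // tmp0 = (r0, r1, r1, r3) and tmp1 = (i0, i0, i2, i1); blending on
+            // 0b1010 interleaves them into (r0, i0, r1, i1).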
+ __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 1, 1, 0));
+ __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(1, 2, 0, 0));
+ return _mm256_blend_pd(tmp0, tmp1, 10);
+ }
+
+ // complex_high
+ template <class A>
+ inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx2>) noexcept
+ {
+ __m256d tmp0 = _mm256_permute4x64_pd(self.real(), _MM_SHUFFLE(3, 3, 1, 2));
+ __m256d tmp1 = _mm256_permute4x64_pd(self.imag(), _MM_SHUFFLE(3, 2, 2, 0));
+ return _mm256_blend_pd(tmp0, tmp1, 10);
+ }
+
+ // fast_cast
+ namespace detail
+ {
+
+ template <class A>
+ inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<avx2>) noexcept
+ {
+ // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+ __m256i msk_lo = _mm256_set1_epi32(0xFFFF);
+ __m256 cnst65536f = _mm256_set1_ps(65536.0f);
+
+                __m256i v_lo = _mm256_and_si256(v, msk_lo); /* extract the 16 least significant bits of v */
+ __m256i v_hi = _mm256_srli_epi32(v, 16); /* 16 most significant bits of v */
+ __m256 v_lo_flt = _mm256_cvtepi32_ps(v_lo); /* No rounding */
+ __m256 v_hi_flt = _mm256_cvtepi32_ps(v_hi); /* No rounding */
+ v_hi_flt = _mm256_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
+ return _mm256_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
+ }
+
+ template <class A>
+ inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+ {
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+ // adapted to avx
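+                // xH reads as the double 2^84 + (x >> 32) * 2^32 (the high bits are
+                // OR-ed into the mantissa of 2^84) and xL as 2^52 + (x & 0xFFFFFFFF);
+                // subtracting 2^84 + 2^52 cancels both offsets, so f + xL
+                // reconstructs x with a single correctly rounded addition.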
+ __m256i xH = _mm256_srli_epi64(x, 32);
+ xH = _mm256_or_si256(xH, _mm256_castpd_si256(_mm256_set1_pd(19342813113834066795298816.))); // 2^84
+ __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000,
+ 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
+ __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
+ __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
+ return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
+ }
+
+ template <class A>
+ inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx2>) noexcept
+ {
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+ // adapted to avx
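+                // Here the top 16 bits of x, sign-extended, land in the mantissa of
+                // 3*2^67 at weight 2^48, while xL carries the low 48 bits on top of
+                // 2^52; subtracting 3*2^67 + 2^52 removes both offsets, and f + xL
+                // reconstructs the signed value with one correctly rounded addition.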
+ __m256i xH = _mm256_srai_epi32(x, 16);
+ xH = _mm256_and_si256(xH, _mm256_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
+ xH = _mm256_add_epi64(xH, _mm256_castpd_si256(_mm256_set1_pd(442721857769029238784.))); // 3*2^67
+ __m256i mask = _mm256_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000,
+ 0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
+ __m256i xL = _mm256_or_si256(_mm256_and_si256(mask, x), _mm256_andnot_si256(mask, _mm256_castpd_si256(_mm256_set1_pd(0x0010000000000000)))); // 2^52
+ __m256d f = _mm256_sub_pd(_mm256_castsi256_pd(xH), _mm256_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
+ return _mm256_add_pd(f, _mm256_castsi256_pd(xL));
+ }
+ }
+
+ // eq
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_cmpeq_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_cmpeq_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_cmpeq_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_cmpeq_epi64(self, other);
+ }
+ else
+ {
+ return eq(self, other, avx {});
+ }
+ }
+
+ // gather
+ template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
+ inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx2>) noexcept
+ {
+            // the corresponding scatter requires AVX512F + AVX512VL, so only gather is provided here
+ return _mm256_i32gather_epi32(reinterpret_cast<const int*>(src), index, sizeof(T));
+ }
+
+ template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
+ inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx2>) noexcept
+ {
+            // the corresponding scatter requires AVX512F + AVX512VL, so only gather is provided here
+ return _mm256_i64gather_epi64(reinterpret_cast<const long long int*>(src), index, sizeof(T));
+ }
+
+ template <class A, class U,
+ detail::enable_sized_integral_t<U, 4> = 0>
+ inline batch<float, A> gather(batch<float, A> const&, float const* src,
+ batch<U, A> const& index,
+ kernel::requires_arch<avx2>) noexcept
+ {
+            // the corresponding scatter requires AVX512F + AVX512VL, so only gather is provided here
+ return _mm256_i32gather_ps(src, index, sizeof(float));
+ }
+
+ template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
+ inline batch<double, A> gather(batch<double, A> const&, double const* src,
+ batch<U, A> const& index,
+ requires_arch<avx2>) noexcept
+ {
+            // the corresponding scatter requires AVX512F + AVX512VL, so only gather is provided here
+ return _mm256_i64gather_pd(src, index, sizeof(double));
+ }
+
+ // gather: handmade conversions
+ template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+ inline batch<float, A> gather(batch<float, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx2>) noexcept
+ {
+ const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
+ const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
+ return detail::merge_sse(_mm256_cvtpd_ps(low.data), _mm256_cvtpd_ps(high.data));
+ }
+
+ template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+ inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx2>) noexcept
+ {
+ const batch<double, A> low(_mm256_i32gather_pd(src, _mm256_castsi256_si128(index.data), sizeof(double)));
+ const batch<double, A> high(_mm256_i32gather_pd(src, _mm256_extractf128_si256(index.data, 1), sizeof(double)));
+ return detail::merge_sse(_mm256_cvtpd_epi32(low.data), _mm256_cvtpd_epi32(high.data));
+ }
+
+ // lt
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_cmpgt_epi8(other, self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_cmpgt_epi16(other, self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_cmpgt_epi32(other, self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_cmpgt_epi64(other, self);
+ }
+ else
+ {
+ return lt(self, other, avx {});
+ }
+ }
+ else
+ {
+ return lt(self, other, avx {});
+ }
+ }
+
+ // load_complex
+ template <class A>
+ inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx2>) noexcept
+ {
+ using batch_type = batch<float, A>;
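+            // hi and lo store interleaved pairs [r, i, r, i, ...]; the
+            // shuffles collect the even (real) and odd (imaginary) elements
+            // within each 128-bit lane, and the 64-bit permute restores their
+            // order across lanes.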
+ batch_type real = _mm256_castpd_ps(
+ _mm256_permute4x64_pd(
+ _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0))),
+ _MM_SHUFFLE(3, 1, 2, 0)));
+ batch_type imag = _mm256_castpd_ps(
+ _mm256_permute4x64_pd(
+ _mm256_castps_pd(_mm256_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1))),
+ _MM_SHUFFLE(3, 1, 2, 0)));
+ return { real, imag };
+ }
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx2>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ batch_type real = _mm256_permute4x64_pd(_mm256_unpacklo_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
+ batch_type imag = _mm256_permute4x64_pd(_mm256_unpackhi_pd(hi, lo), _MM_SHUFFLE(3, 1, 2, 0));
+ return { real, imag };
+ }
+ // mask
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
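+                // _mm256_movemask_epi8 yields one bit per byte, so each
+                // 16-bit element contributes two identical bits;
+                // detail::mask_lut compresses every group of eight byte-bits
+                // into four element-bits.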
+ uint64_t mask8 = 0xFFFFFFFF & (uint64_t)_mm256_movemask_epi8(self);
+ return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4) | (detail::mask_lut(mask8 >> 16) << 8) | (detail::mask_lut(mask8 >> 24) << 12);
+ }
+ else
+ {
+ return mask(self, avx {});
+ }
+ }
+
+ // max
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_max_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_max_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_max_epi32(self, other);
+ }
+ else
+ {
+ return max(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_max_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_max_epu16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_max_epu32(self, other);
+ }
+ else
+ {
+ return max(self, other, avx {});
+ }
+ }
+ }
+
+ // min
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_min_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_min_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_min_epi32(self, other);
+ }
+ else
+ {
+ return min(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_min_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_min_epu16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_min_epu32(self, other);
+ }
+ else
+ {
+ return min(self, other, avx {});
+ }
+ }
+ }
+
+ // mul
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ __m256i mask_hi = _mm256_set1_epi32(0xFF00FF00);
+ __m256i res_lo = _mm256_mullo_epi16(self, other);
+ __m256i other_hi = _mm256_srli_epi16(other, 8);
+ __m256i self_hi = _mm256_and_si256(self, mask_hi);
+ __m256i res_hi = _mm256_mullo_epi16(self_hi, other_hi);
+ __m256i res = _mm256_blendv_epi8(res_lo, res_hi, mask_hi);
+ return res;
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_mullo_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_mullo_epi32(self, other);
+ }
+ else
+ {
+ return mul(self, other, avx {});
+ }
+ }
+
+ // reduce_add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline T reduce_add(batch<T, A> const& self, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ __m256i tmp1 = _mm256_hadd_epi32(self, self);
+ __m256i tmp2 = _mm256_hadd_epi32(tmp1, tmp1);
+ __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
+ __m128i tmp4 = _mm_add_epi32(_mm256_castsi256_si128(tmp2), tmp3);
+ return _mm_cvtsi128_si32(tmp4);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ __m256i tmp1 = _mm256_shuffle_epi32(self, 0x0E);
+ __m256i tmp2 = _mm256_add_epi64(self, tmp1);
+ __m128i tmp3 = _mm256_extracti128_si256(tmp2, 1);
+ __m128i res = _mm_add_epi64(_mm256_castsi256_si128(tmp2), tmp3);
+#if defined(__x86_64__)
+ return _mm_cvtsi128_si64(res);
+#else
+ __m128i m;
+ _mm_storel_epi64(&m, res);
+ int64_t i;
+ std::memcpy(&i, &m, sizeof(i));
+ return i;
+#endif
+ }
+ else
+ {
+ return reduce_add(self, avx {});
+ }
+ }
+
+ // sadd
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_adds_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_adds_epi16(self, other);
+ }
+ else
+ {
+ return sadd(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_adds_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_adds_epu16(self, other);
+ }
+ else
+ {
+ return sadd(self, other, avx {});
+ }
+ }
+ }
+
+ // select
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_blendv_epi8(false_br, true_br, cond);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_blendv_epi8(false_br, true_br, cond);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_blendv_epi8(false_br, true_br, cond);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_blendv_epi8(false_br, true_br, cond);
+ }
+ else
+ {
+ return select(cond, true_br, false_br, avx {});
+ }
+ }
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept
+ {
+ constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+            // FIXME: for some reason the mask is not accepted as an immediate
+            // by _mm256_blend_epi16, although it is by _mm256_blend_epi32:
+            // case 2: return _mm256_blend_epi16(false_br, true_br, mask);
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_blend_epi32(false_br, true_br, mask);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
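+                // There is no 64-bit variant of _mm256_blend_epi32, so each
+                // 64-bit element is treated as a pair of 32-bit lanes:
+                // detail::interleave duplicates every mask bit into two
+                // adjacent bits before the 32-bit blend is applied.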
+ constexpr int imask = detail::interleave(mask);
+ return _mm256_blend_epi32(false_br, true_br, imask);
+ }
+ else
+ {
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
+ }
+ }
+
+ // slide_left
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx2>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
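+            // Shifts of 16 bytes or more cross the 128-bit lane boundary and
+            // are handled with _mm256_permute2x128_si256 (control 0x28 yields
+            // [0, x_lo]); smaller shifts combine an in-lane byte shift with
+            // the bytes that spill over into the high lane.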
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ auto y = _mm256_bslli_epi128(x, M);
+ return _mm256_permute2x128_si256(y, y, 0x28);
+ }
+ if (BitCount == 128)
+ {
+ return _mm256_permute2x128_si256(x, x, 0x28);
+ }
+ // shifting by [0, 128[ bits
+ constexpr unsigned M = BitCount / 8;
+ auto y = _mm256_bslli_epi128(x, M);
+ auto z = _mm256_bsrli_epi128(x, 16 - M);
+ auto w = _mm256_permute2x128_si256(z, z, 0x28);
+ return _mm256_or_si256(y, w);
+ }
+
+ // slide_right
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx2>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ auto y = _mm256_bsrli_epi128(x, M);
+ return _mm256_permute2x128_si256(y, y, 0x81);
+ }
+ if (BitCount == 128)
+ {
+ return _mm256_permute2x128_si256(x, x, 0x81);
+ }
+ // shifting by [0, 128[ bits
+ constexpr unsigned M = BitCount / 8;
+ auto y = _mm256_bsrli_epi128(x, M);
+ auto z = _mm256_bslli_epi128(x, 16 - M);
+ auto w = _mm256_permute2x128_si256(z, z, 0x81);
+ return _mm256_or_si256(y, w);
+ }
+
+ // ssub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_subs_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_subs_epi16(self, other);
+ }
+ else
+ {
+ return ssub(self, other, avx {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_subs_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_subs_epu16(self, other);
+ }
+ else
+ {
+ return ssub(self, other, avx {});
+ }
+ }
+ }
+
+ // sub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_sub_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_sub_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_sub_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_sub_epi64(self, other);
+ }
+ else
+ {
+ return sub(self, other, avx {});
+ }
+ }
+
+ // swizzle
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ {
+ return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+ return _mm256_permute4x64_pd(self, mask);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(V0, V1, V2, V3);
+ return _mm256_permute4x64_epi64(self, mask);
+ }
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept
+ {
+ return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {}));
+ }
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ {
+ return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask);
+ }
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept
+ {
+ return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {}));
+ }
+
+ // zip_hi
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto lo = _mm256_unpacklo_epi8(self, other);
+ auto hi = _mm256_unpackhi_epi8(self, other);
+ return _mm256_permute2f128_si256(lo, hi, 0x31);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto lo = _mm256_unpacklo_epi16(self, other);
+ auto hi = _mm256_unpackhi_epi16(self, other);
+ return _mm256_permute2f128_si256(lo, hi, 0x31);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_epi32(self, other);
+ auto hi = _mm256_unpackhi_epi32(self, other);
+ return _mm256_permute2f128_si256(lo, hi, 0x31);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_epi64(self, other);
+ auto hi = _mm256_unpackhi_epi64(self, other);
+ return _mm256_permute2f128_si256(lo, hi, 0x31);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ // zip_lo
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto lo = _mm256_unpacklo_epi8(self, other);
+ auto hi = _mm256_unpackhi_epi8(self, other);
+ return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto lo = _mm256_unpacklo_epi16(self, other);
+ auto hi = _mm256_unpackhi_epi16(self, other);
+ return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_epi32(self, other);
+ auto hi = _mm256_unpackhi_epi32(self, other);
+ return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_epi64(self, other);
+ auto hi = _mm256_unpackhi_epi64(self, other);
+ return _mm256_inserti128_si256(lo, _mm256_castsi256_si128(hi), 1);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp
new file mode 100644
index 0000000000..77182e1ef2
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp
@@ -0,0 +1,627 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512BW_HPP
+#define XSIMD_AVX512BW_HPP
+
+#include <array>
+#include <type_traits>
+
+#include "../types/xsimd_avx512bw_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ namespace detail
+ {
+ template <class A, class T, int Cmp>
+ inline batch_bool<T, A> compare_int_avx512bw(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return (register_type)_mm512_cmp_epi8_mask(self, other, Cmp);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return (register_type)_mm512_cmp_epi16_mask(self, other, Cmp);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return (register_type)_mm512_cmp_epu8_mask(self, other, Cmp);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return (register_type)_mm512_cmp_epu16_mask(self, other, Cmp);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
+ }
+ }
+ }
+ }
+
+ // abs
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512bw>) noexcept
+ {
+ if (std::is_unsigned<T>::value)
+ {
+ return self;
+ }
+
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_abs_epi8(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_abs_epi16(self);
+ }
+ else
+ {
+ return abs(self, avx512dq {});
+ }
+ }
+
+ // add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_add_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_add_epi16(self, other);
+ }
+ else
+ {
+ return add(self, other, avx512dq {});
+ }
+ }
+
+ // bitwise_lshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
+ {
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_sllv_epi16(self, _mm512_set1_epi16(other));
+#else
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_slli_epi16(self, other);
+#endif
+ }
+ else
+ {
+ return bitwise_lshift(self, other, avx512dq {});
+ }
+ }
+
+ // bitwise_rshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
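+                    // AVX512BW has no 8-bit arithmetic shift: shift whole
+                    // 16-bit lanes instead, clear the bits that leak from the
+                    // high byte into the low one (sign_mask), and re-insert
+                    // the sign bits of negative low bytes (cmp_sign_mask).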
+ __m512i sign_mask = _mm512_set1_epi16((0xFF00 >> other) & 0x00FF);
+ __m512i zeros = _mm512_setzero_si512();
+ __mmask64 cmp_is_negative_mask = _mm512_cmpgt_epi8_mask(zeros, self);
+ __m512i cmp_sign_mask = _mm512_mask_blend_epi8(cmp_is_negative_mask, zeros, sign_mask);
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+ __m512i res = _mm512_srav_epi16(self, _mm512_set1_epi16(other));
+#else
+ __m512i res = _mm512_srai_epi16(self, other);
+#endif
+ return _mm512_or_si512(cmp_sign_mask, _mm512_andnot_si512(sign_mask, res));
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_srav_epi16(self, _mm512_set1_epi16(other));
+#else
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_srai_epi16(self, other);
+#endif
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx512dq {});
+ }
+ }
+ else
+ {
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_srlv_epi16(self, _mm512_set1_epi16(other));
+#else
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_srli_epi16(self, other);
+#endif
+ }
+ else
+ {
+ return bitwise_rshift(self, other, avx512dq {});
+ }
+ }
+ }
+
+ // eq
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ return detail::compare_int_avx512bw<A, T, _MM_CMPINT_EQ>(self, other);
+ }
+
+ // ge
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GE>(self, other);
+ }
+
+ // gt
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ return detail::compare_int_avx512bw<A, T, _MM_CMPINT_GT>(self, other);
+ }
+
+ // le
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LE>(self, other);
+ }
+
+ // lt
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ return detail::compare_int_avx512bw<A, T, _MM_CMPINT_LT>(self, other);
+ }
+
+ // max
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_max_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_max_epi16(self, other);
+ }
+ else
+ {
+ return max(self, other, avx512dq {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_max_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_max_epu16(self, other);
+ }
+ else
+ {
+ return max(self, other, avx512dq {});
+ }
+ }
+ }
+
+ // min
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_min_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_min_epi16(self, other);
+ }
+ else
+ {
+ return min(self, other, avx512dq {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_min_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_min_epu16(self, other);
+ }
+ else
+ {
+ return min(self, other, avx512dq {});
+ }
+ }
+ }
+
+ // mul
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
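+                // Note: despite the names, `upper` holds the products of the
+                // low (even-indexed) bytes of each 16-bit lane, while `lower`
+                // holds those of the high (odd-indexed) bytes, shifted back
+                // into place before both halves are OR-ed together.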
+ __m512i upper = _mm512_and_si512(_mm512_mullo_epi16(self, other), _mm512_srli_epi16(_mm512_set1_epi16(-1), 8));
+ __m512i lower = _mm512_slli_epi16(_mm512_mullo_epi16(_mm512_srli_epi16(self, 8), _mm512_srli_epi16(other, 8)), 8);
+ return _mm512_or_si512(upper, lower);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_mullo_epi16(self, other);
+ }
+ else
+ {
+ return mul(self, other, avx512dq {});
+ }
+ }
+
+ // neq
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ return detail::compare_int_avx512bw<A, T, _MM_CMPINT_NE>(self, other);
+ }
+
+ // sadd
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_adds_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_adds_epi16(self, other);
+ }
+ else
+ {
+ return sadd(self, other, avx512dq {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_adds_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_adds_epu16(self, other);
+ }
+ else
+ {
+ return sadd(self, other, avx512dq {});
+ }
+ }
+ }
+
+ // select
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512bw>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_mask_blend_epi8(cond, false_br.data, true_br.data);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_mask_blend_epi16(cond, false_br.data, true_br.data);
+ }
+ else
+ {
+ return select(cond, true_br, false_br, avx512dq {});
+ }
+ }
+
+ // slide_left
+ namespace detail
+ {
+ template <size_t... Is>
+ constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_hi(::xsimd::detail::index_sequence<Is...>)
+ {
+ return { (Is == 0 ? 8 : Is - 1)... };
+ }
+
+ template <size_t N, size_t... Is>
+ constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_pattern(::xsimd::detail::index_sequence<Is...>)
+ {
+ return { (Is >= N ? Is - N : 0)... };
+ }
+ template <size_t N, size_t... Is>
+ constexpr std::array<uint16_t, sizeof...(Is)> make_slide_left_mask(::xsimd::detail::index_sequence<Is...>)
+ {
+ return { (Is >= N ? 0xFFFF : 0x0000)... };
+ }
+ }
+
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
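+            // An odd byte count is handled first: a one-byte shift is built
+            // from 64-bit lane shifts whose carry crosses lanes through
+            // _mm512_permutex2var_epi64; the remaining even count is then a
+            // 16-bit-element permute, with a mask zeroing the vacated slots.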
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 512)
+ {
+ return batch<T, A>(T(0));
+ }
+ batch<T, A> xx;
+ if (N & 1)
+ {
+ alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_hi(::xsimd::detail::make_index_sequence<512 / 64>());
+ __m512i xl = _mm512_slli_epi64(x, 8);
+ __m512i xr = _mm512_srli_epi64(x, 56);
+ xr = _mm512_permutex2var_epi64(xr, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
+ xx = _mm512_or_si512(xr, xl);
+ if (N == 1)
+ return xx;
+ }
+ else
+ {
+ xx = x;
+ }
+ alignas(A::alignment()) auto slide_pattern = detail::make_slide_left_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
+ alignas(A::alignment()) auto slide_mask = detail::make_slide_left_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
+ return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
+ }
+
+ // slide_right
+ namespace detail
+ {
+ template <size_t... Is>
+ constexpr std::array<uint64_t, sizeof...(Is)> make_slide_perm_low(::xsimd::detail::index_sequence<Is...>)
+ {
+ return { (Is + 1)... };
+ }
+
+ template <size_t N, size_t... Is>
+ constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_pattern(::xsimd::detail::index_sequence<Is...>)
+ {
+ return { (Is < (32 - N) ? Is + N : 0)... };
+ }
+ template <size_t N, size_t... Is>
+ constexpr std::array<uint16_t, sizeof...(Is)> make_slide_right_mask(::xsimd::detail::index_sequence<Is...>)
+ {
+ return { (Is < 32 - N ? 0xFFFF : 0x0000)... };
+ }
+ }
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx512bw>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 512)
+ {
+ return batch<T, A>(T(0));
+ }
+ batch<T, A> xx;
+ if (N & 1)
+ {
+ alignas(A::alignment()) auto slide_perm = detail::make_slide_perm_low(::xsimd::detail::make_index_sequence<512 / 64>());
+ __m512i xr = _mm512_srli_epi64(x, 8);
+ __m512i xl = _mm512_slli_epi64(x, 56);
+ xl = _mm512_permutex2var_epi64(xl, _mm512_load_epi64(slide_perm.data()), _mm512_setzero_si512());
+ xx = _mm512_or_si512(xr, xl);
+ if (N == 1)
+ return xx;
+ }
+ else
+ {
+ xx = x;
+ }
+ alignas(A::alignment()) auto slide_pattern = detail::make_slide_right_pattern<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
+ alignas(A::alignment()) auto slide_mask = detail::make_slide_right_mask<N / 2>(::xsimd::detail::make_index_sequence<512 / 16>());
+ return _mm512_and_si512(_mm512_permutexvar_epi16(_mm512_load_epi32(slide_pattern.data()), xx), _mm512_load_epi32(slide_mask.data()));
+ }
+
+ // ssub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_subs_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_subs_epi16(self, other);
+ }
+ else
+ {
+ return ssub(self, other, avx512dq {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_subs_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_subs_epu16(self, other);
+ }
+ else
+ {
+ return ssub(self, other, avx512dq {});
+ }
+ }
+ }
+
+ // sub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_sub_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_sub_epi16(self, other);
+ }
+ else
+ {
+ return sub(self, other, avx512dq {});
+ }
+ }
+
+ // swizzle
+
+ template <class A, uint16_t... Vs>
+ inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+ {
+ return _mm512_permutexvar_epi16((batch<uint16_t, A>)mask, self);
+ }
+
+ template <class A, uint16_t... Vs>
+ inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+ {
+ return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512bw {}));
+ }
+
+ template <class A, uint8_t... Vs>
+ inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+ {
+ return _mm512_shuffle_epi8(self, (batch<uint8_t, A>)mask);
+ }
+
+ template <class A, uint8_t... Vs>
+ inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept
+ {
+ return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, avx512bw {}));
+ }
+
+ // zip_hi
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ __m512i lo, hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ lo = _mm512_unpacklo_epi8(self, other);
+ hi = _mm512_unpackhi_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ lo = _mm512_unpacklo_epi16(self, other);
+ hi = _mm512_unpackhi_epi16(self, other);
+ }
+ else
+ {
+ return zip_hi(self, other, avx512f {});
+ }
+ return _mm512_inserti32x4(
+ _mm512_inserti32x4(
+ _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
+ _mm512_extracti32x4_epi32(lo, 3),
+ 2),
+ _mm512_extracti32x4_epi32(hi, 2),
+ 1);
+ }
+
+ // zip_lo
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ __m512i lo, hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ lo = _mm512_unpacklo_epi8(self, other);
+ hi = _mm512_unpackhi_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ lo = _mm512_unpacklo_epi16(self, other);
+ hi = _mm512_unpackhi_epi16(self, other);
+ }
+ else
+ {
+ return zip_lo(self, other, avx512f {});
+ }
+ return _mm512_inserti32x4(
+ _mm512_inserti32x4(
+ _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
+ _mm512_extracti32x4_epi32(hi, 1),
+ 3),
+ _mm512_extracti32x4_epi32(lo, 1),
+ 2);
+ }
+ }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp
new file mode 100644
index 0000000000..95f3f1df8f
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512cd.hpp
@@ -0,0 +1,28 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512CD_HPP
+#define XSIMD_AVX512CD_HPP
+
+#include "../types/xsimd_avx512cd_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ // Nothing there yet.
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
new file mode 100644
index 0000000000..7840ea8fc5
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512dq.hpp
@@ -0,0 +1,212 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512DQ_HPP
+#define XSIMD_AVX512DQ_HPP
+
+#include "../types/xsimd_avx512dq_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // bitwise_and
+ template <class A>
+ inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_and_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_and_pd(self, other);
+ }
+
+ // bitwise_andnot
+ template <class A>
+ inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_andnot_ps(other, self);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_andnot_pd(other, self);
+ }
+
+ // bitwise_not
+ template <class A>
+        inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_xor_ps(self, _mm512_castsi512_ps(_mm512_set1_epi32(-1)));
+ }
+ template <class A>
+        inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_xor_pd(self, _mm512_castsi512_pd(_mm512_set1_epi32(-1)));
+ }
+
+ // bitwise_or
+ template <class A>
+ inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_or_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_or_pd(self, other);
+ }
+
+ template <class A, class T>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return register_type(self.data | other.data);
+ }
+
+ // bitwise_xor
+ template <class A>
+ inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_xor_pd(self, other);
+ }
+
+ // haddp
+ template <class A>
+ inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512dq>) noexcept
+ {
+ // The following folds over the vector once:
+ // tmp1 = [a0..8, b0..8]
+ // tmp2 = [a8..f, b8..f]
+#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
+ batch<float, avx512f> res##I; \
+ { \
+ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
+ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
+ res##I = _mm512_add_ps(tmp1, tmp2); \
+ }
+
+ XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
+ XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
+ XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
+ XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
+ XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
+ XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
+ XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
+ XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
+
+#undef XSIMD_AVX512_HADDP_STEP1
+
+        // The following folds the results and shuffles them so that hadd_ps produces the correct result
+ // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
+ // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
+ // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
+#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
+ batch<float, avx2> halfx##I; \
+ { \
+ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
+ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
+ \
+ auto resx1 = _mm512_add_ps(tmp1, tmp2); \
+ \
+ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
+ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
+ \
+ auto resx2 = _mm512_add_ps(tmp3, tmp4); \
+ \
+ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
+ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
+ \
+ auto resx3 = _mm512_add_ps(tmp5, tmp6); \
+ \
+ halfx##I = _mm256_hadd_ps(_mm512_extractf32x8_ps(resx3, 0), \
+ _mm512_extractf32x8_ps(resx3, 1)); \
+ }
+
+ XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
+ XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
+
+#undef XSIMD_AVX512_HADDP_STEP2
+
+ auto concat = _mm512_castps256_ps512(halfx0);
+ concat = _mm512_insertf32x8(concat, halfx1, 1);
+ return concat;
+ }
+
+ // ldexp
+ template <class A>
+ inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_scalef_pd(self, _mm512_cvtepi64_pd(other));
+ }
+
+ // mul
+ template <class A>
+ inline batch<uint64_t, A> mul(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_mullo_epi64(self, other);
+ }
+
+ template <class A>
+ inline batch<int64_t, A> mul(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_mullo_epi64(self, other);
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
+ requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_cvtpd_epi64(self);
+ }
+
+ // reduce_add
+ template <class A>
+        inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512dq>) noexcept
+ {
+ __m256 tmp1 = _mm512_extractf32x8_ps(rhs, 1);
+ __m256 tmp2 = _mm512_extractf32x8_ps(rhs, 0);
+ __m256 res1 = _mm256_add_ps(tmp1, tmp2);
+ return reduce_add(batch<float, avx2>(res1), avx2 {});
+ }
+
+ // convert
+ namespace detail
+ {
+ template <class A>
+ inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<avx512dq>) noexcept
+ {
+            return _mm512_cvtepi64_pd(x);
+ }
+
+ template <class A>
+ inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<avx512dq>) noexcept
+ {
+ return _mm512_cvttpd_epi64(self);
+ }
+
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp
new file mode 100644
index 0000000000..7eea894137
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp
@@ -0,0 +1,2028 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512F_HPP
+#define XSIMD_AVX512F_HPP
+
+#include <complex>
+#include <limits>
+#include <type_traits>
+
+#include "../types/xsimd_avx512f_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ namespace detail
+ {
+ inline void split_avx512(__m512 val, __m256& low, __m256& high) noexcept
+ {
+ low = _mm512_castps512_ps256(val);
+ high = _mm512_extractf32x8_ps(val, 1);
+ }
+ inline void split_avx512(__m512d val, __m256d& low, __m256d& high) noexcept
+ {
+ low = _mm512_castpd512_pd256(val);
+ high = _mm512_extractf64x4_pd(val, 1);
+ }
+ inline void split_avx512(__m512i val, __m256i& low, __m256i& high) noexcept
+ {
+ low = _mm512_castsi512_si256(val);
+ high = _mm512_extracti64x4_epi64(val, 1);
+ }
+ inline __m512i merge_avx(__m256i low, __m256i high) noexcept
+ {
+ return _mm512_inserti64x4(_mm512_castsi256_si512(low), high, 1);
+ }
+ inline __m512 merge_avx(__m256 low, __m256 high) noexcept
+ {
+ return _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castpd256_pd512(_mm256_castps_pd(low)), _mm256_castps_pd(high), 1));
+ }
+ inline __m512d merge_avx(__m256d low, __m256d high) noexcept
+ {
+ return _mm512_insertf64x4(_mm512_castpd256_pd512(low), high, 1);
+ }
+ template <class F>
+ __m512i fwd_to_avx(F f, __m512i self)
+ {
+ __m256i self_low, self_high;
+ split_avx512(self, self_low, self_high);
+ __m256i res_low = f(self_low);
+ __m256i res_high = f(self_high);
+ return merge_avx(res_low, res_high);
+ }
+ template <class F>
+ __m512i fwd_to_avx(F f, __m512i self, __m512i other)
+ {
+ __m256i self_low, self_high, other_low, other_high;
+ split_avx512(self, self_low, self_high);
+ split_avx512(other, other_low, other_high);
+ __m256i res_low = f(self_low, other_low);
+ __m256i res_high = f(self_high, other_high);
+ return merge_avx(res_low, res_high);
+ }
+ template <class F>
+ __m512i fwd_to_avx(F f, __m512i self, int32_t other)
+ {
+ __m256i self_low, self_high;
+ split_avx512(self, self_low, self_high);
+ __m256i res_low = f(self_low, other);
+ __m256i res_high = f(self_high, other);
+ return merge_avx(res_low, res_high);
+ }
+ }
+ namespace detail
+ {
+
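+            // Interleaves the bits of x (even positions) and y (odd
+            // positions) through a 256-entry lookup table. This is used below
+            // to merge the comparison masks of the low and high 16-bit halves
+            // of each 32-bit lane into one bit per 16-bit element.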
+ inline uint32_t morton(uint16_t x, uint16_t y) noexcept
+ {
+
+ static const unsigned short MortonTable256[256] = {
+ 0x0000, 0x0001, 0x0004, 0x0005, 0x0010, 0x0011, 0x0014, 0x0015,
+ 0x0040, 0x0041, 0x0044, 0x0045, 0x0050, 0x0051, 0x0054, 0x0055,
+ 0x0100, 0x0101, 0x0104, 0x0105, 0x0110, 0x0111, 0x0114, 0x0115,
+ 0x0140, 0x0141, 0x0144, 0x0145, 0x0150, 0x0151, 0x0154, 0x0155,
+ 0x0400, 0x0401, 0x0404, 0x0405, 0x0410, 0x0411, 0x0414, 0x0415,
+ 0x0440, 0x0441, 0x0444, 0x0445, 0x0450, 0x0451, 0x0454, 0x0455,
+ 0x0500, 0x0501, 0x0504, 0x0505, 0x0510, 0x0511, 0x0514, 0x0515,
+ 0x0540, 0x0541, 0x0544, 0x0545, 0x0550, 0x0551, 0x0554, 0x0555,
+ 0x1000, 0x1001, 0x1004, 0x1005, 0x1010, 0x1011, 0x1014, 0x1015,
+ 0x1040, 0x1041, 0x1044, 0x1045, 0x1050, 0x1051, 0x1054, 0x1055,
+ 0x1100, 0x1101, 0x1104, 0x1105, 0x1110, 0x1111, 0x1114, 0x1115,
+ 0x1140, 0x1141, 0x1144, 0x1145, 0x1150, 0x1151, 0x1154, 0x1155,
+ 0x1400, 0x1401, 0x1404, 0x1405, 0x1410, 0x1411, 0x1414, 0x1415,
+ 0x1440, 0x1441, 0x1444, 0x1445, 0x1450, 0x1451, 0x1454, 0x1455,
+ 0x1500, 0x1501, 0x1504, 0x1505, 0x1510, 0x1511, 0x1514, 0x1515,
+ 0x1540, 0x1541, 0x1544, 0x1545, 0x1550, 0x1551, 0x1554, 0x1555,
+ 0x4000, 0x4001, 0x4004, 0x4005, 0x4010, 0x4011, 0x4014, 0x4015,
+ 0x4040, 0x4041, 0x4044, 0x4045, 0x4050, 0x4051, 0x4054, 0x4055,
+ 0x4100, 0x4101, 0x4104, 0x4105, 0x4110, 0x4111, 0x4114, 0x4115,
+ 0x4140, 0x4141, 0x4144, 0x4145, 0x4150, 0x4151, 0x4154, 0x4155,
+ 0x4400, 0x4401, 0x4404, 0x4405, 0x4410, 0x4411, 0x4414, 0x4415,
+ 0x4440, 0x4441, 0x4444, 0x4445, 0x4450, 0x4451, 0x4454, 0x4455,
+ 0x4500, 0x4501, 0x4504, 0x4505, 0x4510, 0x4511, 0x4514, 0x4515,
+ 0x4540, 0x4541, 0x4544, 0x4545, 0x4550, 0x4551, 0x4554, 0x4555,
+ 0x5000, 0x5001, 0x5004, 0x5005, 0x5010, 0x5011, 0x5014, 0x5015,
+ 0x5040, 0x5041, 0x5044, 0x5045, 0x5050, 0x5051, 0x5054, 0x5055,
+ 0x5100, 0x5101, 0x5104, 0x5105, 0x5110, 0x5111, 0x5114, 0x5115,
+ 0x5140, 0x5141, 0x5144, 0x5145, 0x5150, 0x5151, 0x5154, 0x5155,
+ 0x5400, 0x5401, 0x5404, 0x5405, 0x5410, 0x5411, 0x5414, 0x5415,
+ 0x5440, 0x5441, 0x5444, 0x5445, 0x5450, 0x5451, 0x5454, 0x5455,
+ 0x5500, 0x5501, 0x5504, 0x5505, 0x5510, 0x5511, 0x5514, 0x5515,
+ 0x5540, 0x5541, 0x5544, 0x5545, 0x5550, 0x5551, 0x5554, 0x5555
+ };
+
+ uint32_t z = MortonTable256[y >> 8] << 17 | MortonTable256[x >> 8] << 16 | MortonTable256[y & 0xFF] << 1 | MortonTable256[x & 0xFF];
+ return z;
+ }
+
+ template <class A, class T, int Cmp>
+ inline batch_bool<T, A> compare_int_avx512f(batch<T, A> const& self, batch<T, A> const& other) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ // shifting to take sign into account
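+                    // Each byte position k of every dword is compared with a
+                    // separate 32-bit compare, shifted left so the byte's sign
+                    // bit lands in the dword's sign bit; bit i of mask k is
+                    // then moved to position 4 * i + k of the 64-bit result.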
+ uint64_t mask_low0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x000000FF)) << 24,
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x000000FF)) << 24,
+ Cmp);
+ uint64_t mask_low1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FF00)) << 16,
+ Cmp);
+ uint64_t mask_high0 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x00FF0000)) << 8,
+ Cmp);
+ uint64_t mask_high1 = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFF000000)),
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFF000000)),
+ Cmp);
+ uint64_t mask = 0;
+ for (unsigned i = 0; i < 16; ++i)
+ {
+ mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+ mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+ mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+ mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+ }
+ return (register_type)mask;
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ // shifting to take sign into account
+ uint16_t mask_low = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0x0000FFFF)) << 16,
+ Cmp);
+ uint16_t mask_high = _mm512_cmp_epi32_mask((batch<int32_t, A>(self.data) & batch<int32_t, A>(0xFFFF0000)),
+ (batch<int32_t, A>(other.data) & batch<int32_t, A>(0xFFFF0000)),
+ Cmp);
+ return static_cast<register_type>(morton(mask_low, mask_high));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return (register_type)_mm512_cmp_epi32_mask(self, other, Cmp);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return (register_type)_mm512_cmp_epi64_mask(self, other, Cmp);
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ uint64_t mask_low0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x000000FF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x000000FF)), Cmp);
+ uint64_t mask_low1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FF00)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FF00)), Cmp);
+ uint64_t mask_high0 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x00FF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x00FF0000)), Cmp);
+ uint64_t mask_high1 = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFF000000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFF000000)), Cmp);
+ uint64_t mask = 0;
+ for (unsigned i = 0; i < 16; ++i)
+ {
+ mask |= (mask_low0 & (uint64_t(1) << i)) << (3 * i + 0);
+ mask |= (mask_low1 & (uint64_t(1) << i)) << (3 * i + 1);
+ mask |= (mask_high0 & (uint64_t(1) << i)) << (3 * i + 2);
+ mask |= (mask_high1 & (uint64_t(1) << i)) << (3 * i + 3);
+ }
+ return (register_type)mask;
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ uint16_t mask_low = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0x0000FFFF)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0x0000FFFF)), Cmp);
+ uint16_t mask_high = _mm512_cmp_epu32_mask((batch<uint32_t, A>(self.data) & batch<uint32_t, A>(0xFFFF0000)), (batch<uint32_t, A>(other.data) & batch<uint32_t, A>(0xFFFF0000)), Cmp);
+ return static_cast<register_type>(morton(mask_low, mask_high));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return (register_type)_mm512_cmp_epu32_mask(self, other, Cmp);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return (register_type)_mm512_cmp_epu64_mask(self, other, Cmp);
+ }
+ }
+ }
+ }
+
+ // abs
+ template <class A>
+ inline batch<float, A> abs(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ __m512 self_asf = (__m512)self;
+ __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asf);
+ __m512i res_asi = _mm512_and_epi32(_mm512_set1_epi32(0x7FFFFFFF), self_asi);
+ return *reinterpret_cast<__m512*>(&res_asi);
+ }
+ template <class A>
+ inline batch<double, A> abs(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ __m512d self_asd = (__m512d)self;
+ __m512i self_asi = *reinterpret_cast<__m512i*>(&self_asd);
+ __m512i res_asi = _mm512_and_epi64(_mm512_set1_epi64(0x7FFFFFFFFFFFFFFF),
+ self_asi);
+ return *reinterpret_cast<__m512d*>(&res_asi);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> abs(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ if (std::is_unsigned<T>::value)
+ {
+ return self;
+ }
+
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return detail::fwd_to_avx([](__m256i s) noexcept
+ { return abs(batch<T, avx2>(s)); },
+ self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return detail::fwd_to_avx([](__m256i s) noexcept
+ { return abs(batch<T, avx2>(s)); },
+ self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_abs_epi32(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_abs_epi64(self);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ // add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return add(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return add(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_add_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_add_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_add_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_add_pd(self, other);
+ }
+
+ // all
+ template <class A, class T>
+ inline bool all(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return self.data == register_type(-1);
+ }
+
+ // any
+ template <class A, class T>
+ inline bool any(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return self.data != register_type(0);
+ }
+
+ // batch_bool_cast
+ template <class A, class T_out, class T_in>
+ inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return self.data;
+ }
+
+ // bitwise_and
+ template <class A>
+ inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+#if defined(_MSC_VER)
+ return _mm512_and_ps(self, other);
+#else
+ return _mm512_castsi512_ps(_mm512_and_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
+#endif
+ }
+ template <class A>
+ inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_pd(_mm512_and_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_and_si512(self, other);
+ }
+
+ template <class A, class T>
+ inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return register_type(self.data & other.data);
+ }
+
+ // bitwise_andnot
+ template <class A>
+ inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_ps(_mm512_andnot_si512(_mm512_castps_si512(other), _mm512_castps_si512(self)));
+ }
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_pd(_mm512_andnot_si512(_mm512_castpd_si512(other), _mm512_castpd_si512(self)));
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_andnot_si512(other, self);
+ }
+
+ template <class A, class T>
+ inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return register_type(self.data & ~other.data);
+ }
+
+ // bitwise_lshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+ __m512i tmp = _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
+#else
+ __m512i tmp = _mm512_slli_epi32(self, other);
+#endif
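+ // The 32-bit shift lets bits cross byte boundaries, so mask each byte with
+ // (0xFF << other) to drop the bits shifted in from the neighbouring lane.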
+ return _mm512_and_si512(_mm512_set1_epi8(0xFF << other), tmp);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
+ { return bitwise_lshift(batch<T, avx2>(s), o, avx2 {}); },
+ self, other);
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_sllv_epi32(self, _mm512_set1_epi32(other));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_sllv_epi64(self, _mm512_set1_epi64(other));
+#else
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_slli_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_slli_epi64(self, other);
+#endif
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ // bitwise_not
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_xor_si512(self, _mm512_set1_epi32(-1));
+ }
+ template <class A, class T>
+ inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return register_type(~self.data);
+ }
+
+ template <class A>
+ inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_set1_epi32(-1)));
+ }
+
+ // bitwise_or
+ template <class A>
+ inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_ps(_mm512_or_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
+ }
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_pd(_mm512_or_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
+ }
+
+ template <class A, class T>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return register_type(self.data | other.data);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_or_si512(self, other);
+ }
+
+ // bitwise_rshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx512f>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_srav_epi32(self, _mm512_set1_epi32(other));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_srav_epi64(self, _mm512_set1_epi64(other));
+#else
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_srai_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_srai_epi64(self, other);
+#endif
+ }
+ else
+ {
+ return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
+ { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
+ self, other);
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+ __m512i tmp = _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
+#else
+ __m512i tmp = _mm512_srli_epi32(self, other);
+#endif
+ return _mm512_and_si512(_mm512_set1_epi8(0xFF >> other), tmp);
+#if defined(XSIMD_AVX512_SHIFT_INTRINSICS_IMM_ONLY)
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_srlv_epi32(self, _mm512_set1_epi32(other));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_srlv_epi64(self, _mm512_set1_epi64(other));
+#else
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_srli_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_srli_epi64(self, other);
+#endif
+ }
+ else
+ {
+ return detail::fwd_to_avx([](__m256i s, int32_t o) noexcept
+ { return bitwise_rshift(batch<T, avx2>(s), o, avx2 {}); },
+ self, other);
+ }
+ }
+ }
+
+ // bitwise_xor
+ template <class A>
+ inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_ps(_mm512_xor_si512(_mm512_castps_si512(self), _mm512_castps_si512(other)));
+ }
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_pd(_mm512_xor_si512(_mm512_castpd_si512(self), _mm512_castpd_si512(other)));
+ }
+
+ template <class A, class T>
+ inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return register_type(self.data ^ other.data);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_xor_si512(self, other);
+ }
+
+ // bitwise_cast
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_ps(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castsi512_pd(self);
+ }
+ template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
+ inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return batch<Tp, A>(self.data);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castps_pd(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castps_si512(self);
+ }
+ template <class A>
+ inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castpd_ps(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_castpd_si512(self);
+ }
+
+ // broadcast
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> broadcast(T val, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_set1_epi8(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_set1_epi16(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_set1_epi32(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_set1_epi64(val);
+ }
+ else
+ {
+ assert(false && "unsupported");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<float, A> broadcast(float val, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_set1_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> broadcast(double val, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_set1_pd(val);
+ }
+
+ // ceil
+ template <class A>
+ inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_ps(self, _MM_FROUND_TO_POS_INF);
+ }
+ template <class A>
+ inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_pd(self, _MM_FROUND_TO_POS_INF);
+ }
+
+ // convert
+ namespace detail
+ {
+ template <class A>
+ inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvtepi32_ps(self);
+ }
+
+ template <class A>
+ inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvttps_epi32(self);
+ }
+
+ template <class A>
+ inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvtepu32_ps(self);
+ }
+
+ template <class A>
+ inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvttps_epu32(self);
+ }
+ }
+
+ namespace detail
+ {
+ // complex_low
+ template <class A>
+ inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
+ {
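+ // permutex2var: indices 0-15 pick from real(), 16-31 from imag(),
+ // interleaving them into [r0, i0, r1, i1, ...].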
+ __m512i idx = _mm512_setr_epi32(0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23);
+ return _mm512_permutex2var_ps(self.real(), idx, self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ __m512i idx = _mm512_setr_epi64(0, 8, 1, 9, 2, 10, 3, 11);
+ return _mm512_permutex2var_pd(self.real(), idx, self.imag());
+ }
+
+ // complex_high
+ template <class A>
+ inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ __m512i idx = _mm512_setr_epi32(8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31);
+ return _mm512_permutex2var_ps(self.real(), idx, self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ __m512i idx = _mm512_setr_epi64(4, 12, 5, 13, 6, 14, 7, 15);
+ return _mm512_permutex2var_pd(self.real(), idx, self.imag());
+ }
+ }
+
+ // div
+ template <class A>
+ inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_div_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_div_pd(self, other);
+ }
+
+ // eq
+ template <class A>
+ inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_ps_mask(self, other, _CMP_EQ_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_pd_mask(self, other, _CMP_EQ_OQ);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_EQ>(self, other);
+ }
+ template <class A, class T>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return register_type(~self.data ^ other.data);
+ }
+
+ // floor
+ template <class A>
+ inline batch<float, A> floor(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_ps(self, _MM_FROUND_TO_NEG_INF);
+ }
+ template <class A>
+ inline batch<double, A> floor(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_pd(self, _MM_FROUND_TO_NEG_INF);
+ }
+
+ // fnma
+ template <class A>
+ inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fnmadd_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fnmadd_pd(x, y, z);
+ }
+
+ // fma
+ template <class A>
+ inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fmadd_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fmadd_pd(x, y, z);
+ }
+
+ // fms
+ template <class A>
+ inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fmsub_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_fmsub_pd(x, y, z);
+ }
+
+ // from bool
+ template <class A, class T>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return select(self, batch<T, A>(1), batch<T, A>(0));
+ }
+
+ // from_mask
+ template <class T, class A>
+ inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx512f>) noexcept
+ {
+ return static_cast<typename batch_bool<T, A>::register_type>(mask);
+ }
+
+ // gather
+ template <class T, class A, class U, detail::enable_sized_integral_t<T, 4> = 0, detail::enable_sized_integral_t<U, 4> = 0>
+ inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ return _mm512_i32gather_epi32(index, static_cast<const void*>(src), sizeof(T));
+ }
+
+ template <class T, class A, class U, detail::enable_sized_integral_t<T, 8> = 0, detail::enable_sized_integral_t<U, 8> = 0>
+ inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ return _mm512_i64gather_epi64(index, static_cast<const void*>(src), sizeof(T));
+ }
+
+ template <class A, class U, detail::enable_sized_integral_t<U, 4> = 0>
+ inline batch<float, A> gather(batch<float, A> const&, float const* src,
+ batch<U, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ return _mm512_i32gather_ps(index, src, sizeof(float));
+ }
+
+ template <class A, class U, detail::enable_sized_integral_t<U, 8> = 0>
+ inline batch<double, A>
+ gather(batch<double, A> const&, double const* src, batch<U, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ return _mm512_i64gather_pd(index, src, sizeof(double));
+ }
+
+ // gather: handmade conversions
+ template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+ inline batch<float, A> gather(batch<float, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx512f>) noexcept
+ {
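+ // Split the 16 32-bit indices into two 8-lane halves, gather doubles for
+ // each half, narrow them to float and merge the two 256-bit results.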
+ const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
+ const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
+ return detail::merge_avx(_mm512_cvtpd_ps(low.data), _mm512_cvtpd_ps(high.data));
+ }
+
+ template <class A, class V, detail::enable_sized_integral_t<V, 4> = 0>
+ inline batch<int32_t, A> gather(batch<int32_t, A> const&, double const* src,
+ batch<V, A> const& index,
+ requires_arch<avx512f>) noexcept
+ {
+ const batch<double, A> low(_mm512_i32gather_pd(_mm512_castsi512_si256(index.data), src, sizeof(double)));
+ const batch<double, A> high(_mm512_i32gather_pd(_mm256_castpd_si256(_mm512_extractf64x4_pd(_mm512_castsi512_pd(index.data), 1)), src, sizeof(double)));
+ return detail::merge_avx(_mm512_cvtpd_epi32(low.data), _mm512_cvtpd_epi32(high.data));
+ }
+
+ // ge
+ template <class A>
+ inline batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_ps_mask(self, other, _CMP_GE_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_pd_mask(self, other, _CMP_GE_OQ);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_GE>(self, other);
+ }
+
+ // gt
+ template <class A>
+ inline batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_ps_mask(self, other, _CMP_GT_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_pd_mask(self, other, _CMP_GT_OQ);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_GT>(self, other);
+ }
+
+ // haddp
+ template <class A>
+ inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx512f>) noexcept
+ {
+ // The following folds over the vector once:
+ // tmp1 = [a0..8, b0..8]
+ // tmp2 = [a8..f, b8..f]
+#define XSIMD_AVX512_HADDP_STEP1(I, a, b) \
+ batch<float, avx512f> res##I; \
+ { \
+ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
+ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
+ res##I = _mm512_add_ps(tmp1, tmp2); \
+ }
+
+ XSIMD_AVX512_HADDP_STEP1(0, row[0], row[2]);
+ XSIMD_AVX512_HADDP_STEP1(1, row[4], row[6]);
+ XSIMD_AVX512_HADDP_STEP1(2, row[1], row[3]);
+ XSIMD_AVX512_HADDP_STEP1(3, row[5], row[7]);
+ XSIMD_AVX512_HADDP_STEP1(4, row[8], row[10]);
+ XSIMD_AVX512_HADDP_STEP1(5, row[12], row[14]);
+ XSIMD_AVX512_HADDP_STEP1(6, row[9], row[11]);
+ XSIMD_AVX512_HADDP_STEP1(7, row[13], row[15]);
+
+#undef XSIMD_AVX512_HADDP_STEP1
+
+ // The following folds the vector once more and shuffles so that hadd_ps produces the correct result
+ // tmp1 = [a0..4, a8..12, b0..4, b8..12] (same for tmp3)
+ // tmp2 = [a5..8, a12..16, b5..8, b12..16] (same for tmp4)
+ // tmp5 = [r1[0], r1[2], r2[0], r2[2], r1[4], r1[6] ...
+#define XSIMD_AVX512_HADDP_STEP2(I, a, b, c, d) \
+ batch<float, avx2> halfx##I; \
+ { \
+ auto tmp1 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(2, 0, 2, 0)); \
+ auto tmp2 = _mm512_shuffle_f32x4(a, b, _MM_SHUFFLE(3, 1, 3, 1)); \
+ \
+ auto resx1 = _mm512_add_ps(tmp1, tmp2); \
+ \
+ auto tmp3 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(2, 0, 2, 0)); \
+ auto tmp4 = _mm512_shuffle_f32x4(c, d, _MM_SHUFFLE(3, 1, 3, 1)); \
+ \
+ auto resx2 = _mm512_add_ps(tmp3, tmp4); \
+ \
+ auto tmp5 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(2, 0, 2, 0)); \
+ auto tmp6 = _mm512_shuffle_ps(resx1, resx2, _MM_SHUFFLE(3, 1, 3, 1)); \
+ \
+ auto resx3 = _mm512_add_ps(tmp5, tmp6); \
+ \
+ halfx##I = _mm256_hadd_ps(_mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 0)), _mm512_extractf32x4_ps(resx3, 1), 1), \
+ _mm256_insertf128_ps(_mm256_castps128_ps256(_mm512_extractf32x4_ps(resx3, 2)), _mm512_extractf32x4_ps(resx3, 3), 1)); \
+ }
+
+ XSIMD_AVX512_HADDP_STEP2(0, res0, res1, res2, res3);
+ XSIMD_AVX512_HADDP_STEP2(1, res4, res5, res6, res7);
+
+#undef XSIMD_AVX512_HADDP_STEP2
+
+ auto concat = _mm512_castps256_ps512(halfx0);
+ concat = _mm512_castpd_ps(_mm512_insertf64x4(_mm512_castps_pd(concat), _mm256_castps_pd(halfx1), 1));
+ return concat;
+ }
+
+ template <class A>
+ inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx512f>) noexcept
+ {
+#define step1(I, a, b) \
+ batch<double, avx512f> res##I; \
+ { \
+ auto tmp1 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(1, 0, 1, 0)); \
+ auto tmp2 = _mm512_shuffle_f64x2(a, b, _MM_SHUFFLE(3, 2, 3, 2)); \
+ res##I = _mm512_add_pd(tmp1, tmp2); \
+ }
+
+ step1(1, row[0], row[2]);
+ step1(2, row[4], row[6]);
+ step1(3, row[1], row[3]);
+ step1(4, row[5], row[7]);
+
+#undef step1
+
+ auto tmp5 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(2, 0, 2, 0));
+ auto tmp6 = _mm512_shuffle_f64x2(res1, res2, _MM_SHUFFLE(3, 1, 3, 1));
+
+ auto resx1 = _mm512_add_pd(tmp5, tmp6);
+
+ auto tmp7 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(2, 0, 2, 0));
+ auto tmp8 = _mm512_shuffle_f64x2(res3, res4, _MM_SHUFFLE(3, 1, 3, 1));
+
+ auto resx2 = _mm512_add_pd(tmp7, tmp8);
+
+ auto tmpx = _mm512_shuffle_pd(resx1, resx2, 0b00000000);
+ auto tmpy = _mm512_shuffle_pd(resx1, resx2, 0b11111111);
+
+ return _mm512_add_pd(tmpx, tmpy);
+ }
+
+ // isnan
+ template <class A>
+ inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_ps_mask(self, self, _CMP_UNORD_Q);
+ }
+ template <class A>
+ inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_pd_mask(self, self, _CMP_UNORD_Q);
+ }
+
+ // ldexp
+ template <class A>
+ inline batch<float, A> ldexp(const batch<float, A>& self, const batch<as_integer_t<float>, A>& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_scalef_ps(self, _mm512_cvtepi32_ps(other));
+ }
+
+ template <class A>
+ inline batch<double, A> ldexp(const batch<double, A>& self, const batch<as_integer_t<double>, A>& other, requires_arch<avx512f>) noexcept
+ {
+ // FIXME: potential data loss here when narrowing the elements of
+ // other to int32 before converting them back to double.
+ __m512d adjusted_index = _mm512_cvtepi32_pd(_mm512_cvtepi64_epi32(other));
+ return _mm512_scalef_pd(self, adjusted_index);
+ }
+
+ // le
+ template <class A>
+ inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_ps_mask(self, other, _CMP_LE_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_pd_mask(self, other, _CMP_LE_OQ);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_LE>(self, other);
+ }
+
+ // load_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_load_si512((__m512i const*)mem);
+ }
+ template <class A>
+ inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_load_ps(mem);
+ }
+ template <class A>
+ inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_load_pd(mem);
+ }
+
+ // load_complex
+ namespace detail
+ {
+ template <class A>
+ inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx512f>) noexcept
+ {
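+ // Even lanes across (hi, lo) hold the real parts and odd lanes the
+ // imaginary parts; permutex2var deinterleaves both in a single pass.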
+ __m512i real_idx = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30);
+ __m512i imag_idx = _mm512_setr_epi32(1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31);
+ auto real = _mm512_permutex2var_ps(hi, real_idx, lo);
+ auto imag = _mm512_permutex2var_ps(hi, imag_idx, lo);
+ return { real, imag };
+ }
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx512f>) noexcept
+ {
+ __m512i real_idx = _mm512_setr_epi64(0, 2, 4, 6, 8, 10, 12, 14);
+ __m512i imag_idx = _mm512_setr_epi64(1, 3, 5, 7, 9, 11, 13, 15);
+ auto real = _mm512_permutex2var_pd(hi, real_idx, lo);
+ auto imag = _mm512_permutex2var_pd(hi, imag_idx, lo);
+ return { real, imag };
+ }
+ }
+
+ // load_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_loadu_si512((__m512i const*)mem);
+ }
+ template <class A>
+ inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_loadu_ps(mem);
+ }
+ template <class A>
+ inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_loadu_pd(mem);
+ }
+
+ // lt
+ template <class A>
+ inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_ps_mask(self, other, _CMP_LT_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_pd_mask(self, other, _CMP_LT_OQ);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return detail::compare_int_avx512f<A, T, _MM_CMPINT_LT>(self, other);
+ }
+
+ // mask
+ template <class A, class T>
+ inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return self.data;
+ }
+
+ // max
+ template <class A>
+ inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_max_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_max_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_max_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_max_epi64(self, other);
+ }
+ else
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return max(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_max_epu32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_max_epu64(self, other);
+ }
+ else
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return max(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ }
+ }
+
+ // min
+ template <class A>
+ inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_min_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_min_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_min_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_min_epi64(self, other);
+ }
+ else
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return min(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_min_epu32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_min_epu64(self, other);
+ }
+ else
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return min(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ }
+ }
+
+ // mul
+ template <class A>
+ inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_mul_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_mul_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_mullo_epi32(self, other);
+ }
+ else
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return mul(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ }
+
+ // nearbyint
+ template <class A>
+ inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
+ }
+ template <class A>
+ inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_NEAREST_INT, _MM_FROUND_CUR_DIRECTION);
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cvtps_epi32(self);
+ }
+
+ // neg
+ template <class A, class T>
+ inline batch<T, A> neg(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return 0 - self;
+ }
+
+ // neq
+ template <class A>
+ inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_ps_mask(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_cmp_pd_mask(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return ~(self == other);
+ }
+
+ template <class A, class T>
+ inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return register_type(self.data ^ other.data);
+ }
+
+ // reciprocal
+ template <class A>
+ inline batch<float, A>
+ reciprocal(batch<float, A> const& self,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ return _mm512_rcp14_ps(self);
+ }
+
+ template <class A>
+ inline batch<double, A>
+ reciprocal(batch<double, A> const& self,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ return _mm512_rcp14_pd(self);
+ }
+
+ // reduce_add
+ template <class A>
+ inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx512f>) noexcept
+ {
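+ // Fold the four 128-bit quarters pairwise, then finish the horizontal sum
+ // with the sse4_2 kernel.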
+ __m128 tmp1 = _mm512_extractf32x4_ps(rhs, 0);
+ __m128 tmp2 = _mm512_extractf32x4_ps(rhs, 1);
+ __m128 tmp3 = _mm512_extractf32x4_ps(rhs, 2);
+ __m128 tmp4 = _mm512_extractf32x4_ps(rhs, 3);
+ __m128 res1 = _mm_add_ps(tmp1, tmp2);
+ __m128 res2 = _mm_add_ps(tmp3, tmp4);
+ __m128 res3 = _mm_add_ps(res1, res2);
+ return reduce_add(batch<float, sse4_2>(res3), sse4_2 {});
+ }
+ template <class A>
+ inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx512f>) noexcept
+ {
+ __m256d tmp1 = _mm512_extractf64x4_pd(rhs, 1);
+ __m256d tmp2 = _mm512_extractf64x4_pd(rhs, 0);
+ __m256d res1 = _mm256_add_pd(tmp1, tmp2);
+ return reduce_add(batch<double, avx2>(res1), avx2 {});
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline T reduce_add(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ __m256i low, high;
+ detail::split_avx512(self, low, high);
+ batch<T, avx2> blow(low), bhigh(high);
+ return reduce_add(blow, avx2 {}) + reduce_add(bhigh, avx2 {});
+ }
+
+ // reduce_max
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
+ inline T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
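+ // Bring qwords 4-7 down to lanes 0-3 so that max(self, step) covers the
+ // whole register, then reduce the low 256 bits with the avx2 kernel.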
+ constexpr batch_constant<batch<uint64_t, A>, 4, 5, 6, 7, 0, 0, 0, 0> mask;
+ batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+ batch<T, A> acc = max(self, step);
+ __m256i low = _mm512_castsi512_si256(acc);
+ return reduce_max(batch<T, avx2>(low));
+ }
+
+ // reduce_min
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type>
+ inline T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
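+ // Same folding as reduce_max: compare the two 256-bit halves, then reduce
+ // the low half with the avx2 kernel.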
+ constexpr batch_constant<batch<uint64_t, A>, 4, 5, 6, 7, 0, 0, 0, 0> mask;
+ batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+ batch<T, A> acc = min(self, step);
+ __m256i low = _mm512_castsi512_si256(acc);
+ return reduce_min(batch<T, avx2>(low));
+ }
+
+ // rsqrt
+ template <class A>
+ inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_rsqrt14_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_rsqrt14_pd(val);
+ }
+
+ // sadd
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
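+ // Clamp self against the headroom left by other: when other >= 0 only
+ // positive overflow is possible, when other < 0 only negative overflow,
+ // so the final addition can no longer wrap.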
+ auto mask = other < 0;
+ auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
+ auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
+ return other + select(mask, self_neg_branch, self_pos_branch);
+ }
+ else
+ {
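+ // Unsigned saturation: never add more than the distance to the maximum.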
+ const auto diffmax = std::numeric_limits<T>::max() - self;
+ const auto mindiff = min(diffmax, other);
+ return self + mindiff;
+ }
+ }
+
+ // scatter
+ template <class A, class T,
+ class = typename std::enable_if<std::is_same<uint32_t, T>::value || std::is_same<int32_t, T>::value, void>::type>
+ inline void scatter(batch<T, A> const& src, T* dst,
+ batch<int32_t, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ _mm512_i32scatter_epi32(dst, index, src, sizeof(T));
+ }
+
+ template <class A, class T,
+ class = typename std::enable_if<std::is_same<uint64_t, T>::value || std::is_same<int64_t, T>::value, void>::type>
+ inline void scatter(batch<T, A> const& src, T* dst,
+ batch<int64_t, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ _mm512_i64scatter_epi64(dst, index, src, sizeof(T));
+ }
+
+ template <class A>
+ inline void scatter(batch<float, A> const& src, float* dst,
+ batch<int32_t, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ _mm512_i32scatter_ps(dst, index, src, sizeof(float));
+ }
+
+ template <class A>
+ inline void scatter(batch<double, A> const& src, double* dst,
+ batch<int64_t, A> const& index,
+ kernel::requires_arch<avx512f>) noexcept
+ {
+ _mm512_i64scatter_pd(dst, index, src, sizeof(double));
+ }
+
+ // select
+ template <class A>
+ inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_mask_blend_ps(cond, false_br, true_br);
+ }
+ template <class A>
+ inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_mask_blend_pd(cond, false_br, true_br);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ alignas(avx2::alignment()) uint8_t buffer[64];
+ // FIXME: ultra inefficient
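+ // Each of the 64 mask bits is widened to a 0xFF/0x00 byte so that the two
+ // 256-bit halves can reuse the avx2 select below.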
+ for (int i = 0; i < 64; ++i)
+ buffer[i] = cond.data & (1ull << i) ? 0xFF : 0;
+ __m256i cond_low = batch<uint8_t, avx2>::load_aligned(&buffer[0]);
+ __m256i cond_hi = batch<uint8_t, avx2>::load_aligned(&buffer[32]);
+
+ __m256i true_low, true_hi;
+ detail::split_avx512(true_br, true_low, true_hi);
+
+ __m256i false_low, false_hi;
+ detail::split_avx512(false_br, false_low, false_hi);
+
+ __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
+ __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
+ return detail::merge_avx(res_low, res_hi);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
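+ // Materialize each mask bit as a full 16-bit lane: the masked
+ // cvtepi32_epi16 writes 0xFFFF where the bit is set and 0 elsewhere.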
+ __m256i cond_low = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data & 0xFFFF, _mm512_set1_epi32(~0));
+ __m256i cond_hi = _mm512_maskz_cvtepi32_epi16((uint64_t)cond.data >> 16, _mm512_set1_epi32(~0));
+
+ __m256i true_low, true_hi;
+ detail::split_avx512(true_br, true_low, true_hi);
+
+ __m256i false_low, false_hi;
+ detail::split_avx512(false_br, false_low, false_hi);
+
+ __m256i res_low = select(batch_bool<T, avx2>(cond_low), batch<T, avx2>(true_low), batch<T, avx2>(false_low), avx2 {});
+ __m256i res_hi = select(batch_bool<T, avx2>(cond_hi), batch<T, avx2>(true_hi), batch<T, avx2>(false_hi), avx2 {});
+ return detail::merge_avx(res_low, res_hi);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_mask_blend_epi32(cond, false_br, true_br);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_mask_blend_epi64(cond, false_br, true_br);
+ }
+ else
+ {
+ assert(false && "unsupported arch/type combination");
+ return {};
+ }
+ }
+
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept
+ {
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, avx512f {});
+ }
+
+ namespace detail
+ {
+ template <class T>
+ using enable_signed_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value,
+ int>::type;
+
+ template <class T>
+ using enable_unsigned_integer_t = typename std::enable_if<std::is_integral<T>::value && std::is_unsigned<T>::value,
+ int>::type;
+ }
+
+ // set
+ template <class A>
+ inline batch<float, A> set(batch<float, A> const&, requires_arch<avx512f>, float v0, float v1, float v2, float v3, float v4, float v5, float v6, float v7, float v8, float v9, float v10, float v11, float v12, float v13, float v14, float v15) noexcept
+ {
+ return _mm512_setr_ps(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+
+ template <class A>
+ inline batch<double, A> set(batch<double, A> const&, requires_arch<avx512f>, double v0, double v1, double v2, double v3, double v4, double v5, double v6, double v7) noexcept
+ {
+ return _mm512_setr_pd(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ {
+ return _mm512_set_epi64(v7, v6, v5, v4, v3, v2, v1, v0);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ {
+ return _mm512_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+ template <class A, class T, detail::enable_signed_integer_t<T> = 0>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+ {
+#if defined(__clang__) || __GNUC__
+ return __extension__(__m512i)(__v32hi) {
+ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ };
+#else
+ return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+#endif
+ }
+
+ template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+ {
+#if defined(__clang__) || __GNUC__
+ return __extension__(__m512i)(__v32hu) {
+ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31
+ };
+#else
+ return _mm512_set_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+#endif
+ }
+
+ template <class A, class T, detail::enable_signed_integer_t<T> = 0>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
+ T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
+ T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
+ T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
+ T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
+ {
+
+#if defined(__clang__) || __GNUC__
+ return __extension__(__m512i)(__v64qi) {
+ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+ v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+ v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
+ };
+#else
+ return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+ v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+ v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63);
+#endif
+ }
+ template <class A, class T, detail::enable_unsigned_integer_t<T> = 0>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx512f>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7,
+ T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23,
+ T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31,
+ T v32, T v33, T v34, T v35, T v36, T v37, T v38, T v39,
+ T v40, T v41, T v42, T v43, T v44, T v45, T v46, T v47,
+ T v48, T v49, T v50, T v51, T v52, T v53, T v54, T v55,
+ T v56, T v57, T v58, T v59, T v60, T v61, T v62, T v63) noexcept
+ {
+
+#if defined(__clang__) || __GNUC__
+ return __extension__(__m512i)(__v64qu) {
+ v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+ v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+ v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63
+ };
+#else
+ return _mm512_set_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15,
+ v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31,
+ v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47,
+ v48, v49, v50, v51, v52, v53, v54, v55, v56, v57, v58, v59, v60, v61, v62, v63);
+#endif
+ }
+
+ template <class A, class T, class... Values>
+ inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx512f>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<T, A>::size, "consistent init");
+ using register_type = typename batch_bool<T, A>::register_type;
+ register_type r = 0;
+ unsigned shift = 0;
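+ // Expand the parameter pack in order, OR-ing each value into its bit.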
+ (void)std::initializer_list<register_type> { (r |= register_type(values ? 1 : 0) << (shift++))... };
+ return r;
+ }
+
+ // slide_left
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_left(batch<T, A> const&, requires_arch<avx512f>) noexcept
+ {
+ static_assert(N == 0xDEAD, "not implemented yet");
+ return {};
+ }
+
+ // slide_right
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_right(batch<T, A> const&, requires_arch<avx512f>) noexcept
+ {
+ static_assert(N == 0xDEAD, "not implemented yet");
+ return {};
+ }
+
+ // sqrt
+ template <class A>
+ inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_sqrt_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_sqrt_pd(val);
+ }
+
+ // ssub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ return sadd(self, -other);
+ }
+ else
+ {
+ const auto diff = min(self, other);
+ return self - diff;
+ }
+ }
+
+ // store
+ template <class T, class A>
+ inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<avx512f>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ constexpr auto size = batch_bool<T, A>::size;
+ for (std::size_t i = 0; i < size; ++i)
+ mem[i] = self.data & (register_type(1) << i);
+ }
+
+ // store_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_store_si512((__m512i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_store_si512((__m512i*)mem, self);
+ }
+ template <class A>
+ inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_store_ps(mem, self);
+ }
+ template <class A>
+ inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_store_pd(mem, self);
+ }
+
+ // store_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_storeu_si512((__m512i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_storeu_si512((__m512i*)mem, self);
+ }
+ template <class A>
+ inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_storeu_ps(mem, self);
+ }
+ template <class A>
+ inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_storeu_pd(mem, self);
+ }
+
+ // sub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return detail::fwd_to_avx([](__m256i s, __m256i o) noexcept
+ { return sub(batch<T, avx2>(s), batch<T, avx2>(o)); },
+ self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm512_sub_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm512_sub_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_sub_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_sub_pd(self, other);
+ }
+
+ // swizzle
+ template <class A, uint32_t... Vs>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_permutexvar_ps((batch<uint32_t, A>)mask, self);
+ }
+
+ template <class A, uint64_t... Vs>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_permutexvar_pd((batch<uint64_t, A>)mask, self);
+ }
+
+ template <class A, uint64_t... Vs>
+ inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self);
+ }
+
+ template <class A, uint64_t... Vs>
+ inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx512f {}));
+ }
+
+ template <class A, uint32_t... Vs>
+ inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_permutexvar_epi32((batch<uint32_t, A>)mask, self);
+ }
+
+ template <class A, uint32_t... Vs>
+ inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx512f {}));
+ }
+
+ namespace detail
+ {
+ template <class T, class A, T... Idx>
+ struct is_pair_of_contiguous_indices;
+
+ template <class T, class A>
+ struct is_pair_of_contiguous_indices<T, A> : std::true_type
+ {
+ };
+
+ template <class T, class A, T Idx0, T Idx1, T... Idx>
+ struct is_pair_of_contiguous_indices<T, A, Idx0, Idx1, Idx...> : std::conditional<(Idx0 % 2 == 0) && (Idx0 + 1 == Idx1), is_pair_of_contiguous_indices<T, A, Idx...>, std::false_type>::type
+ {
+ };
+
+ template <class A, uint16_t I0, uint16_t I1, uint16_t I2, uint16_t I3, uint16_t I4, uint16_t I5, uint16_t I6, uint16_t I7,
+ uint16_t I8, uint16_t I9, uint16_t I10, uint16_t I11, uint16_t I12, uint16_t I13, uint16_t I14, uint16_t I15,
+ uint16_t I16, uint16_t I17, uint16_t I18, uint16_t I19, uint16_t I20, uint16_t I21, uint16_t I22, uint16_t I23,
+ uint16_t I24, uint16_t I25, uint16_t I26, uint16_t I27, uint16_t I28, uint16_t I29, uint16_t I30, uint16_t I31>
+ struct fold_batch_constant
+ {
+ using type = batch_constant<batch<uint32_t, A>, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2,
+ I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>;
+ };
+
+ }
+
+ template <class A, uint16_t... Idx, class _ = typename std::enable_if<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value, void>::type>
+ inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Idx...>, requires_arch<avx512f>) noexcept
+ {
+ constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32;
+ return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+ }
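+
+        // For instance, the 16-bit swizzle indices (2, 3, 0, 1, 6, 7, 4, 5, ...)
+        // move contiguous pairs of uint16 elements, so detail::fold_batch_constant
+        // rewrites them as the equivalent 32-bit indices (1, 0, 3, 2, ...) and the
+        // whole operation lowers to a single 32-bit permute.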
+
+ template <class A>
+ inline batch<uint16_t, A>
+ swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept
+ {
+        // FIXME: this sequence is very inefficient, but it's here to catch
+        // a pattern generated by detail::reduce from xsimd_generic_math.hpp.
+        // The whole pattern is actually decently folded by GCC and Clang,
+        // so bear with it.
+ constexpr batch_constant<batch<uint32_t, A>, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32;
+ auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self);
+
+ alignas(A::alignment()) uint16_t buffer[32];
+ _mm512_store_si512((__m512i*)&buffer[0], tmp);
+ buffer[0] = buffer[1];
+ return _mm512_load_si512(&buffer[0]);
+ }
+
+ template <class A, uint16_t... Vs>
+ inline batch<int16_t, A>
+ swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept
+ {
+ return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {}));
+ }
+
+ // trunc
+ template <class A>
+ inline batch<float, A>
+ trunc(batch<float, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_round_ps(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
+ }
+ template <class A>
+ inline batch<double, A>
+ trunc(batch<double, A> const& self, requires_arch<avx512f>) noexcept
+ {
+ return _mm512_roundscale_round_pd(self, _MM_FROUND_TO_ZERO, _MM_FROUND_CUR_DIRECTION);
+ }
+
+ // zip_hi
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A>
+ zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ __m512i lo, hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ assert(false && "not implemented yet");
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ assert(false && "not implemented yet");
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ lo = _mm512_unpacklo_epi32(self, other);
+ hi = _mm512_unpackhi_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ lo = _mm512_unpacklo_epi64(self, other);
+ hi = _mm512_unpackhi_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
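+            // The unpack intrinsics interleave within each 128-bit lane only;
+            // reassemble the lanes below so the result holds the zipped high
+            // halves of self and other.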
+ return _mm512_inserti32x4(
+ _mm512_inserti32x4(
+ _mm512_inserti32x4(hi, _mm512_extracti32x4_epi32(lo, 2), 0),
+ _mm512_extracti32x4_epi32(lo, 3),
+ 2),
+ _mm512_extracti32x4_epi32(hi, 2),
+ 1);
+ }
+ template <class A>
+ inline batch<float, A>
+ zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ auto lo = _mm512_unpacklo_ps(self, other);
+ auto hi = _mm512_unpackhi_ps(self, other);
+ return _mm512_insertf32x4(
+ _mm512_insertf32x4(
+ _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0),
+ _mm512_extractf32x4_ps(lo, 3),
+ 2),
+ _mm512_extractf32x4_ps(hi, 2),
+ 1);
+ }
+ template <class A>
+ inline batch<double, A>
+ zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
+ auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
+ return _mm512_castps_pd(_mm512_insertf32x4(
+ _mm512_insertf32x4(
+ _mm512_insertf32x4(hi, _mm512_extractf32x4_ps(lo, 2), 0),
+ _mm512_extractf32x4_ps(lo, 3),
+ 2),
+ _mm512_extractf32x4_ps(hi, 2),
+ 1));
+ }
+
+ // zip_lo
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A>
+ zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ __m512i lo, hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ assert(false && "not implemented yet");
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ assert(false && "not implemented yet");
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ lo = _mm512_unpacklo_epi32(self, other);
+ hi = _mm512_unpackhi_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ lo = _mm512_unpacklo_epi64(self, other);
+ hi = _mm512_unpackhi_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
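+            // As in zip_hi, reorder the 128-bit lanes produced by the in-lane
+            // unpack intrinsics so the result holds the zipped low halves.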
+ return _mm512_inserti32x4(
+ _mm512_inserti32x4(
+ _mm512_inserti32x4(lo, _mm512_extracti32x4_epi32(hi, 0), 1),
+ _mm512_extracti32x4_epi32(hi, 1),
+ 3),
+ _mm512_extracti32x4_epi32(lo, 1),
+ 2);
+ }
+ template <class A>
+ inline batch<float, A>
+ zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ auto lo = _mm512_unpacklo_ps(self, other);
+ auto hi = _mm512_unpackhi_ps(self, other);
+ return _mm512_insertf32x4(
+ _mm512_insertf32x4(
+ _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1),
+ _mm512_extractf32x4_ps(hi, 1),
+ 3),
+ _mm512_extractf32x4_ps(lo, 1),
+ 2);
+ }
+ template <class A>
+ inline batch<double, A>
+ zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx512f>) noexcept
+ {
+ auto lo = _mm512_castpd_ps(_mm512_unpacklo_pd(self, other));
+ auto hi = _mm512_castpd_ps(_mm512_unpackhi_pd(self, other));
+ return _mm512_castps_pd(_mm512_insertf32x4(
+ _mm512_insertf32x4(
+ _mm512_insertf32x4(lo, _mm512_extractf32x4_ps(hi, 0), 1),
+ _mm512_extractf32x4_ps(hi, 1),
+ 3),
+ _mm512_extractf32x4_ps(lo, 1),
+ 2));
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp
new file mode 100644
index 0000000000..1ae77e8c7d
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_constants.hpp
@@ -0,0 +1,384 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NUMERICAL_CONSTANT_HPP
+#define XSIMD_NUMERICAL_CONSTANT_HPP
+
+#include <limits>
+
+#include "../types/xsimd_utils.hpp"
+
+namespace xsimd
+{
+
+ namespace constants
+ {
+
+#define XSIMD_DEFINE_CONSTANT(NAME, SINGLE, DOUBLE) \
+ template <class T> \
+ inline T NAME() noexcept \
+ { \
+ return T(NAME<typename T::value_type>()); \
+ } \
+ template <> \
+ inline float NAME<float>() noexcept \
+ { \
+ return SINGLE; \
+ } \
+ template <> \
+ inline double NAME<double>() noexcept \
+ { \
+ return DOUBLE; \
+ }
+
+#define XSIMD_DEFINE_CONSTANT_HEX(NAME, SINGLE, DOUBLE) \
+ template <class T> \
+ inline T NAME() noexcept \
+ { \
+ return T(NAME<typename T::value_type>()); \
+ } \
+ template <> \
+ inline float NAME<float>() noexcept \
+ { \
+ return bit_cast<float>((uint32_t)SINGLE); \
+ } \
+ template <> \
+ inline double NAME<double>() noexcept \
+ { \
+ return bit_cast<double>((uint64_t)DOUBLE); \
+ }
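+
+        // Usage sketch: once instantiated below, constants::pi<float>() yields the
+        // float closest to pi (bit pattern 0x40490fdb), and constants::pi<batch<float, A>>()
+        // broadcasts that scalar through the generic overload above.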
+
+ XSIMD_DEFINE_CONSTANT(infinity, (std::numeric_limits<float>::infinity()), (std::numeric_limits<double>::infinity()))
+ XSIMD_DEFINE_CONSTANT(invlog_2, 1.442695040888963407359924681001892137426645954152986f, 1.442695040888963407359924681001892137426645954152986)
+ XSIMD_DEFINE_CONSTANT_HEX(invlog_2hi, 0x3fb8b000, 0x3ff7154765200000)
+ XSIMD_DEFINE_CONSTANT_HEX(invlog_2lo, 0xb9389ad4, 0x3de705fc2eefa200)
+ XSIMD_DEFINE_CONSTANT(invlog10_2, 3.32192809488736234787031942949f, 3.32192809488736234787031942949)
+ XSIMD_DEFINE_CONSTANT_HEX(invpi, 0x3ea2f983, 0x3fd45f306dc9c883)
+ XSIMD_DEFINE_CONSTANT(log_2, 0.6931471805599453094172321214581765680755001343602553f, 0.6931471805599453094172321214581765680755001343602553)
+ XSIMD_DEFINE_CONSTANT_HEX(log_2hi, 0x3f318000, 0x3fe62e42fee00000)
+ XSIMD_DEFINE_CONSTANT_HEX(log_2lo, 0xb95e8083, 0x3dea39ef35793c76)
+ XSIMD_DEFINE_CONSTANT_HEX(log10_2hi, 0x3e9a0000, 0x3fd3440000000000)
+ XSIMD_DEFINE_CONSTANT_HEX(log10_2lo, 0x39826a14, 0x3ed3509f79fef312)
+ XSIMD_DEFINE_CONSTANT_HEX(logeps, 0xc17f1402, 0xc04205966f2b4f12)
+ XSIMD_DEFINE_CONSTANT_HEX(logpi, 0x3f928682, 0x3ff250d048e7a1bd)
+ XSIMD_DEFINE_CONSTANT_HEX(logsqrt2pi, 0x3f6b3f8e, 0x3fed67f1c864beb5)
+ XSIMD_DEFINE_CONSTANT(maxflint, 16777216.0f, 9007199254740992.0)
+ XSIMD_DEFINE_CONSTANT(maxlog, 88.3762626647949f, 709.78271289338400)
+ XSIMD_DEFINE_CONSTANT(maxlog2, 127.0f, 1023.)
+ XSIMD_DEFINE_CONSTANT(maxlog10, 38.23080825805664f, 308.2547155599167)
+ XSIMD_DEFINE_CONSTANT_HEX(mediumpi, 0x43490fdb, 0x412921fb54442d18)
+ XSIMD_DEFINE_CONSTANT(minlog, -88.3762626647949f, -708.3964185322641)
+ XSIMD_DEFINE_CONSTANT(minlog2, -127.0f, -1023.)
+ XSIMD_DEFINE_CONSTANT(minlog10, -37.89999771118164f, -308.2547155599167)
+ XSIMD_DEFINE_CONSTANT(minusinfinity, (-infinity<float>()), (-infinity<double>()))
+ XSIMD_DEFINE_CONSTANT(minuszero, -0.0f, -0.0)
+ XSIMD_DEFINE_CONSTANT_HEX(nan, 0xffffffff, 0xffffffffffffffff)
+ XSIMD_DEFINE_CONSTANT_HEX(oneosqrteps, 0x453504f3, 0x4190000000000000)
+ XSIMD_DEFINE_CONSTANT_HEX(oneotwoeps, 0x4a800000, 0x4320000000000000)
+ XSIMD_DEFINE_CONSTANT_HEX(pi, 0x40490fdb, 0x400921fb54442d18)
+ XSIMD_DEFINE_CONSTANT_HEX(pio_2lo, 0xb33bbd2e, 0x3c91a62633145c07)
+ XSIMD_DEFINE_CONSTANT_HEX(pio_4lo, 0xb2bbbd2e, 0x3c81a62633145c07)
+ XSIMD_DEFINE_CONSTANT_HEX(pio2, 0x3fc90fdb, 0x3ff921fb54442d18)
+ XSIMD_DEFINE_CONSTANT_HEX(pio2_1, 0x3fc90f80, 0x3ff921fb54400000)
+ XSIMD_DEFINE_CONSTANT_HEX(pio2_1t, 0x37354443, 0x3dd0b4611a626331)
+ XSIMD_DEFINE_CONSTANT_HEX(pio2_2, 0x37354400, 0x3dd0b4611a600000)
+ XSIMD_DEFINE_CONSTANT_HEX(pio2_2t, 0x2e85a308, 0x3ba3198a2e037073)
+ XSIMD_DEFINE_CONSTANT_HEX(pio2_3, 0x2e85a300, 0x3ba3198a2e000000)
+ XSIMD_DEFINE_CONSTANT_HEX(pio2_3t, 0x248d3132, 0x397b839a252049c1)
+ XSIMD_DEFINE_CONSTANT_HEX(pio4, 0x3f490fdb, 0x3fe921fb54442d18)
+ XSIMD_DEFINE_CONSTANT_HEX(signmask, 0x80000000, 0x8000000000000000)
+ XSIMD_DEFINE_CONSTANT(smallestposval, std::numeric_limits<float>::min(), std::numeric_limits<double>::min())
+ XSIMD_DEFINE_CONSTANT_HEX(sqrt_2pi, 0x40206c99, 0x40040d931ff62704)
+ XSIMD_DEFINE_CONSTANT_HEX(sqrteps, 0x39b504f3, 0x3e50000000000000)
+ XSIMD_DEFINE_CONSTANT_HEX(tanpio8, 0x3ed413cd, 0x3fda827999fcef31)
+ XSIMD_DEFINE_CONSTANT_HEX(tan3pio8, 0x401a827a, 0x4003504f333f9de6)
+ XSIMD_DEFINE_CONSTANT_HEX(twentypi, 0x427b53d1, 0x404f6a7a2955385e)
+ XSIMD_DEFINE_CONSTANT_HEX(twoopi, 0x3f22f983, 0x3fe45f306dc9c883)
+ XSIMD_DEFINE_CONSTANT(twotonmb, 8388608.0f, 4503599627370496.0)
+ XSIMD_DEFINE_CONSTANT_HEX(twotonmbo3, 0x3ba14518, 0x3ed428a2f98d7286)
+
+#undef XSIMD_DEFINE_CONSTANT
+#undef XSIMD_DEFINE_CONSTANT_HEX
+
+ template <class T>
+ constexpr T allbits() noexcept;
+
+ template <class T>
+ constexpr as_integer_t<T> mask1frexp() noexcept;
+
+ template <class T>
+ constexpr as_integer_t<T> mask2frexp() noexcept;
+
+ template <class T>
+ constexpr as_integer_t<T> maxexponent() noexcept;
+
+ template <class T>
+ constexpr as_integer_t<T> maxexponentm1() noexcept;
+
+ template <class T>
+ constexpr int32_t nmb() noexcept;
+
+ template <class T>
+ constexpr T zero() noexcept;
+
+ template <class T>
+ constexpr T minvalue() noexcept;
+
+ template <class T>
+ constexpr T maxvalue() noexcept;
+
+ /**************************
+ * allbits implementation *
+ **************************/
+
+ namespace detail
+ {
+ template <class T, bool = std::is_integral<T>::value>
+ struct allbits_impl
+ {
+ static constexpr T get_value() noexcept
+ {
+ return T(~0);
+ }
+ };
+
+ template <class T>
+ struct allbits_impl<T, false>
+ {
+ static constexpr T get_value() noexcept
+ {
+ return nan<T>();
+ }
+ };
+ }
+
+ template <class T>
+ inline constexpr T allbits() noexcept
+ {
+ return T(detail::allbits_impl<typename T::value_type>::get_value());
+ }
+
+ /*****************************
+ * mask1frexp implementation *
+ *****************************/
+
+ template <class T>
+ inline constexpr as_integer_t<T> mask1frexp() noexcept
+ {
+ return as_integer_t<T>(mask1frexp<typename T::value_type>());
+ }
+
+ template <>
+ inline constexpr int32_t mask1frexp<float>() noexcept
+ {
+ return 0x7f800000;
+ }
+
+ template <>
+ inline constexpr int64_t mask1frexp<double>() noexcept
+ {
+ return 0x7ff0000000000000;
+ }
+
+ /*****************************
+ * mask2frexp implementation *
+ *****************************/
+
+ template <class T>
+ inline constexpr as_integer_t<T> mask2frexp() noexcept
+ {
+ return as_integer_t<T>(mask2frexp<typename T::value_type>());
+ }
+
+ template <>
+ inline constexpr int32_t mask2frexp<float>() noexcept
+ {
+ return 0x3f000000;
+ }
+
+ template <>
+ inline constexpr int64_t mask2frexp<double>() noexcept
+ {
+ return 0x3fe0000000000000;
+ }
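+
+        // For IEEE-754 floats, mask1frexp selects the exponent bits and mask2frexp
+        // is the bit pattern of 0.5; e.g. for a normalized positive float x,
+        //   bit_cast<float>((bit_cast<int32_t>(x) & ~mask1frexp<float>()) | mask2frexp<float>())
+        // yields its mantissa scaled into [0.5, 1), as a frexp kernel would compute it.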
+
+ /******************************
+ * maxexponent implementation *
+ ******************************/
+
+ template <class T>
+ inline constexpr as_integer_t<T> maxexponent() noexcept
+ {
+ return as_integer_t<T>(maxexponent<typename T::value_type>());
+ }
+
+ template <>
+ inline constexpr int32_t maxexponent<float>() noexcept
+ {
+ return 127;
+ }
+
+ template <>
+ inline constexpr int64_t maxexponent<double>() noexcept
+ {
+ return 1023;
+ }
+
+        /********************************
+         * maxexponentm1 implementation *
+         ********************************/
+
+ template <class T>
+ inline constexpr as_integer_t<T> maxexponentm1() noexcept
+ {
+ return as_integer_t<T>(maxexponentm1<typename T::value_type>());
+ }
+
+ template <>
+ inline constexpr int32_t maxexponentm1<float>() noexcept
+ {
+ return 126;
+ }
+
+ template <>
+ inline constexpr int64_t maxexponentm1<double>() noexcept
+ {
+ return 1022;
+ }
+
+ /**********************
+ * nmb implementation *
+ **********************/
+
+ template <class T>
+ inline constexpr int32_t nmb() noexcept
+ {
+ return nmb<typename T::value_type>();
+ }
+
+ template <>
+ inline constexpr int32_t nmb<float>() noexcept
+ {
+ return 23;
+ }
+
+ template <>
+ inline constexpr int32_t nmb<double>() noexcept
+ {
+ return 52;
+ }
+
+ /***********************
+ * zero implementation *
+ ***********************/
+
+ template <class T>
+ inline constexpr T zero() noexcept
+ {
+ return T(typename T::value_type(0));
+ }
+
+ /***************************
+ * minvalue implementation *
+ ***************************/
+
+ namespace detail
+ {
+ template <class T>
+ struct minvalue_impl
+ {
+ static constexpr T get_value() noexcept
+ {
+ return std::numeric_limits<typename T::value_type>::min();
+ }
+ };
+
+ template <class T>
+ struct minvalue_common
+ {
+ static constexpr T get_value() noexcept
+ {
+ return std::numeric_limits<T>::min();
+ }
+ };
+
+ template <>
+ struct minvalue_impl<int8_t> : minvalue_common<int8_t>
+ {
+ };
+ template <>
+ struct minvalue_impl<uint8_t> : minvalue_common<uint8_t>
+ {
+ };
+ template <>
+ struct minvalue_impl<int16_t> : minvalue_common<int16_t>
+ {
+ };
+ template <>
+ struct minvalue_impl<uint16_t> : minvalue_common<uint16_t>
+ {
+ };
+ template <>
+ struct minvalue_impl<int32_t> : minvalue_common<int32_t>
+ {
+ };
+ template <>
+ struct minvalue_impl<uint32_t> : minvalue_common<uint32_t>
+ {
+ };
+ template <>
+ struct minvalue_impl<int64_t> : minvalue_common<int64_t>
+ {
+ };
+ template <>
+ struct minvalue_impl<uint64_t> : minvalue_common<uint64_t>
+ {
+ };
+
+ template <>
+ struct minvalue_impl<float>
+ {
+ static float get_value() noexcept
+ {
+ return bit_cast<float>((uint32_t)0xff7fffff);
+ }
+ };
+
+ template <>
+ struct minvalue_impl<double>
+ {
+ static double get_value() noexcept
+ {
+ return bit_cast<double>((uint64_t)0xffefffffffffffff);
+ }
+ };
+ }
+
+ template <class T>
+ inline constexpr T minvalue() noexcept
+ {
+ return T(detail::minvalue_impl<typename T::value_type>::get_value());
+ }
+
+ /***************************
+ * maxvalue implementation *
+ ***************************/
+
+ template <class T>
+ inline constexpr T maxvalue() noexcept
+ {
+ return T(std::numeric_limits<typename T::value_type>::max());
+ }
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp
new file mode 100644
index 0000000000..64e9ed65d1
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx.hpp
@@ -0,0 +1,80 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA3_AVX_HPP
+#define XSIMD_FMA3_AVX_HPP
+
+#include "../types/xsimd_fma3_avx_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
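+        // Semantics of the single-rounding kernels below:
+        //   fma(x, y, z)  =  x * y + z      fms(x, y, z)  =  x * y - z
+        //   fnma(x, y, z) = -(x * y) + z    fnms(x, y, z) = -(x * y) - z
+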
+ // fnma
+ template <class A>
+ inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+ {
+ return _mm256_fnmadd_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+ {
+ return _mm256_fnmadd_pd(x, y, z);
+ }
+
+ // fnms
+ template <class A>
+ inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+ {
+ return _mm256_fnmsub_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+ {
+ return _mm256_fnmsub_pd(x, y, z);
+ }
+
+ // fma
+ template <class A>
+ inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+ {
+ return _mm256_fmadd_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+ {
+ return _mm256_fmadd_pd(x, y, z);
+ }
+
+ // fms
+ template <class A>
+ inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<avx>>) noexcept
+ {
+ return _mm256_fmsub_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<avx>>) noexcept
+ {
+ return _mm256_fmsub_pd(x, y, z);
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp
new file mode 100644
index 0000000000..134053951a
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_avx2.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA3_AVX2_HPP
+#define XSIMD_FMA3_AVX2_HPP
+
+#include "../types/xsimd_fma3_avx2_register.hpp"
+
+// Allow re-inclusion of xsimd_fma3_avx.hpp below, even if it was already included
+#ifdef XSIMD_FMA3_AVX_HPP
+#undef XSIMD_FMA3_AVX_HPP
+#define XSIMD_FORCE_FMA3_AVX_HPP
+#endif
+
+// Disallow inclusion of ./xsimd_fma3_avx_register.hpp
+#ifndef XSIMD_FMA3_AVX_REGISTER_HPP
+#define XSIMD_FMA3_AVX_REGISTER_HPP
+#define XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
+#endif
+
+// Include ./xsimd_fma3_avx.hpp with the avx token substituted by avx2, so every
+// kernel in it is redeclared for requires_arch<fma3<avx2>>
+#define avx avx2
+#include "./xsimd_fma3_avx.hpp"
+#undef avx
+#undef XSIMD_FMA3_AVX_HPP
+
+// Carefully restore guards
+#ifdef XSIMD_FORCE_FMA3_AVX_HPP
+#define XSIMD_FMA3_AVX_HPP
+#undef XSIMD_FORCE_FMA3_AVX_HPP
+#endif
+
+#ifdef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
+#undef XSIMD_FMA3_AVX_REGISTER_HPP
+#undef XSIMD_FORCE_FMA3_AVX_REGISTER_HPP
+#endif
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp
new file mode 100644
index 0000000000..55c38f13a4
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma3_sse.hpp
@@ -0,0 +1,79 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA3_SSE_HPP
+#define XSIMD_FMA3_SSE_HPP
+
+#include "../types/xsimd_fma3_sse_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+ // fnma
+ template <class A>
+ inline batch<float, A> fnma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ {
+ return _mm_fnmadd_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fnma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ {
+ return _mm_fnmadd_pd(x, y, z);
+ }
+
+ // fnms
+ template <class A>
+ inline batch<float, A> fnms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ {
+ return _mm_fnmsub_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fnms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ {
+ return _mm_fnmsub_pd(x, y, z);
+ }
+
+ // fma
+ template <class A>
+ inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ {
+ return _mm_fmadd_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ {
+ return _mm_fmadd_pd(x, y, z);
+ }
+
+ // fms
+ template <class A>
+ inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ {
+ return _mm_fmsub_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<fma3<sse4_2>>) noexcept
+ {
+ return _mm_fmsub_pd(x, y, z);
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp
new file mode 100644
index 0000000000..6a97d711e9
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_fma4.hpp
@@ -0,0 +1,79 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_FMA4_HPP
+#define XSIMD_FMA4_HPP
+
+#include "../types/xsimd_fma4_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // fnma
+ template <class A>
+ inline batch<float, A> fnma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+ {
+ return _mm_nmacc_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fnma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+ {
+ return _mm_nmacc_pd(x, y, z);
+ }
+
+ // fnms
+ template <class A>
+ inline batch<float, A> fnms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+ {
+ return _mm_nmsub_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fnms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+ {
+ return _mm_nmsub_pd(x, y, z);
+ }
+
+ // fma
+ template <class A>
+ inline batch<float, A> fma(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+ {
+ return _mm_macc_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fma(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+ {
+ return _mm_macc_pd(x, y, z);
+ }
+
+ // fms
+ template <class A>
+ inline batch<float, A> fms(simd_register<float, A> const& x, simd_register<float, A> const& y, simd_register<float, A> const& z, requires_arch<fma4>) noexcept
+ {
+ return _mm_msub_ps(x, y, z);
+ }
+
+ template <class A>
+ inline batch<double, A> fms(simd_register<double, A> const& x, simd_register<double, A> const& y, simd_register<double, A> const& z, requires_arch<fma4>) noexcept
+ {
+ return _mm_msub_pd(x, y, z);
+ }
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp
new file mode 100644
index 0000000000..6403cfb0fc
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_generic.hpp
@@ -0,0 +1,23 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_HPP
+#define XSIMD_GENERIC_HPP
+
+#include "./generic/xsimd_generic_arithmetic.hpp"
+#include "./generic/xsimd_generic_complex.hpp"
+#include "./generic/xsimd_generic_logical.hpp"
+#include "./generic/xsimd_generic_math.hpp"
+#include "./generic/xsimd_generic_memory.hpp"
+#include "./generic/xsimd_generic_rounding.hpp"
+#include "./generic/xsimd_generic_trigo.hpp"
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp
new file mode 100644
index 0000000000..86e398a5ea
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_generic_fwd.hpp
@@ -0,0 +1,38 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_FWD_HPP
+#define XSIMD_GENERIC_FWD_HPP
+
+#include "../types/xsimd_batch_constant.hpp"
+
+#include <type_traits>
+
+namespace xsimd
+{
+ namespace kernel
+ {
+ // forward declaration
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept;
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+ template <class A, class T>
+ inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept;
+
+ }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp
new file mode 100644
index 0000000000..cf0f796a1e
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp
@@ -0,0 +1,86 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_ISA_HPP
+#define XSIMD_ISA_HPP
+
+#include "../config/xsimd_arch.hpp"
+
+#include "./xsimd_generic_fwd.hpp"
+
+#if XSIMD_WITH_SSE2
+#include "./xsimd_sse2.hpp"
+#endif
+
+#if XSIMD_WITH_SSE3
+#include "./xsimd_sse3.hpp"
+#endif
+
+#if XSIMD_WITH_SSSE3
+#include "./xsimd_ssse3.hpp"
+#endif
+
+#if XSIMD_WITH_SSE4_1
+#include "./xsimd_sse4_1.hpp"
+#endif
+
+#if XSIMD_WITH_SSE4_2
+#include "./xsimd_sse4_2.hpp"
+#endif
+
+#if XSIMD_WITH_FMA3_SSE
+#include "./xsimd_fma3_sse.hpp"
+#endif
+
+#if XSIMD_WITH_FMA4
+#include "./xsimd_fma4.hpp"
+#endif
+
+#if XSIMD_WITH_AVX
+#include "./xsimd_avx.hpp"
+#endif
+
+#if XSIMD_WITH_FMA3_AVX
+#include "./xsimd_fma3_avx.hpp"
+#endif
+
+#if XSIMD_WITH_AVX2
+#include "./xsimd_avx2.hpp"
+#endif
+
+#if XSIMD_WITH_FMA3_AVX2
+#include "./xsimd_fma3_avx2.hpp"
+#endif
+
+#if XSIMD_WITH_AVX512F
+#include "./xsimd_avx512f.hpp"
+#endif
+
+#if XSIMD_WITH_AVX512BW
+#include "./xsimd_avx512bw.hpp"
+#endif
+
+#if XSIMD_WITH_NEON
+#include "./xsimd_neon.hpp"
+#endif
+
+#if XSIMD_WITH_NEON64
+#include "./xsimd_neon64.hpp"
+#endif
+
+#if XSIMD_WITH_SVE
+#include "./xsimd_sve.hpp"
+#endif
+
+// Must come last to have access to all conversion specializations.
+#include "./xsimd_generic.hpp"
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
new file mode 100644
index 0000000000..57c662cd63
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
@@ -0,0 +1,2670 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NEON_HPP
+#define XSIMD_NEON_HPP
+
+#include <algorithm>
+#include <complex>
+#include <tuple>
+#include <type_traits>
+
+#include "../types/xsimd_neon_register.hpp"
+#include "../types/xsimd_utils.hpp"
+
+// Wrap intrinsics so we can pass them as function pointers
+// - OP: intrinsics name prefix, e.g., vorrq
+// - RT: type traits to deduce intrinsics return types
+#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+ namespace wrap \
+ { \
+ inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \
+ { \
+ return ::OP##_u8(a, b); \
+ } \
+ inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
+ { \
+ return ::OP##_s8(a, b); \
+ } \
+ inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
+ { \
+ return ::OP##_u16(a, b); \
+ } \
+ inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
+ { \
+ return ::OP##_s16(a, b); \
+ } \
+ inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
+ { \
+ return ::OP##_u32(a, b); \
+ } \
+ inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
+ { \
+ return ::OP##_s32(a, b); \
+ } \
+ }
+
+#define WRAP_BINARY_INT(OP, RT) \
+ WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+ namespace wrap \
+ { \
+ inline RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \
+ { \
+ return ::OP##_u64(a, b); \
+ } \
+ inline RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept \
+ { \
+ return ::OP##_s64(a, b); \
+ } \
+ }
+
+#define WRAP_BINARY_FLOAT(OP, RT) \
+ namespace wrap \
+ { \
+ inline RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \
+ { \
+ return ::OP##_f32(a, b); \
+ } \
+ }
+
+#define WRAP_UNARY_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ inline uint8x16_t OP##_u8(uint8x16_t a) noexcept \
+ { \
+ return ::OP##_u8(a); \
+ } \
+ inline int8x16_t OP##_s8(int8x16_t a) noexcept \
+ { \
+ return ::OP##_s8(a); \
+ } \
+ inline uint16x8_t OP##_u16(uint16x8_t a) noexcept \
+ { \
+ return ::OP##_u16(a); \
+ } \
+ inline int16x8_t OP##_s16(int16x8_t a) noexcept \
+ { \
+ return ::OP##_s16(a); \
+ } \
+ inline uint32x4_t OP##_u32(uint32x4_t a) noexcept \
+ { \
+ return ::OP##_u32(a); \
+ } \
+ inline int32x4_t OP##_s32(int32x4_t a) noexcept \
+ { \
+ return ::OP##_s32(a); \
+ } \
+ }
+
+#define WRAP_UNARY_INT(OP) \
+ WRAP_UNARY_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ inline uint64x2_t OP##_u64(uint64x2_t a) noexcept \
+ { \
+ return ::OP##_u64(a); \
+ } \
+ inline int64x2_t OP##_s64(int64x2_t a) noexcept \
+ { \
+ return ::OP##_s64(a); \
+ } \
+ }
+
+#define WRAP_UNARY_FLOAT(OP) \
+ namespace wrap \
+ { \
+ inline float32x4_t OP##_f32(float32x4_t a) noexcept \
+ { \
+ return ::OP##_f32(a); \
+ } \
+ }
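+
+// For instance, WRAP_BINARY_INT(vaddq, detail::identity_return_type) below
+// generates plain functions wrap::vaddq_u8, wrap::vaddq_s8, ..., wrap::vaddq_s64
+// that can be stored in the dispatcher tuples defined later; the raw intrinsics
+// cannot always be used that way, since they may be compiler builtins or macros.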
+
+// Dummy identity casters to ease generic coding
+inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; }
+inline int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; }
+inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; }
+inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; }
+inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; }
+inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; }
+inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; }
+inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; }
+inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; }
+
+namespace xsimd
+{
+ template <class batch_type, bool... Values>
+ struct batch_bool_constant;
+
+ namespace kernel
+ {
+ using namespace types;
+
+ namespace detail
+ {
+ template <template <class> class return_type, class... T>
+ struct neon_dispatcher_base
+ {
+ struct unary
+ {
+ using container_type = std::tuple<return_type<T> (*)(T)...>;
+ const container_type m_func;
+
+ template <class U>
+ return_type<U> apply(U rhs) const noexcept
+ {
+ using func_type = return_type<U> (*)(U);
+ auto func = xsimd::detail::get<func_type>(m_func);
+ return func(rhs);
+ }
+ };
+
+ struct binary
+ {
+ using container_type = std::tuple<return_type<T> (*)(T, T)...>;
+ const container_type m_func;
+
+ template <class U>
+ return_type<U> apply(U lhs, U rhs) const noexcept
+ {
+ using func_type = return_type<U> (*)(U, U);
+ auto func = xsimd::detail::get<func_type>(m_func);
+ return func(lhs, rhs);
+ }
+ };
+ };
+
+ /***************************
+ * arithmetic dispatchers *
+ ***************************/
+
+ template <class T>
+ using identity_return_type = T;
+
+ template <class... T>
+ struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...>
+ {
+ };
+
+ using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t>;
+
+ using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ float32x4_t>;
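+
+            // Usage sketch: a kernel builds a dispatcher from the wrap::
+            // functions, e.g.
+            //   const neon_dispatcher::binary d = { std::make_tuple(wrap::vaddq_u8, /* ... */, wrap::vaddq_f32) };
+            //   d.apply(lhs, rhs);
+            // and apply() picks the tuple entry whose signature matches the
+            // deduced register type, as the kernels below demonstrate.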
+
+ /**************************
+ * comparison dispatchers *
+ **************************/
+
+ template <class T>
+ struct comp_return_type_impl;
+
+ template <>
+ struct comp_return_type_impl<uint8x16_t>
+ {
+ using type = uint8x16_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<int8x16_t>
+ {
+ using type = uint8x16_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<uint16x8_t>
+ {
+ using type = uint16x8_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<int16x8_t>
+ {
+ using type = uint16x8_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<uint32x4_t>
+ {
+ using type = uint32x4_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<int32x4_t>
+ {
+ using type = uint32x4_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<uint64x2_t>
+ {
+ using type = uint64x2_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<int64x2_t>
+ {
+ using type = uint64x2_t;
+ };
+
+ template <>
+ struct comp_return_type_impl<float32x4_t>
+ {
+ using type = uint32x4_t;
+ };
+
+ template <class T>
+ using comp_return_type = typename comp_return_type_impl<T>::type;
+
+ template <class... T>
+ struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...>
+ {
+ };
+
+ using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ float32x4_t>;
+
+ /**************************************
+ * enabling / disabling metafunctions *
+ **************************************/
+
+ template <class T>
+ using enable_neon_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value,
+ int>::type;
+
+ template <class T>
+ using exclude_int64_neon_t
+ = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>::type;
+ }
+
+ /*************
+ * broadcast *
+ *************/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_u8(uint8_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_s8(int8_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_u16(uint16_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_s16(int16_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_u32(uint32_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_s32(int32_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_u64(uint64_t(val));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_s64(int64_t(val));
+ }
+
+ template <class A>
+ inline batch<float, A> broadcast(float val, requires_arch<neon>) noexcept
+ {
+ return vdupq_n_f32(val);
+ }
+
+ /*******
+ * set *
+ *******/
+
+ template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept
+ {
+ return xsimd::types::detail::neon_vector_type<T> { args... };
+ }
+
+ template <class A, class T, class... Args, detail::enable_integral_t<T> = 0>
+ inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ using unsigned_type = as_unsigned_integer_t<T>;
+ return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
+ }
+
+ template <class A>
+ inline batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept
+ {
+ return float32x4_t { f0, f1, f2, f3 };
+ }
+
+ template <class A>
+ inline batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>,
+ std::complex<float> c0, std::complex<float> c1,
+ std::complex<float> c2, std::complex<float> c3) noexcept
+ {
+ return batch<std::complex<float>>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() },
+ float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() });
+ }
+
+ template <class A, class... Args>
+ inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept
+ {
+ using register_type = typename batch_bool<float, A>::register_type;
+ using unsigned_type = as_unsigned_integer_t<float>;
+ return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... };
+ }
+
+ /*************
+ * from_bool *
+ *************/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_u8(arg, vdupq_n_u8(1));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_u16(arg, vdupq_n_u16(1));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_u32(arg, vdupq_n_u32(1));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_u64(arg, vdupq_n_u64(1));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1));
+ }
+
+ template <class A>
+ inline batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f))));
+ }
+
+ /********
+ * load *
+ ********/
+
+ // It is not possible to use a call to A::alignment() here, so use an
+ // immediate instead.
+#if defined(__clang__) || defined(__GNUC__)
+#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
+#elif defined(_MSC_VER)
+#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
+#else
+#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
+#endif
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_u8, uint8_t*, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_s8, int8_t*, src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_u16, uint16_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_s16, int16_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_u32, uint32_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_s32, int32_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_u64, uint64_t*, src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_s64, int64_t*, src);
+ }
+
+ template <class A>
+ inline batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_f32, float*, src);
+ }
+
+#undef xsimd_aligned_load
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_u8((uint8_t*)src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_s8((int8_t*)src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_u16((uint16_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_s16((int16_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_u32((uint32_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_s32((int32_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_u64((uint64_t*)src);
+ }
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept
+ {
+ return vld1q_s64((int64_t*)src);
+ }
+
+ template <class A>
+ inline batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept
+ {
+ return vld1q_f32(src);
+ }
+
+ /*********
+ * store *
+ *********/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_u8((uint8_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_s8((int8_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_u16((uint16_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_s16((int16_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_u32((uint32_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_s32((int32_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_u64((uint64_t*)dst, src);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_s64((int64_t*)dst, src);
+ }
+
+ template <class A>
+ inline void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept
+ {
+ vst1q_f32(dst, src);
+ }
+
+ template <class A, class T>
+ inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept
+ {
+ store_aligned<A>(dst, src, A {});
+ }
+
+ /****************
+ * load_complex *
+ ****************/
+
+ template <class A>
+ inline batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept
+ {
+ using real_batch = batch<float, A>;
+ const float* buf = reinterpret_cast<const float*>(mem);
+ float32x4x2_t tmp = vld2q_f32(buf);
+ real_batch real = tmp.val[0],
+ imag = tmp.val[1];
+ return batch<std::complex<float>, A> { real, imag };
+ }
+
+ template <class A>
+ inline batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept
+ {
+ return load_complex_aligned<A>(mem, cvt, A {});
+ }
+
+ /*****************
+ * store_complex *
+ *****************/
+
+ template <class A>
+ inline void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
+ {
+ float32x4x2_t tmp;
+ tmp.val[0] = src.real();
+ tmp.val[1] = src.imag();
+ float* buf = reinterpret_cast<float*>(dst);
+ vst2q_f32(buf, tmp);
+ }
+
+ template <class A>
+ inline void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept
+ {
+ store_complex_aligned(dst, src, A {});
+ }
+
+ /*******
+ * neg *
+ *******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs)));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vnegq_s8(rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs)));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vnegq_s16(rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs)));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vnegq_s32(rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return batch<T, A> { -rhs.get(0), -rhs.get(1) };
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return batch<T, A> { -rhs.get(0), -rhs.get(1) };
+ }
+
+ template <class A>
+ inline batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vnegq_f32(rhs);
+ }
+
+ /*******
+ * add *
+ *******/
+
+ WRAP_BINARY_INT(vaddq, detail::identity_return_type)
+ WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type)
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16,
+ wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64,
+ wrap::vaddq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /********
+ * sadd *
+ ********/
+
+ WRAP_BINARY_INT(vqaddq, detail::identity_return_type)
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16,
+ wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64,
+ wrap::vaddq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /*******
+ * sub *
+ *******/
+
+ WRAP_BINARY_INT(vsubq, detail::identity_return_type)
+ WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type)
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16,
+ wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64,
+ wrap::vsubq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /********
+ * ssub *
+ ********/
+
+ WRAP_BINARY_INT(vqsubq, detail::identity_return_type)
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16,
+ wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64,
+ wrap::vsubq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /*******
+ * mul *
+ *******/
+
+ WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type)
+ WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type)
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16,
+ wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /*******
+ * div *
+ *******/
+
+#if defined(XSIMD_FAST_INTEGER_DIVISION)
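+ // note: this fast path converts through float32, so it is only exact while the
+ // operands fit in float's 24-bit significand; hence the opt-in macro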
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs));
+ }
+#endif
+
+ template <class A>
+ inline batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ // from Stack Overflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html
+ // get an initial estimate of 1/b.
+ float32x4_t rcp = reciprocal(rhs);
+
+ // use a couple of Newton-Raphson steps to refine the estimate. Depending on your
+ // application's accuracy requirements, you may be able to get away with only
+ // one refinement (instead of the two used here). Be sure to test!
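+ // each vrecpsq_f32(b, x) computes (2 - b * x), so x * vrecpsq_f32(b, x) is one
+ // Newton-Raphson iteration x' = x * (2 - b * x) converging towards 1 / b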
+ rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);
+ rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp);
+
+ // and finally, compute a / b = a * (1 / b)
+ return vmulq_f32(lhs, rcp);
+ }
+
+ /******
+ * eq *
+ ******/
+
+ WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type)
+ WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type)
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16,
+ wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary;
+ const dispatcher_type dispatcher = {
+ std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) });
+ }
+
+ /*************
+ * fast_cast *
+ *************/
+
+ namespace detail
+ {
+ template <class A>
+ inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
+ {
+ return vcvtq_f32_s32(self);
+ }
+
+ template <class A>
+ inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept
+ {
+ return vcvtq_f32_u32(self);
+ }
+
+ template <class A>
+ inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept
+ {
+ return vcvtq_s32_f32(self);
+ }
+
+ template <class A>
+ inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept
+ {
+ return vcvtq_u32_f32(self);
+ }
+
+ }
+
+ /******
+ * lt *
+ ******/
+
+ WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type)
+ WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type)
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16,
+ wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) });
+ }
+
+ /******
+ * le *
+ ******/
+
+ WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type)
+ WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type)
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16,
+ wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) });
+ }
+
+ /******
+ * gt *
+ ******/
+
+ WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type)
+ WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type)
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16,
+ wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) });
+ }
+
+ /******
+ * ge *
+ ******/
+
+ WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type)
+ WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type)
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_comp_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16,
+ wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) });
+ }
+
+ /*******************
+ * batch_bool_cast *
+ *******************/
+
+ template <class A, class T_out, class T_in>
+ inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch_bool<T_out, A>::register_type;
+ return register_type(self);
+ }
+
+ /***************
+ * bitwise_and *
+ ***************/
+
+ WRAP_BINARY_INT(vandq, detail::identity_return_type)
+
+ namespace detail
+ {
+ inline float32x4_t bitwise_and_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+ {
+ return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs),
+ vreinterpretq_u32_f32(rhs)));
+ }
+
+ template <class V>
+ inline V bitwise_and_neon(V const& lhs, V const& rhs) noexcept
+ {
+ const neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16,
+ wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64,
+ bitwise_and_f32)
+ };
+ return dispatcher.apply(lhs, rhs);
+ }
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return detail::bitwise_and_neon(register_type(lhs), register_type(rhs));
+ }
+
+ /**************
+ * bitwise_or *
+ **************/
+
+ WRAP_BINARY_INT(vorrq, detail::identity_return_type)
+
+ namespace detail
+ {
+ inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+ {
+ return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs),
+ vreinterpretq_u32_f32(rhs)));
+ }
+
+ template <class V>
+ inline V bitwise_or_neon(V const& lhs, V const& rhs) noexcept
+ {
+ const neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16,
+ wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64,
+ bitwise_or_f32)
+ };
+ return dispatcher.apply(lhs, rhs);
+ }
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return detail::bitwise_or_neon(register_type(lhs), register_type(rhs));
+ }
+
+ /***************
+ * bitwise_xor *
+ ***************/
+
+ WRAP_BINARY_INT(veorq, detail::identity_return_type)
+
+ namespace detail
+ {
+ inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+ {
+ return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs),
+ vreinterpretq_u32_f32(rhs)));
+ }
+
+ template <class V>
+ inline V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept
+ {
+ const neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16,
+ wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64,
+ bitwise_xor_f32)
+ };
+ return dispatcher.apply(lhs, rhs);
+ }
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs));
+ }
+
+ /*******
+ * neq *
+ *******/
+
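+ // mask lanes are all-ones or all-zeros, so XOR of two masks is exactly lane-wise !=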
+ template <class A, class T>
+ inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return bitwise_xor(lhs, rhs, A {});
+ }
+
+ /***************
+ * bitwise_not *
+ ***************/
+
+ WRAP_UNARY_INT_EXCLUDING_64(vmvnq)
+
+ namespace detail
+ {
+ inline int64x2_t bitwise_not_s64(int64x2_t arg) noexcept
+ {
+ return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg)));
+ }
+
+ inline uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept
+ {
+ return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg)));
+ }
+
+ inline float32x4_t bitwise_not_f32(float32x4_t arg) noexcept
+ {
+ return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg)));
+ }
+
+ template <class V>
+ inline V bitwise_not_neon(V const& arg) noexcept
+ {
+ const neon_dispatcher::unary dispatcher = {
+ std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16,
+ wrap::vmvnq_u32, wrap::vmvnq_s32,
+ bitwise_not_u64, bitwise_not_s64,
+ bitwise_not_f32)
+ };
+ return dispatcher.apply(arg);
+ }
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ return detail::bitwise_not_neon(register_type(arg));
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return detail::bitwise_not_neon(register_type(arg));
+ }
+
+ /******************
+ * bitwise_andnot *
+ ******************/
+
+ WRAP_BINARY_INT(vbicq, detail::identity_return_type)
+
+ namespace detail
+ {
+ inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept
+ {
+ return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs)));
+ }
+
+ template <class V>
+ inline V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept
+ {
+ const detail::neon_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16,
+ wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64,
+ bitwise_andnot_f32)
+ };
+ return dispatcher.apply(lhs, rhs);
+ }
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch_bool<T, A>::register_type;
+ return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs));
+ }
+
+ /*******
+ * min *
+ *******/
+
+ WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type)
+ WRAP_BINARY_FLOAT(vminq, detail::identity_return_type)
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16,
+ wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) };
+ }
+
+ /*******
+ * max *
+ *******/
+
+ WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type)
+ WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type)
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_dispatcher::binary dispatcher = {
+ std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16,
+ wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) };
+ }
+
+ /*******
+ * abs *
+ *******/
+
+ namespace wrap
+ {
+ inline int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); }
+ inline int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); }
+ inline int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); }
+ }
+ WRAP_UNARY_FLOAT(vabsq)
+
+ namespace detail
+ {
+ inline uint8x16_t abs_u8(uint8x16_t arg) noexcept
+ {
+ return arg;
+ }
+
+ inline uint16x8_t abs_u16(uint16x8_t arg) noexcept
+ {
+ return arg;
+ }
+
+ inline uint32x4_t abs_u32(uint32x4_t arg) noexcept
+ {
+ return arg;
+ }
+ }
+
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0>
+ inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::excluding_int64_dispatcher::unary dispatcher = {
+ std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16,
+ detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32)
+ };
+ return dispatcher.apply(register_type(arg));
+ }
+
+ /*********
+ * rsqrt *
+ *********/
+
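+ // estimate only: vrsqrteq_f32 returns a coarse approximation of 1/sqrt(x);
+ // see sqrt below for how a vrsqrtsq_f32 refinement step sharpens it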
+ template <class A>
+ inline batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return vrsqrteq_f32(arg);
+ }
+
+ /********
+ * sqrt *
+ ********/
+
+ template <class A>
+ inline batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ {
+ batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg);
+ // one Newton-Raphson iteration: vrsqrtsq_f32(a, x) returns (3 - a * x) / 2, so
+ // x * vrsqrtsq_f32(arg * x, x) refines x towards 1 / sqrt(arg)
+ sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
+ batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal));
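+ // vrsqrteq_f32(0) is +inf and 0 * inf would yield NaN, so zero lanes are patched below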
+ batch<float, A> zero(0.f);
+ return select(arg == zero, zero, sqrt_approx);
+ }
+
+ /********************
+ * Fused operations *
+ ********************/
+
+#ifdef __ARM_FEATURE_FMA
+ template <class A>
+ inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
+ {
+ return vfmaq_f32(z, x, y);
+ }
+
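+ // fms(x, y, z) = x * y - z, expressed as a single fused multiply-add into -z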
+ template <class A>
+ inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept
+ {
+ return vfmaq_f32(-z, x, y);
+ }
+#endif
+
+ /*********
+ * haddp *
+ *********/
+
+ template <class A>
+ inline batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept
+ {
+ // row = (a,b,c,d)
+ float32x2_t tmp1, tmp2, tmp3;
+ // tmp1 = (a0 + a1, a2 + a3)
+ tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0]));
+ // tmp2 = (b0 + b1, b2 + b3)
+ tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1]));
+ // tmp1 = (a0..3, b0..3)
+ tmp1 = vpadd_f32(tmp1, tmp2);
+ // tmp2 = (c0 + c1, c2 + c3)
+ tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2]));
+ // tmp3 = (d0 + d1, d2 + d3)
+ tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3]));
+ // tmp2 = (c0..3, d0..3)
+ tmp2 = vpadd_f32(tmp2, tmp3);
+ // return = (a0..3, b0..3, c0..3, d0..3)
+ return vcombine_f32(tmp1, tmp2);
+ }
+
+ /**************
+ * reciprocal *
+ **************/
+
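+ // raw estimate only (roughly 8 bits of precision); callers such as div refine it
+ // with vrecpsq_f32 Newton-Raphson steps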
+ template <class A>
+ inline batch<float, A>
+ reciprocal(const batch<float, A>& x,
+ kernel::requires_arch<neon>) noexcept
+ {
+ return vrecpeq_f32(x);
+ }
+
+ /**********
+ * insert *
+ **********/
+
+ template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ {
+ return vsetq_lane_u8(val, self, I);
+ }
+
+ template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ {
+ return vsetq_lane_s8(val, self, I);
+ }
+
+ template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ {
+ return vsetq_lane_u16(val, self, I);
+ }
+
+ template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ {
+ return vsetq_lane_s16(val, self, I);
+ }
+
+ template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ {
+ return vsetq_lane_u32(val, self, I);
+ }
+
+ template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ {
+ return vsetq_lane_s32(val, self, I);
+ }
+
+ template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ {
+ return vsetq_lane_u64(val, self, I);
+ }
+
+ template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept
+ {
+ return vsetq_lane_s64(val, self, I);
+ }
+
+ template <class A, size_t I>
+ inline batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept
+ {
+ return vsetq_lane_f32(val, self, I);
+ }
+
+ /********************
+ * nearbyint_as_int *
+ ********************/
+
+ template <class A>
+ inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<neon>) noexcept
+ {
+ /* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */
+ // Contributors to this work are:
+ // John W. Ratcliff <jratcliffscarab@gmail.com>
+ // Brandon Rowlett <browlett@nvidia.com>
+ // Ken Fast <kfast@gdeb.com>
+ // Eric van Beurden <evanbeurden@nvidia.com>
+ // Alexander Potylitsin <apotylitsin@nvidia.com>
+ // Hasindu Gamaarachchi <hasindu2008@gmail.com>
+ // Jim Huang <jserv@biilabs.io>
+ // Mark Cheng <marktwtn@biilabs.io>
+ // Malcolm James MacLeod <malcolm@gulden.com>
+ // Devin Hussey (easyaspi314) <husseydevin@gmail.com>
+ // Sebastian Pop <spop@amazon.com>
+ // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com>
+ // Danila Kutenin <danilak@google.com>
+ // François Turban (JishinMaster) <francois.turban@gmail.com>
+ // Pei-Hsuan Hung <afcidk@gmail.com>
+ // Yang-Hao Yuan <yanghau@biilabs.io>
+ // Syoyo Fujita <syoyo@lighttransport.com>
+ // Brecht Van Lommel <brecht@blender.org>
+
+ /*
+ * sse2neon is freely redistributable under the MIT License.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
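+ // e.g. for self = 2.5f: r_normal = 3, r_trunc = 2, delta = 0.5 == half, so the
+ // round-to-even candidate r_even = (2 + 1) & ~1 = 2 is selected, matching
+ // nearbyint's round-half-to-even behaviour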
+ const auto signmask = vdupq_n_u32(0x80000000);
+ const auto half = vbslq_f32(signmask, self,
+ vdupq_n_f32(0.5f)); /* +/- 0.5 */
+ const auto r_normal = vcvtq_s32_f32(vaddq_f32(
+ self, half)); /* round to integer: [a + 0.5] */
+ const auto r_trunc = vcvtq_s32_f32(self); /* truncate to integer: [a] */
+ const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32(
+ vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */
+ const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone),
+ vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */
+ const auto delta = vsubq_f32(
+ self,
+ vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */
+ const auto is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */
+ return vbslq_s32(is_delta_half, r_even, r_normal);
+ }
+
+ /**************
+ * reduce_add *
+ **************/
+
+ namespace detail
+ {
+ template <class T, class A, class V>
+ inline T sum_batch(V const& arg) noexcept
+ {
+ T res = T(0);
+ for (std::size_t i = 0; i < batch<T, A>::size; ++i)
+ {
+ res += arg[i];
+ }
+ return res;
+ }
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ {
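+ // pairwise-add tree: 16 -> 8 -> 4 -> 2 -> 1 partial sums, wrapping modulo 2^8
+ // exactly like the equivalent scalar uint8_t accumulation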
+ uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg));
+ tmp = vpadd_u8(tmp, tmp);
+ tmp = vpadd_u8(tmp, tmp);
+ tmp = vpadd_u8(tmp, tmp);
+ return vget_lane_u8(tmp, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg));
+ tmp = vpadd_s8(tmp, tmp);
+ tmp = vpadd_s8(tmp, tmp);
+ tmp = vpadd_s8(tmp, tmp);
+ return vget_lane_s8(tmp, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg));
+ tmp = vpadd_u16(tmp, tmp);
+ tmp = vpadd_u16(tmp, tmp);
+ return vget_lane_u16(tmp, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg));
+ tmp = vpadd_s16(tmp, tmp);
+ tmp = vpadd_s16(tmp, tmp);
+ return vget_lane_s16(tmp, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg));
+ tmp = vpadd_u32(tmp, tmp);
+ return vget_lane_u32(tmp, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg));
+ tmp = vpadd_s32(tmp, tmp);
+ return vget_lane_s32(tmp, 0);
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return arg.get(0) + arg.get(1);
+ }
+
+ template <class A>
+ inline float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ {
+ float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg));
+ tmp = vpadd_f32(tmp, tmp);
+ return vget_lane_f32(tmp, 0);
+ }
+
+ /**************
+ * reduce_max *
+ **************/
+
+ // Using generic implementation because ARM does not provide intrinsics
+ // for this operation
+
+ /**************
+ * reduce_min *
+ **************/
+
+ // Using generic implementation because ARM does not provide intrinsics
+ // for this operation
+
+ /**********
+ * select *
+ **********/
+
+ namespace wrap
+ {
+ inline uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); }
+ inline int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); }
+ inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); }
+ inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); }
+ inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); }
+ inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); }
+ inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); }
+ inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); }
+ inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); }
+ }
+
+ namespace detail
+ {
+ template <class... T>
+ struct neon_select_dispatcher_impl
+ {
+ using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>;
+ const container_type m_func;
+
+ template <class U>
+ U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept
+ {
+ using func_type = U (*)(comp_return_type<U>, U, U);
+ auto func = xsimd::detail::get<func_type>(m_func);
+ return func(cond, lhs, rhs);
+ }
+ };
+
+ using neon_select_dispatcher = neon_select_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t>;
+ }
+
+ template <class A, class T, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept
+ {
+ using bool_register_type = typename batch_bool<T, A>::register_type;
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_select_dispatcher dispatcher = {
+ std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16,
+ wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64,
+ wrap::vbslq_f32)
+ };
+ return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b));
+ }
+
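+ // a compile-time mask has no dedicated NEON lowering here, so materialize it as a
+ // runtime batch_bool and reuse the vbslq-based select above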
+ template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept
+ {
+ return select(batch_bool<T, A> { b... }, true_br, false_br, neon {});
+ }
+
+ /**********
+ * zip_lo *
+ **********/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs));
+ return vcombine_u8(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs));
+ return vcombine_s8(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs));
+ return vcombine_u16(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs));
+ return vcombine_s16(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs));
+ return vcombine_u32(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs));
+ return vcombine_s32(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs));
+ }
+
+ template <class A>
+ inline batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs));
+ return vcombine_f32(tmp.val[0], tmp.val[1]);
+ }
+
+ /**********
+ * zip_hi *
+ **********/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs));
+ return vcombine_u8(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs));
+ return vcombine_s8(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs));
+ return vcombine_u16(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs));
+ return vcombine_s16(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs));
+ return vcombine_u32(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs));
+ return vcombine_s32(tmp.val[0], tmp.val[1]);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs));
+ }
+
+ template <class A>
+ inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs));
+ return vcombine_f32(tmp.val[0], tmp.val[1]);
+ }
+
+ /****************
+ * extract_pair *
+ ****************/
+
+ namespace detail
+ {
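+ // vextq_* requires an immediate lane count, so the runtime index n is matched
+ // against a compile-time index_sequence, recursing until n == I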
+ template <class A, class T>
+ inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
+ {
+ assert(false && "extract_pair out of bounds");
+ return batch<T, A> {};
+ }
+
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_u8(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_s8(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_u16(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_s16(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_u32(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_s32(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_u64(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_s64(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, size_t I, size_t... Is>
+ inline batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_f32(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, size_t... Is>
+ inline batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
+ {
+ if (n == 0)
+ {
+ return rhs;
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+ }
+
+ template <class A, class T>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept
+ {
+ constexpr std::size_t size = batch<T, A>::size;
+ assert(n < size && "index in bounds");
+ return detail::extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
+ }
+
+ /******************
+ * bitwise_lshift *
+ ******************/
+
+ namespace detail
+ {
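+ // same immediate-dispatch trick as extract_pair: vshlq_n_* needs a compile-time
+ // shift amount, so recurse over an int_sequence until n == I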
+ template <class A, class T>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
+ {
+ assert(false && "bitwise_lshift out of bounds");
+ return batch<T, A> {};
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshlq_n_u8(lhs, I);
+ }
+ else
+ {
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshlq_n_s8(lhs, I);
+ }
+ else
+ {
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshlq_n_u16(lhs, I);
+ }
+ else
+ {
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshlq_n_s16(lhs, I);
+ }
+ else
+ {
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshlq_n_u32(lhs, I);
+ }
+ else
+ {
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshlq_n_s32(lhs, I);
+ }
+ else
+ {
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshlq_n_u64(lhs, I);
+ }
+ else
+ {
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshlq_n_s64(lhs, I);
+ }
+ else
+ {
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int... Is>
+ inline batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
+ {
+ if (n == 0)
+ {
+ return lhs;
+ }
+ else
+ {
+ return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+ }
+
+ template <class A, class T>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
+ {
+ constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
+ assert(0 <= n && n < size && "index in bounds");
+ return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u8(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s8(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u16(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s16(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u32(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s32(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s64(lhs, rhs);
+ }
+
+ /******************
+ * bitwise_rshift *
+ ******************/
+
+ namespace detail
+ {
+ template <class A, class T>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept
+ {
+ assert(false && "bitwise_rshift out of bounds");
+ return batch<T, A> {};
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_u8(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_s8(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_u16(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_s16(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_u32(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_s32(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_u64(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vshrq_n_s64(lhs, I);
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, int... Is>
+ inline batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept
+ {
+ if (n == 0)
+ {
+ return lhs;
+ }
+ else
+ {
+ return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>());
+ }
+ }
+ }
+
+ template <class A, class T>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept
+ {
+ constexpr int size = sizeof(typename batch<T, A>::value_type) * 8;
+ assert(0 <= n && n < size && "index in bounds");
+ return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>());
+ }
+
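+ // NEON has no variable right shift, so shift left by the negated per-lane counts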
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u8(lhs, vnegq_s8(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s8(lhs, vnegq_s8(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u16(lhs, vnegq_s16(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s16(lhs, vnegq_s16(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_u32(lhs, vnegq_s32(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ return vshlq_s32(lhs, vnegq_s32(rhs));
+ }
+
+ // Overloads of bitwise right shift accepting two batches of uint64/int64 are not
+ // available with ARMv7, since negating the shift counts requires vnegq_s64 (AArch64 only)
+
+ /*******
+ * all *
+ *******/
+
+ template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg));
+ return vget_lane_u64(tmp, 0) == ~0ULL;
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
+ }
+
+ /*******
+ * any *
+ *******/
+
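+ // vqmovn_u64 narrows each 64-bit lane with saturation, so any non-zero mask lane
+ // remains non-zero in the narrowed result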
+ template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ uint32x2_t tmp = vqmovn_u64(arg);
+ return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0;
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {});
+ }
+
+ /****************
+ * bitwise_cast *
+ ****************/
+
+#define WRAP_CAST(SUFFIX, TYPE) \
+ namespace wrap \
+ { \
+ inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u8(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s8(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u16(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s16(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u32(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s32(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_u64(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_s64(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_f32(a); \
+ } \
+ }
+
+ WRAP_CAST(u8, uint8x16_t)
+ WRAP_CAST(s8, int8x16_t)
+ WRAP_CAST(u16, uint16x8_t)
+ WRAP_CAST(s16, int16x8_t)
+ WRAP_CAST(u32, uint32x4_t)
+ WRAP_CAST(s32, int32x4_t)
+ WRAP_CAST(u64, uint64x2_t)
+ WRAP_CAST(s64, int64x2_t)
+ WRAP_CAST(f32, float32x4_t)
+
+#undef WRAP_CAST
+
+ namespace detail
+ {
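+ // two-level dispatch: the outer tuple selects the caster for the destination
+ // register type, whose inner tuple holds one vreinterpretq per source type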
+ template <class R, class... T>
+ struct bitwise_caster_impl
+ {
+ using container_type = std::tuple<R (*)(T)...>;
+ container_type m_func;
+
+ template <class U>
+ R apply(U rhs) const noexcept
+ {
+ using func_type = R (*)(U);
+ auto func = xsimd::detail::get<func_type>(m_func);
+ return func(rhs);
+ }
+ };
+
+ template <class R, class... T>
+ inline const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept
+ {
+ return { std::make_tuple(arg...) };
+ }
+
+ template <class... T>
+ struct type_list
+ {
+ };
+
+ template <class RTL, class TTL>
+ struct bitwise_caster;
+
+ template <class... R, class... T>
+ struct bitwise_caster<type_list<R...>, type_list<T...>>
+ {
+ using container_type = std::tuple<bitwise_caster_impl<R, T...>...>;
+ container_type m_caster;
+
+ template <class V, class U>
+ V apply(U rhs) const noexcept
+ {
+ using caster_type = bitwise_caster_impl<V, T...>;
+ auto caster = xsimd::detail::get<caster_type>(m_caster);
+ return caster.apply(rhs);
+ }
+ };
+
+ template <class... T>
+ using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>;
+
+ using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t>;
+ }
+
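+ // bitwise_cast dispatches through the caster below: one
+ // bitwise_caster_impl per destination register type, each holding a
+ // tuple of vreinterpretq function pointers indexed by the source
+ // register type. A minimal usage sketch (kernel-level call, arguments
+ // as in this file):
+ //
+ //   batch<float, neon> f = ...;
+ //   auto i = bitwise_cast(f, batch<int32_t, neon> {}, neon {});
+ //   // resolves to wrap::vreinterpretq_s32_f32: no data moves, only
+ //   // the register's type tag changes.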
+ template <class A, class T, class R>
+ inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept
+ {
+ const detail::neon_bitwise_caster caster = {
+ std::make_tuple(
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16,
+ wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64,
+ wrap::vreinterpretq_u8_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16,
+ wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64,
+ wrap::vreinterpretq_s8_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16,
+ wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64,
+ wrap::vreinterpretq_u16_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16,
+ wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64,
+ wrap::vreinterpretq_s16_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16,
+ wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64,
+ wrap::vreinterpretq_u32_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16,
+ wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64,
+ wrap::vreinterpretq_s32_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16,
+ wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64,
+ wrap::vreinterpretq_u64_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16,
+ wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64,
+ wrap::vreinterpretq_s64_f32),
+ detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16,
+ wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64,
+ wrap::vreinterpretq_f32_f32))
+ };
+ using src_register_type = typename batch<T, A>::register_type;
+ using dst_register_type = typename batch<R, A>::register_type;
+ return caster.apply<dst_register_type>(src_register_type(arg));
+ }
+
+ /*********
+ * isnan *
+ *********/
+
+ template <class A>
+ inline batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept
+ {
+ return !(arg == arg);
+ }
+
+ // slide_left
+ namespace detail
+ {
+ template <size_t N>
+ struct slider_left
+ {
+ template <class A, class T>
+ inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ const auto left = vdupq_n_u8(0);
+ const auto right = bitwise_cast<uint8_t>(x).data;
+ const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N));
+ return bitwise_cast<T>(res);
+ }
+ };
+
+ template <>
+ struct slider_left<0>
+ {
+ template <class A, class T>
+ inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return x;
+ }
+ };
+ } // namespace detail
+
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return detail::slider_left<N> {}(x, A {});
+ }
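+
+ // Example: slide_left<N> moves bytes towards the high lanes,
+ // zero-filling the low N bytes. For batch<uint8_t, neon>
+ // x = {1, 2, ..., 16}, slide_left<3>(x, neon {}) yields
+ // {0, 0, 0, 1, 2, ..., 13}.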
+
+ // slide_right
+ namespace detail
+ {
+ template <size_t N>
+ struct slider_right
+ {
+ template <class A, class T>
+ inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ const auto left = bitwise_cast<uint8_t>(x).data;
+ const auto right = vdupq_n_u8(0);
+ const batch<uint8_t, A> res(vextq_u8(left, right, N));
+ return bitwise_cast<T>(res);
+ }
+ };
+
+ template <>
+ struct slider_right<16>
+ {
+ template <class A, class T>
+ inline batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept
+ {
+ return batch<T, A> {};
+ }
+ };
+ } // namespace detail
+
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept
+ {
+ return detail::slider_right<N> {}(x, A {});
+ }
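+
+ // Example: slide_right<N> is the mirror image: it drops the low N
+ // bytes and zero-fills the high ones, so slide_right<3>(x, neon {})
+ // on the same input yields {4, 5, ..., 16, 0, 0, 0}.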
+ }
+
+ template <class batch_type, typename batch_type::value_type... Values>
+ struct batch_constant;
+
+ namespace kernel
+ {
+ /***********
+ * swizzle *
+ ***********/
+
+ template <class A, class T, class I, I... idx>
+ inline batch<T, A> swizzle(batch<T, A> const& self,
+ batch_constant<batch<I, A>, idx...>,
+ requires_arch<neon>) noexcept
+ {
+ static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices");
+ std::array<T, batch<T, A>::size> data;
+ self.store_aligned(data.data());
+ return set(batch<T, A>(), A(), data[idx]...);
+ }
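+
+ // This generic version is the slow path: it round-trips through a
+ // stack array instead of using a table lookup. The neon64 kernels in
+ // xsimd_neon64.hpp override it with vqtbl1q-based swizzles.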
+ }
+}
+
+#undef WRAP_BINARY_INT_EXCLUDING_64
+#undef WRAP_BINARY_INT
+#undef WRAP_BINARY_FLOAT
+#undef WRAP_UNARY_INT_EXCLUDING_64
+#undef WRAP_UNARY_INT
+#undef WRAP_UNARY_FLOAT
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
new file mode 100644
index 0000000000..31ab6210bd
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
@@ -0,0 +1,1322 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NEON64_HPP
+#define XSIMD_NEON64_HPP
+
+#include <complex>
+#include <cstddef>
+#include <tuple>
+
+#include "../types/xsimd_neon64_register.hpp"
+#include "../types/xsimd_utils.hpp"
+
+namespace xsimd
+{
+ template <class batch_type, bool... Values>
+ struct batch_bool_constant;
+
+ namespace kernel
+ {
+ using namespace types;
+
+ /*******
+ * all *
+ *******/
+
+ template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return vminvq_u32(arg) == ~0U;
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
+ }
+
+ /*******
+ * any *
+ *******/
+
+ template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return vmaxvq_u32(arg) != 0;
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
+ }
+
+ /*************
+ * broadcast *
+ *************/
+
+ // Required to avoid an ambiguous call
+ template <class A, class T>
+ inline batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept
+ {
+ return broadcast<neon64>(val, neon {});
+ }
+
+ template <class A>
+ inline batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept
+ {
+ return vdupq_n_f64(val);
+ }
+
+ /*******
+ * set *
+ *******/
+
+ template <class A>
+ inline batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept
+ {
+ return float64x2_t { d0, d1 };
+ }
+
+ template <class A>
+ inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept
+ {
+ using register_type = typename batch_bool<double, A>::register_type;
+ using unsigned_type = as_unsigned_integer_t<double>;
+ return register_type { static_cast<unsigned_type>(b0 ? -1LL : 0LL),
+ static_cast<unsigned_type>(b1 ? -1LL : 0LL) };
+ }
+
+ /*************
+ * from_bool *
+ *************/
+
+ template <class A>
+ inline batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.))));
+ }
+
+ /********
+ * load *
+ ********/
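+ // Aligned loads: promise 16-byte alignment to the compiler where the
+ // toolchain can express it (GCC/Clang via __builtin_assume_aligned,
+ // MSVC via the *_ex intrinsics taking an alignment hint in bits);
+ // otherwise fall back to a plain load.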
+#if defined(__clang__) || defined(__GNUC__)
+#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
+#elif defined(_MSC_VER)
+#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
+#else
+#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
+#endif
+
+ template <class A>
+ inline batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_f64, double*, src);
+ }
+
+ template <class A>
+ inline batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
+ {
+ return vld1q_f64(src);
+ }
+#undef xsimd_aligned_load
+
+ /*********
+ * store *
+ *********/
+
+ template <class A>
+ inline void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
+ {
+ vst1q_f64(dst, src);
+ }
+
+ template <class A>
+ inline void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
+ {
+ return store_aligned<A>(dst, src, A {});
+ }
+
+ /****************
+ * load_complex *
+ ****************/
+
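+ // vld2q_f64 deinterleaves on load: memory {re0, im0, re1, im1}
+ // becomes val[0] = {re0, re1} and val[1] = {im0, im1}, exactly the
+ // separate real/imaginary registers batch<std::complex<double>>
+ // stores.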
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept
+ {
+ using real_batch = batch<double, A>;
+ const double* buf = reinterpret_cast<const double*>(mem);
+ float64x2x2_t tmp = vld2q_f64(buf);
+ real_batch real = tmp.val[0],
+ imag = tmp.val[1];
+ return batch<std::complex<double>, A> { real, imag };
+ }
+
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept
+ {
+ return load_complex_aligned<A>(mem, cvt, A {});
+ }
+
+ /*****************
+ * store_complex *
+ *****************/
+
+ template <class A>
+ inline void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
+ {
+ float64x2x2_t tmp;
+ tmp.val[0] = src.real();
+ tmp.val[1] = src.imag();
+ double* buf = reinterpret_cast<double*>(dst);
+ vst2q_f64(buf, tmp);
+ }
+
+ template <class A>
+ inline void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
+ {
+ store_complex_aligned(dst, src, A {});
+ }
+
+ /*******
+ * neg *
+ *******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs)));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vnegq_s64(rhs);
+ }
+
+ template <class A>
+ inline batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vnegq_f64(rhs);
+ }
+
+ /*******
+ * add *
+ *******/
+
+ template <class A>
+ inline batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vaddq_f64(lhs, rhs);
+ }
+
+ /********
+ * sadd *
+ ********/
+
+ template <class A>
+ inline batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return add(lhs, rhs, neon64 {});
+ }
+
+ /*******
+ * sub *
+ *******/
+
+ template <class A>
+ inline batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vsubq_f64(lhs, rhs);
+ }
+
+ /********
+ * ssub *
+ ********/
+
+ template <class A>
+ inline batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return sub(lhs, rhs, neon64 {});
+ }
+
+ /*******
+ * mul *
+ *******/
+
+ template <class A>
+ inline batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vmulq_f64(lhs, rhs);
+ }
+
+ /*******
+ * div *
+ *******/
+
+#if defined(XSIMD_FAST_INTEGER_DIVISION)
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs));
+ }
+#endif
+ template <class A>
+ inline batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vdivq_f64(lhs, rhs);
+ }
+
+ /******
+ * eq *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_f64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_u64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_u64(lhs, rhs);
+ }
+
+ /*************
+ * fast_cast *
+ *************/
+ namespace detail
+ {
+ template <class A>
+ inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_f64_s64(x);
+ }
+
+ template <class A>
+ inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_f64_u64(x);
+ }
+
+ template <class A>
+ inline batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_s64_f64(x);
+ }
+
+ template <class A>
+ inline batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_u64_f64(x);
+ }
+
+ }
+
+ /******
+ * lt *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcltq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcltq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcltq_f64(lhs, rhs);
+ }
+
+ /******
+ * le *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcleq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcleq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcleq_f64(lhs, rhs);
+ }
+
+ /******
+ * gt *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgtq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgtq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgtq_f64(lhs, rhs);
+ }
+
+ /******
+ * ge *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgeq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgeq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgeq_f64(lhs, rhs);
+ }
+
+ /*******************
+ * batch_bool_cast *
+ *******************/
+
+ template <class A, class T_out, class T_in>
+ inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept
+ {
+ using register_type = typename batch_bool<T_out, A>::register_type;
+ return register_type(self);
+ }
+
+ /***************
+ * bitwise_and *
+ ***************/
+
+ template <class A>
+ inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs),
+ vreinterpretq_u64_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vandq_u64(lhs, rhs);
+ }
+
+ /**************
+ * bitwise_or *
+ **************/
+
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs),
+ vreinterpretq_u64_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vorrq_u64(lhs, rhs);
+ }
+
+ /***************
+ * bitwise_xor *
+ ***************/
+
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs),
+ vreinterpretq_u64_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return veorq_u64(lhs, rhs);
+ }
+
+ /*******
+ * neq *
+ *******/
+
+ template <class A>
+ inline batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return bitwise_xor(lhs, rhs, A {});
+ }
+
+ /***************
+ * bitwise_not *
+ ***************/
+
+ template <class A>
+ inline batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return detail::bitwise_not_u64(rhs);
+ }
+
+ /******************
+ * bitwise_andnot *
+ ******************/
+
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs),
+ vreinterpretq_u64_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vbicq_u64(lhs, rhs);
+ }
+
+ /*******
+ * min *
+ *******/
+
+ template <class A>
+ inline batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vminq_f64(lhs, rhs);
+ }
+
+ /*******
+ * max *
+ *******/
+
+ template <class A>
+ inline batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vmaxq_f64(lhs, rhs);
+ }
+
+ /*******
+ * abs *
+ *******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return rhs;
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vabsq_s64(rhs);
+ }
+
+ template <class A>
+ inline batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vabsq_f64(rhs);
+ }
+
+ template <class A>
+ inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<neon64>) noexcept
+ {
+ return vcvtnq_s32_f32(self);
+ }
+
+#if !defined(__GNUC__)
+ template <class A>
+ inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
+ requires_arch<neon64>) noexcept
+ {
+ return vcvtnq_s64_f64(self);
+ }
+#endif
+
+ /**************
+ * reciprocal *
+ **************/
+
+ template <class A>
+ inline batch<double, A>
+ reciprocal(const batch<double, A>& x,
+ kernel::requires_arch<neon64>) noexcept
+ {
+ return vrecpeq_f64(x);
+ }
+
+ /********
+ * rsqrt *
+ ********/
+
+ template <class A>
+ inline batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vrsqrteq_f64(rhs);
+ }
+
+ /********
+ * sqrt *
+ ********/
+
+ template <class A>
+ inline batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vsqrtq_f64(rhs);
+ }
+
+ /********************
+ * Fused operations *
+ ********************/
+
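+ // vfmaq_f64(a, b, c) computes a + b * c with a single rounding, so
+ // fma(x, y, z) maps to vfmaq_f64(z, x, y) = x * y + z and
+ // fms(x, y, z) to vfmaq_f64(-z, x, y) = x * y - z.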
+#ifdef __ARM_FEATURE_FMA
+ template <class A>
+ inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
+ {
+ return vfmaq_f64(z, x, y);
+ }
+
+ template <class A>
+ inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
+ {
+ return vfmaq_f64(-z, x, y);
+ }
+#endif
+
+ /*********
+ * haddp *
+ *********/
+
+ template <class A>
+ inline batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept
+ {
+ return vpaddq_f64(row[0], row[1]);
+ }
+
+ /**********
+ * insert *
+ **********/
+
+ template <class A, size_t I>
+ inline batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept
+ {
+ return vsetq_lane_f64(val, self, I);
+ }
+
+ /******************
+ * reducer macros *
+ ******************/
+
+ // Wrap reducer intrinsics so we can pass them as function pointers
+ // - OP: intrinsic name prefix, e.g., vaddvq
+
+#define WRAP_REDUCER_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ inline uint8_t OP##_u8(uint8x16_t a) noexcept \
+ { \
+ return ::OP##_u8(a); \
+ } \
+ inline int8_t OP##_s8(int8x16_t a) noexcept \
+ { \
+ return ::OP##_s8(a); \
+ } \
+ inline uint16_t OP##_u16(uint16x8_t a) noexcept \
+ { \
+ return ::OP##_u16(a); \
+ } \
+ inline int16_t OP##_s16(int16x8_t a) noexcept \
+ { \
+ return ::OP##_s16(a); \
+ } \
+ inline uint32_t OP##_u32(uint32x4_t a) noexcept \
+ { \
+ return ::OP##_u32(a); \
+ } \
+ inline int32_t OP##_s32(int32x4_t a) noexcept \
+ { \
+ return ::OP##_s32(a); \
+ } \
+ }
+
+#define WRAP_REDUCER_INT(OP) \
+ WRAP_REDUCER_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ inline uint64_t OP##_u64(uint64x2_t a) noexcept \
+ { \
+ return ::OP##_u64(a); \
+ } \
+ inline int64_t OP##_s64(int64x2_t a) noexcept \
+ { \
+ return ::OP##_s64(a); \
+ } \
+ }
+
+#define WRAP_REDUCER_FLOAT(OP) \
+ namespace wrap \
+ { \
+ inline float OP##_f32(float32x4_t a) noexcept \
+ { \
+ return ::OP##_f32(a); \
+ } \
+ inline double OP##_f64(float64x2_t a) noexcept \
+ { \
+ return ::OP##_f64(a); \
+ } \
+ }
+
+ namespace detail
+ {
+ template <class R>
+ struct reducer_return_type_impl;
+
+ template <>
+ struct reducer_return_type_impl<uint8x16_t>
+ {
+ using type = uint8_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<int8x16_t>
+ {
+ using type = int8_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<uint16x8_t>
+ {
+ using type = uint16_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<int16x8_t>
+ {
+ using type = int16_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<uint32x4_t>
+ {
+ using type = uint32_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<int32x4_t>
+ {
+ using type = int32_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<uint64x2_t>
+ {
+ using type = uint64_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<int64x2_t>
+ {
+ using type = int64_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<float32x4_t>
+ {
+ using type = float;
+ };
+
+ template <>
+ struct reducer_return_type_impl<float64x2_t>
+ {
+ using type = double;
+ };
+
+ template <class R>
+ using reducer_return_type = typename reducer_return_type_impl<R>::type;
+
+ template <class... T>
+ struct neon_reducer_dispatcher_impl : neon_dispatcher_base<reducer_return_type, T...>
+ {
+ };
+
+ using neon_reducer_dispatcher = neon_reducer_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t, float64x2_t>;
+ template <class T>
+ using enable_neon64_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value,
+ int>::type;
+ }
+
+ /**************
+ * reduce_add *
+ **************/
+
+ WRAP_REDUCER_INT(vaddvq)
+ WRAP_REDUCER_FLOAT(vaddvq)
+
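+ // A minimal usage sketch: for batch<float, neon64> v = {1, 2, 3, 4},
+ // reduce_add(v, neon64 {}) routes through the dispatcher to
+ // wrap::vaddvq_f32 and returns 10.0f (public entry point:
+ // xsimd::reduce_add(v)).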
+ template <class A, class T, detail::enable_neon64_type_t<T> = 0>
+ inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_reducer_dispatcher::unary dispatcher = {
+ std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16,
+ wrap::vaddvq_u32, wrap::vaddvq_s32, wrap::vaddvq_u64, wrap::vaddvq_s64,
+ wrap::vaddvq_f32, wrap::vaddvq_f64)
+ };
+ return dispatcher.apply(register_type(arg));
+ }
+
+ /**************
+ * reduce_max *
+ **************/
+
+ WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq)
+ WRAP_REDUCER_FLOAT(vmaxvq)
+
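+ // ACLE provides no across-vector max/min for 64-bit integer lanes
+ // (vmaxvq/vminvq stop at 32-bit integers, plus f32/f64), so the
+ // 64-bit reducers here and in reduce_min below are emulated with two
+ // lane extracts and std::max / std::min.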
+ namespace wrap
+ {
+ inline uint64_t vmaxvq_u64(uint64x2_t a) noexcept
+ {
+ return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
+ }
+
+ inline int64_t vmaxvq_s64(int64x2_t a) noexcept
+ {
+ return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
+ }
+ }
+
+ template <class A, class T, detail::enable_neon64_type_t<T> = 0>
+ inline typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_reducer_dispatcher::unary dispatcher = {
+ std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16,
+ wrap::vmaxvq_u32, wrap::vmaxvq_s32, wrap::vmaxvq_u64, wrap::vmaxvq_s64,
+ wrap::vmaxvq_f32, wrap::vmaxvq_f64)
+ };
+ return dispatcher.apply(register_type(arg));
+ }
+
+ /**************
+ * reduce_min *
+ **************/
+
+ WRAP_REDUCER_INT_EXCLUDING_64(vminvq)
+ WRAP_REDUCER_FLOAT(vminvq)
+
+ namespace wrap
+ {
+ inline uint64_t vminvq_u64(uint64x2_t a) noexcept
+ {
+ return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
+ }
+
+ inline int64_t vminvq_s64(int64x2_t a) noexcept
+ {
+ return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
+ }
+ }
+
+ template <class A, class T, detail::enable_neon64_type_t<T> = 0>
+ inline typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_reducer_dispatcher::unary dispatcher = {
+ std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16,
+ wrap::vminvq_u32, wrap::vminvq_s32, wrap::vminvq_u64, wrap::vminvq_s64,
+ wrap::vminvq_f32, wrap::vminvq_f64)
+ };
+ return dispatcher.apply(register_type(arg));
+ }
+
+#undef WRAP_REDUCER_INT_EXCLUDING_64
+#undef WRAP_REDUCER_INT
+#undef WRAP_REDUCER_FLOAT
+
+ /**********
+ * select *
+ **********/
+
+ template <class A>
+ inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept
+ {
+ return vbslq_f64(cond, a, b);
+ }
+
+ template <class A, bool... b>
+ inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&,
+ batch<double, A> const& true_br,
+ batch<double, A> const& false_br,
+ requires_arch<neon64>) noexcept
+ {
+ return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
+ }
+ /**********
+ * zip_lo *
+ **********/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_f64(lhs, rhs);
+ }
+
+ /**********
+ * zip_hi *
+ **********/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_f64(lhs, rhs);
+ }
+
+ /****************
+ * extract_pair *
+ ****************/
+
+ namespace detail
+ {
+ template <class A, size_t I, size_t... Is>
+ inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
+ ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_f64(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+ }
+
+ template <class A>
+ inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
+ {
+ constexpr std::size_t size = batch<double, A>::size;
+ assert(n < size && "index in bounds");
+ return detail::extract_pair(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
+ }
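+
+ // Example: vextq_f64(rhs, lhs, I) reads lanes starting I into rhs and
+ // continues into lhs, so for lhs = {a0, a1} and rhs = {b0, b1}:
+ //
+ //   extract_pair(lhs, rhs, 0, neon64 {}); // {b0, b1}
+ //   extract_pair(lhs, rhs, 1, neon64 {}); // {b1, a0}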
+
+ /******************
+ * bitwise_rshift *
+ ******************/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
+ {
+ return bitwise_rshift<A>(lhs, n, neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vshlq_u64(lhs, vnegq_s64(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
+ {
+ return bitwise_rshift<A>(lhs, n, neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vshlq_s64(lhs, vnegq_s64(rhs));
+ }
+
+ /****************
+ * bitwise_cast *
+ ****************/
+
+#define WRAP_CAST(SUFFIX, TYPE) \
+ namespace wrap \
+ { \
+ inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept \
+ { \
+ return ::vreinterpretq_f64_##SUFFIX(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_f64(a); \
+ } \
+ }
+
+ WRAP_CAST(u8, uint8x16_t)
+ WRAP_CAST(s8, int8x16_t)
+ WRAP_CAST(u16, uint16x8_t)
+ WRAP_CAST(s16, int16x8_t)
+ WRAP_CAST(u32, uint32x4_t)
+ WRAP_CAST(s32, int32x4_t)
+ WRAP_CAST(u64, uint64x2_t)
+ WRAP_CAST(s64, int64x2_t)
+ WRAP_CAST(f32, float32x4_t)
+
+#undef WRAP_CAST
+
+ template <class A, class T>
+ inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
+ {
+ using caster_type = detail::bitwise_caster_impl<float64x2_t,
+ uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t>;
+ const caster_type caster = {
+ std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16,
+ wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64,
+ wrap::vreinterpretq_f64_f32)
+ };
+ using register_type = typename batch<T, A>::register_type;
+ return caster.apply(register_type(arg));
+ }
+
+ namespace detail
+ {
+ template <class S, class... R>
+ struct bitwise_caster_neon64
+ {
+ using container_type = std::tuple<R (*)(S)...>;
+ container_type m_func;
+
+ template <class V>
+ V apply(float64x2_t rhs) const noexcept
+ {
+ using func_type = V (*)(float64x2_t);
+ auto func = xsimd::detail::get<func_type>(m_func);
+ return func(rhs);
+ }
+ };
+ }
+
+ template <class A, class R>
+ inline batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept
+ {
+ using caster_type = detail::bitwise_caster_neon64<float64x2_t,
+ uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t>;
+ const caster_type caster = {
+ std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64,
+ wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64,
+ wrap::vreinterpretq_f32_f64)
+ };
+ using src_register_type = typename batch<double, A>::register_type;
+ using dst_register_type = typename batch<R, A>::register_type;
+ return caster.apply<dst_register_type>(src_register_type(arg));
+ }
+
+ template <class A>
+ inline batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
+ {
+ return arg;
+ }
+
+ /*********
+ * isnan *
+ *********/
+
+ template <class A>
+ inline batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return !(arg == arg);
+ }
+ }
+
+ template <class batch_type, typename batch_type::value_type... Values>
+ struct batch_constant;
+
+ namespace kernel
+ {
+ /***********
+ * swizzle *
+ ***********/
+
+ namespace detail
+ {
+ using ::xsimd::batch_constant;
+ using ::xsimd::detail::integer_sequence;
+ using ::xsimd::detail::make_integer_sequence;
+
+ template <class CB1, class CB2, class IS>
+ struct index_burst_impl;
+
+ template <class B1, class B2, typename B2::value_type... V,
+ typename B2::value_type... incr>
+ struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>,
+ integer_sequence<typename B2::value_type, incr...>>
+ {
+ using type = batch_constant<B2, V...>;
+ };
+
+ template <class B1, typename B1::value_type V0, typename B1::value_type... V1,
+ class B2, typename B2::value_type... V2,
+ typename B2::value_type... incr>
+ struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>,
+ integer_sequence<typename B2::value_type, incr...>>
+ {
+ using value_type = typename B2::value_type;
+ using next_input = batch_constant<B1, V1...>;
+ using next_output = batch_constant<B2, V2..., (V0 + incr)...>;
+ using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type;
+ };
+
+ template <class B, class T>
+ struct index_burst;
+
+ template <class B, typename B::value_type... V, class T>
+ struct index_burst<batch_constant<B, V...>, T>
+ {
+ static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T);
+ using input = batch_constant<B, (mul * V)...>;
+ using output = batch_constant<batch<T, typename B::arch_type>>;
+ using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type;
+ };
+
+ template <class B, class T>
+ using index_burst_t = typename index_burst<B, T>::type;
+
+ template <class T, class B>
+ inline index_burst_t<B, T> burst_index(B)
+ {
+ return index_burst_t<B, T>();
+ }
+ }
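+
+ // index_burst expands swizzle indices given on wide lanes into the
+ // byte indices vqtbl1q_u8 consumes: an index V on a lane of K bytes
+ // becomes the K consecutive byte indices K*V, K*V + 1, ..., K*V + K - 1.
+ // Example: uint32_t indices <2, 3, 0, 1> burst into bytes
+ // <8..11, 12..15, 0..3, 4..7>.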
+
+ template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+ uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+ inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
+ batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
+ }
+
+ template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+ uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+ inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
+ batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
+ }
+
+ template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+ inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
+ batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_u16_u8(swizzle<A>(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+ inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
+ batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<int8_t, A>;
+ return vreinterpretq_s16_s8(swizzle<A>(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
+ batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_u32_u8(swizzle<A>(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
+ batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<int8_t, A>;
+ return vreinterpretq_s32_s8(swizzle<A>(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_u64_u8(swizzle<A>(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<int8_t, A>;
+ return vreinterpretq_s64_s8(swizzle<A>(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<float, A> swizzle(batch<float, A> const& self,
+ batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_f32_u8(swizzle<A>(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<double, A> swizzle(batch<double, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_f64_u8(swizzle<A>(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
+ batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return batch<std::complex<float>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return batch<std::complex<double>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
+ }
+ }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp
new file mode 100644
index 0000000000..d5116cbd71
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp
@@ -0,0 +1,1043 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SCALAR_HPP
+#define XSIMD_SCALAR_HPP
+
+#include <cassert>
+#include <cmath>
+#include <complex>
+#include <cstdint>
+#include <cstring>
+#include <limits>
+#include <type_traits>
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+#include "xtl/xcomplex.hpp"
+#endif
+
+namespace xsimd
+{
+ template <class T, class A>
+ class batch;
+ template <class T, class A>
+ class batch_bool;
+
+ using std::abs;
+
+ using std::acos;
+ using std::acosh;
+ using std::arg;
+ using std::asin;
+ using std::asinh;
+ using std::atan;
+ using std::atan2;
+ using std::atanh;
+ using std::cbrt;
+ using std::ceil;
+ using std::conj;
+ using std::copysign;
+ using std::cos;
+ using std::cosh;
+ using std::erf;
+ using std::erfc;
+ using std::exp;
+ using std::exp2;
+ using std::expm1;
+ using std::fabs;
+ using std::fdim;
+ using std::floor;
+ using std::fmax;
+ using std::fmin;
+ using std::fmod;
+ using std::hypot;
+ using std::ldexp;
+ using std::lgamma;
+ using std::log;
+ using std::log10;
+ using std::log1p;
+ using std::log2;
+ using std::modf;
+ using std::nearbyint;
+ using std::nextafter;
+ using std::norm;
+ using std::polar;
+ using std::proj;
+ using std::remainder;
+ using std::rint;
+ using std::round;
+ using std::sin;
+ using std::sinh;
+ using std::sqrt;
+ using std::tan;
+ using std::tanh;
+ using std::tgamma;
+ using std::trunc;
+
+#ifndef _WIN32
+ using std::isfinite;
+ using std::isinf;
+ using std::isnan;
+#else
+
+ // Windows defines catch-all templates, so provide constrained
+ // overloads instead
+ template <class T>
+ inline typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+ isfinite(T var) noexcept
+ {
+ return std::isfinite(var);
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, bool>::type
+ isfinite(T var) noexcept
+ {
+ return isfinite(double(var));
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+ isinf(T var) noexcept
+ {
+ return std::isinf(var);
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, bool>::type
+ isinf(T var) noexcept
+ {
+ return isinf(double(var));
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_floating_point<T>::value, bool>::type
+ isnan(T var) noexcept
+ {
+ return std::isnan(var);
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, bool>::type
+ isnan(T var) noexcept
+ {
+ return isnan(double(var));
+ }
+#endif
+
+ template <class T, class Tp>
+ inline auto add(T const& x, Tp const& y) noexcept -> decltype(x + y)
+ {
+ return x + y;
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, T>::type
+ bitwise_and(T x, T y) noexcept
+ {
+ return x & y;
+ }
+
+ inline float bitwise_and(float x, float y) noexcept
+ {
+ uint32_t ix, iy;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(float));
+ std::memcpy((void*)&iy, (void*)&y, sizeof(float));
+ uint32_t ir = bitwise_and(ix, iy);
+ float r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(float));
+ return r;
+ }
+
+ inline double bitwise_and(double x, double y) noexcept
+ {
+ uint64_t ix, iy;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(double));
+ std::memcpy((void*)&iy, (void*)&y, sizeof(double));
+ uint64_t ir = bitwise_and(ix, iy);
+ double r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(double));
+ return r;
+ }
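+
+ // The float/double overloads here and below type-pun through
+ // std::memcpy, the portable way to reinterpret a value's bit pattern
+ // without breaking strict aliasing; the actual bit operation is then
+ // done by the integer overload.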
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, T>::type
+ bitwise_andnot(T x, T y) noexcept
+ {
+ return x & ~y;
+ }
+
+ inline float bitwise_andnot(float x, float y) noexcept
+ {
+ uint32_t ix, iy;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(float));
+ std::memcpy((void*)&iy, (void*)&y, sizeof(float));
+ uint32_t ir = bitwise_andnot(ix, iy);
+ float r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(float));
+ return r;
+ }
+
+ inline double bitwise_andnot(double x, double y) noexcept
+ {
+ uint64_t ix, iy;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(double));
+ std::memcpy((void*)&iy, (void*)&y, sizeof(double));
+ uint64_t ir = bitwise_andnot(ix, iy);
+ double r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(double));
+ return r;
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, T>::type
+ bitwise_not(T x) noexcept
+ {
+ return ~x;
+ }
+
+ inline float bitwise_not(float x) noexcept
+ {
+ uint32_t ix;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(float));
+ uint32_t ir = bitwise_not(ix);
+ float r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(float));
+ return r;
+ }
+
+ inline double bitwise_not(double x) noexcept
+ {
+ uint64_t ix;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(double));
+ uint64_t ir = bitwise_not(ix);
+ double r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(double));
+ return r;
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, T>::type
+ bitwise_or(T x, T y) noexcept
+ {
+ return x | y;
+ }
+
+ inline float bitwise_or(float x, float y) noexcept
+ {
+ uint32_t ix, iy;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(float));
+ std::memcpy((void*)&iy, (void*)&y, sizeof(float));
+ uint32_t ir = bitwise_or(ix, iy);
+ float r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(float));
+ return r;
+ }
+
+ inline double bitwise_or(double x, double y) noexcept
+ {
+ uint64_t ix, iy;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(double));
+ std::memcpy((void*)&iy, (void*)&y, sizeof(double));
+ uint64_t ir = bitwise_or(ix, iy);
+ double r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(double));
+ return r;
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, T>::type
+ bitwise_xor(T x, T y) noexcept
+ {
+ return x ^ y;
+ }
+
+ inline float bitwise_xor(float x, float y) noexcept
+ {
+ uint32_t ix, iy;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(float));
+ std::memcpy((void*)&iy, (void*)&y, sizeof(float));
+ uint32_t ir = bitwise_xor(ix, iy);
+ float r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(float));
+ return r;
+ }
+
+ inline double bitwise_xor(double x, double y) noexcept
+ {
+ uint64_t ix, iy;
+ std::memcpy((void*)&ix, (void*)&x, sizeof(double));
+ std::memcpy((void*)&iy, (void*)&y, sizeof(double));
+ uint64_t ir = bitwise_xor(ix, iy);
+ double r;
+ std::memcpy((void*)&r, (void*)&ir, sizeof(double));
+ return r;
+ }
+
+ template <class T, class Tp>
+ inline auto div(T const& x, Tp const& y) noexcept -> decltype(x / y)
+ {
+ return x / y;
+ }
+
+ template <class T, class Tp>
+ inline auto mod(T const& x, Tp const& y) noexcept -> decltype(x % y)
+ {
+ return x % y;
+ }
+
+ template <class T, class Tp>
+ inline auto mul(T const& x, Tp const& y) noexcept -> decltype(x * y)
+ {
+ return x * y;
+ }
+
+ template <class T>
+ inline auto neg(T const& x) noexcept -> decltype(-x)
+ {
+ return -x;
+ }
+
+ template <class T>
+ inline auto pos(T const& x) noexcept -> decltype(+x)
+ {
+ return +x;
+ }
+
+ inline float reciprocal(float const& x) noexcept
+ {
+ return 1.f / x;
+ }
+
+ inline double reciprocal(double const& x) noexcept
+ {
+ return 1. / x;
+ }
+
+#ifdef XSIMD_ENABLE_NUMPY_COMPLEX
+ template <class T>
+ inline bool isnan(std::complex<T> var) noexcept
+ {
+ return std::isnan(std::real(var)) || std::isnan(std::imag(var));
+ }
+
+ template <class T>
+ inline bool isinf(std::complex<T> var) noexcept
+ {
+ return std::isinf(std::real(var)) || std::isinf(std::imag(var));
+ }
+#endif
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ using xtl::abs;
+ using xtl::acos;
+ using xtl::acosh;
+ using xtl::asin;
+ using xtl::asinh;
+ using xtl::atan;
+ using xtl::atanh;
+ using xtl::cos;
+ using xtl::cosh;
+ using xtl::exp;
+ using xtl::log;
+ using xtl::log10;
+ using xtl::norm;
+ using xtl::pow;
+ using xtl::proj;
+ using xtl::sin;
+ using xtl::sinh;
+ using xtl::sqrt;
+ using xtl::tan;
+ using xtl::tanh;
+#endif
+
+ template <typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline T clip(const T& val, const T& low, const T& hi) noexcept
+ {
+ assert(low <= hi && "ordered clipping bounds");
+ return low > val ? low : (hi < val ? hi : val);
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool is_flint(const T& x) noexcept
+ {
+ return std::isnan(x - x) ? false : (x - std::trunc(x)) == T(0);
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool is_even(const T& x) noexcept
+ {
+ return is_flint(x * T(0.5));
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool is_odd(const T& x) noexcept
+ {
+ return is_even(x - 1.);
+ }
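+
+ // Examples: is_flint(2.0) is true, is_flint(2.5) is false, and NaN
+ // and infinities are rejected because x - x is NaN for both.
+ // is_even(6.0) reduces to is_flint(3.0); is_odd(7.0) reduces to
+ // is_even(6.0).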
+
+ inline int32_t nearbyint_as_int(float var) noexcept
+ {
+ return static_cast<int32_t>(std::nearbyint(var));
+ }
+
+ inline int64_t nearbyint_as_int(double var) noexcept
+ {
+ return static_cast<int64_t>(std::nearbyint(var));
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool eq(const T& x0, const T& x1) noexcept
+ {
+ return x0 == x1;
+ }
+
+ template <class T>
+ inline bool eq(const std::complex<T>& x0, const std::complex<T>& x1) noexcept
+ {
+ return x0 == x1;
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool ge(const T& x0, const T& x1) noexcept
+ {
+ return x0 >= x1;
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool gt(const T& x0, const T& x1) noexcept
+ {
+ return x0 > x1;
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool le(const T& x0, const T& x1) noexcept
+ {
+ return x0 <= x1;
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool lt(const T& x0, const T& x1) noexcept
+ {
+ return x0 < x1;
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool neq(const T& x0, const T& x1) noexcept
+ {
+ return x0 != x1;
+ }
+
+ template <class T>
+ inline bool neq(const std::complex<T>& x0, const std::complex<T>& x1) noexcept
+ {
+ return !(x0 == x1);
+ }
+
+#if defined(__APPLE__)
+ inline float exp10(const float& x) noexcept
+ {
+ return __exp10f(x);
+ }
+ inline double exp10(const double& x) noexcept
+ {
+ return __exp10(x);
+ }
+#elif defined(__GLIBC__)
+ inline float exp10(const float& x) noexcept
+ {
+ return ::exp10f(x);
+ }
+ inline double exp10(const double& x) noexcept
+ {
+ return ::exp10(x);
+ }
+#elif defined(_WIN32)
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline T exp10(const T& x) noexcept
+ {
+ // Very inefficient but other implementations give incorrect results
+ // on Windows
+ return std::pow(T(10), x);
+ }
+#else
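+ // Fallback: 10^x == exp(x * ln(10)); the hex literals below are ln(10)
+ // rounded to float and double precision respectively.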
+ inline float exp10(const float& x) noexcept
+ {
+ return std::exp(0x1.26bb1cp+1f * x);
+ }
+ inline double exp10(const double& x) noexcept
+ {
+ return std::exp(0x1.26bb1bbb55516p+1 * x);
+ }
+#endif
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline auto rsqrt(const T& x) noexcept -> decltype(std::sqrt(x))
+ {
+ using float_type = decltype(std::sqrt(x));
+ return static_cast<float_type>(1) / std::sqrt(x);
+ }
+
+ namespace detail
+ {
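+ // expm1(x + iy) = (e^x cos y - 1) + i e^x sin y; the real part is
+ // computed as expm1(x) - 2 e^x sin^2(y/2), using the identity
+ // cos y - 1 == -2 sin^2(y/2) to stay accurate near zero.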
+ template <class C>
+ inline C expm1_complex_scalar_impl(const C& val) noexcept
+ {
+ using T = typename C::value_type;
+ T isin = std::sin(val.imag());
+ T rem1 = std::expm1(val.real());
+ T re = rem1 + T(1.);
+ T si = std::sin(val.imag() * T(0.5));
+ return std::complex<T>(rem1 - T(2.) * re * si * si, re * isin);
+ }
+ }
+
+ template <class T>
+ inline std::complex<T> expm1(const std::complex<T>& val) noexcept
+ {
+ return detail::expm1_complex_scalar_impl(val);
+ }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <class T, bool i3ec>
+ inline xtl::xcomplex<T, T, i3ec> expm1(const xtl::xcomplex<T, T, i3ec>& val) noexcept
+ {
+ return detail::expm1_complex_scalar_impl(val);
+ }
+#endif
+
+ namespace detail
+ {
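+ // The classic log1p trick: with u = 1 + val, log(u) * val / (u - 1)
+ // compensates for the low-order bits of val that are rounded away
+ // when computing 1 + val.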
+ template <class C>
+ inline C log1p_complex_scalar_impl(const C& val) noexcept
+ {
+ using T = typename C::value_type;
+ C u = C(1.) + val;
+ return u == C(1.) ? val : (u.real() <= T(0.) ? log(u) : log(u) * val / (u - C(1.)));
+ }
+ }
+
+ template <class T>
+ inline std::complex<T> log1p(const std::complex<T>& val) noexcept
+ {
+ return detail::log1p_complex_scalar_impl(val);
+ }
+
+ template <class T>
+ inline std::complex<T> log2(const std::complex<T>& val) noexcept
+ {
+ return log(val) / std::log(T(2));
+ }
+
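+ // Saturated add: on overflow the result clamps to the type's extremum
+ // instead of wrapping around, e.g. sadd<int8_t>(120, 100) == 127 and
+ // sadd<uint8_t>(200, 100) == 255.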
+ template <typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline T sadd(const T& lhs, const T& rhs) noexcept
+ {
+ if (std::numeric_limits<T>::is_signed)
+ {
+ if ((lhs > 0) && (rhs > std::numeric_limits<T>::max() - lhs))
+ {
+ return std::numeric_limits<T>::max();
+ }
+ else if ((lhs < 0) && (rhs < std::numeric_limits<T>::lowest() - lhs))
+ {
+ return std::numeric_limits<T>::lowest();
+ }
+ else
+ {
+ return lhs + rhs;
+ }
+ }
+ else
+ {
+ if (rhs > std::numeric_limits<T>::max() - lhs)
+ {
+ return std::numeric_limits<T>::max();
+ }
+ else
+ {
+ return lhs + rhs;
+ }
+ }
+ }
+
+ template <typename T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline T ssub(const T& lhs, const T& rhs) noexcept
+ {
+ if (std::numeric_limits<T>::is_signed)
+ {
+ return sadd(lhs, (T)-rhs);
+ }
+ else
+ {
+ if (lhs < rhs)
+ {
+ return std::numeric_limits<T>::lowest();
+ }
+ else
+ {
+ return lhs - rhs;
+ }
+ }
+ }
+
+ namespace detail
+ {
+ template <class T>
+ struct value_type_or_type_helper
+ {
+ using type = T;
+ };
+ template <class T, class A>
+ struct value_type_or_type_helper<batch<T, A>>
+ {
+ using type = T;
+ };
+
+ template <class T>
+ using value_type_or_type = typename value_type_or_type_helper<T>::type;
+
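+ // Exponentiation by squaring: O(log n) multiplies, with negative
+ // exponents handled through the reciprocal of the positive power.
+ // E.g. ipow(3, 5) multiplies r by 3 (bit 0 of 5) and by 81 = 3^4
+ // (bit 2 of 5), yielding 243.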
+ template <class T0, class T1>
+ inline typename std::enable_if<std::is_integral<T1>::value, T0>::type
+ ipow(const T0& x, const T1& n) noexcept
+ {
+ static_assert(std::is_integral<T1>::value, "second argument must be an integer");
+ T0 a = x;
+ T1 b = n;
+ bool const recip = b < 0;
+ T0 r(static_cast<value_type_or_type<T0>>(1));
+ while (1)
+ {
+ if (b & 1)
+ {
+ r *= a;
+ }
+ b /= 2;
+ if (b == 0)
+ {
+ break;
+ }
+ a *= a;
+ }
+ return recip ? static_cast<T0>(1) / r : r;
+ }
+ }
+
+ template <class T0, class T1>
+ inline typename std::enable_if<std::is_integral<T1>::value, T0>::type
+ pow(const T0& x, const T1& n) noexcept
+ {
+ return detail::ipow(x, n);
+ }
+
+ template <class T0, class T1>
+ inline auto
+ pow(const T0& t0, const T1& t1) noexcept
+ -> typename std::enable_if<std::is_scalar<T0>::value && std::is_floating_point<T1>::value, decltype(std::pow(t0, t1))>::type
+ {
+ return std::pow(t0, t1);
+ }
+
+ template <class T0, class T1>
+ inline typename std::enable_if<std::is_integral<T1>::value, std::complex<T0>>::type
+ pow(const std::complex<T0>& t0, const T1& t1) noexcept
+ {
+ return detail::ipow(t0, t1);
+ }
+
+ template <class T0, class T1>
+ inline typename std::enable_if<!std::is_integral<T1>::value, std::complex<T0>>::type
+ pow(const std::complex<T0>& t0, const T1& t1) noexcept
+ {
+ return std::pow(t0, t1);
+ }
+
+ template <class T0, class T1>
+ inline auto
+ pow(const T0& t0, const std::complex<T1>& t1) noexcept
+ -> typename std::enable_if<std::is_scalar<T0>::value, decltype(std::pow(t0, t1))>::type
+ {
+ return std::pow(t0, t1);
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline bool bitofsign(T const& x) noexcept
+ {
+ return x < T(0);
+ }
+
+ template <class T>
+ inline auto signbit(T const& v) noexcept -> decltype(bitofsign(v))
+ {
+ return bitofsign(v);
+ }
+
+ inline double sign(bool const& v) noexcept
+ {
+ return v;
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline T sign(const T& v) noexcept
+ {
+ return v < T(0) ? T(-1.) : v == T(0) ? T(0.)
+ : T(1.);
+ }
+
+ namespace detail
+ {
+ template <class C>
+ inline C sign_complex_scalar_impl(const C& v) noexcept
+ {
+ using value_type = typename C::value_type;
+ if (v.real())
+ {
+ return C(sign(v.real()), value_type(0));
+ }
+ else
+ {
+ return C(sign(v.imag()), value_type(0));
+ }
+ }
+ }
+
+ template <class T>
+ inline std::complex<T> sign(const std::complex<T>& v) noexcept
+ {
+ return detail::sign_complex_scalar_impl(v);
+ }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <class T, bool i3ec>
+ inline xtl::xcomplex<T, T, i3ec> sign(const xtl::xcomplex<T, T, i3ec>& v) noexcept
+ {
+ return detail::sign_complex_scalar_impl(v);
+ }
+#endif
+
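+ // signnz: like sign, but zero maps to +1, so the result is never zero.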
+ inline double signnz(bool const&) noexcept
+ {
+ return 1;
+ }
+
+ template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type>
+ inline T signnz(const T& v) noexcept
+ {
+ return v < T(0) ? T(-1.) : T(1.);
+ }
+
+ template <class T, class Tp>
+ inline auto sub(T const& x, Tp const& y) noexcept -> decltype(x - y)
+ {
+ return x - y;
+ }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <class T, bool i3ec>
+ inline xtl::xcomplex<T, T, i3ec> log2(const xtl::xcomplex<T, T, i3ec>& val) noexcept
+ {
+ return log(val) / log(T(2));
+ }
+#endif
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <class T, bool i3ec>
+ inline xtl::xcomplex<T, T, i3ec> log1p(const xtl::xcomplex<T, T, i3ec>& val) noexcept
+ {
+ return detail::log1p_complex_scalar_impl(val);
+ }
+#endif
+
+ template <class T0, class T1>
+ inline auto min(T0 const& self, T1 const& other) noexcept
+ -> typename std::enable_if<std::is_scalar<T0>::value && std::is_scalar<T1>::value,
+ typename std::decay<decltype(self > other ? other : self)>::type>::type
+ {
+ return self > other ? other : self;
+ }
+
+ // numpy defines minimum operator on complex using lexical comparison
+ template <class T0, class T1>
+ inline std::complex<typename std::common_type<T0, T1>::type>
+ min(std::complex<T0> const& self, std::complex<T1> const& other) noexcept
+ {
+ return (self.real() < other.real()) ? (self) : (self.real() == other.real() ? (self.imag() < other.imag() ? self : other) : other);
+ }
+
+ template <class T0, class T1>
+ inline auto max(T0 const& self, T1 const& other) noexcept
+ -> typename std::enable_if<std::is_scalar<T0>::value && std::is_scalar<T1>::value,
+ typename std::decay<decltype(self > other ? other : self)>::type>::type
+ {
+ return self < other ? other : self;
+ }
+
+ // numpy defines maximum operator on complex using lexical comparison
+ template <class T0, class T1>
+ inline std::complex<typename std::common_type<T0, T1>::type>
+ max(std::complex<T0> const& self, std::complex<T1> const& other) noexcept
+ {
+ return (self.real() > other.real()) ? (self) : (self.real() == other.real() ? (self.imag() > other.imag() ? self : other) : other);
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, T>::type fma(const T& a, const T& b, const T& c) noexcept
+ {
+ return a * b + c;
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_floating_point<T>::value, T>::type fma(const T& a, const T& b, const T& c) noexcept
+ {
+ return std::fma(a, b, c);
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_scalar<T>::value, T>::type fms(const T& a, const T& b, const T& c) noexcept
+ {
+ return a * b - c;
+ }
+
+ namespace detail
+ {
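+ // (ar + i*ai) * (br + i*bi) + (cr + i*ci)
+ //   == (ar*br - ai*bi + cr) + i*(ar*bi + ai*br + ci),
+ // expressed with nested fma/fms so every multiply-add step rounds only once.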
+ template <class C>
+ inline C fma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+ {
+ return { fms(a.real(), b.real(), fms(a.imag(), b.imag(), c.real())),
+ fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag())) };
+ }
+ }
+
+ template <class T>
+ inline std::complex<T> fma(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+ {
+ return detail::fma_complex_scalar_impl(a, b, c);
+ }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <class T, bool i3ec>
+ inline xtl::xcomplex<T, T, i3ec> fma(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+ {
+ return detail::fma_complex_scalar_impl(a, b, c);
+ }
+#endif
+
+ namespace detail
+ {
+ template <class C>
+ inline C fms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+ {
+ return { fms(a.real(), b.real(), fma(a.imag(), b.imag(), c.real())),
+ fma(a.real(), b.imag(), fms(a.imag(), b.real(), c.imag())) };
+ }
+ }
+
+ template <class T>
+ inline std::complex<T> fms(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+ {
+ return detail::fms_complex_scalar_impl(a, b, c);
+ }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <class T, bool i3ec>
+ inline xtl::xcomplex<T, T, i3ec> fms(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+ {
+ return detail::fms_complex_scalar_impl(a, b, c);
+ }
+#endif
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, T>::type fnma(const T& a, const T& b, const T& c) noexcept
+ {
+ return -(a * b) + c;
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_floating_point<T>::value, T>::type fnma(const T& a, const T& b, const T& c) noexcept
+ {
+ return std::fma(-a, b, c);
+ }
+
+ namespace detail
+ {
+ template <class C>
+ inline C fnma_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+ {
+ return { fms(a.imag(), b.imag(), fms(a.real(), b.real(), c.real())),
+ -fma(a.real(), b.imag(), fms(a.imag(), b.real(), c.imag())) };
+ }
+ }
+
+ template <class T>
+ inline std::complex<T> fnma(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+ {
+ return detail::fnma_complex_scalar_impl(a, b, c);
+ }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <class T, bool i3ec>
+ inline xtl::xcomplex<T, T, i3ec> fnma(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+ {
+ return detail::fnma_complex_scalar_impl(a, b, c);
+ }
+#endif
+
+ template <class T>
+ inline typename std::enable_if<std::is_integral<T>::value, T>::type fnms(const T& a, const T& b, const T& c) noexcept
+ {
+ return -(a * b) - c;
+ }
+
+ template <class T>
+ inline typename std::enable_if<std::is_floating_point<T>::value, T>::type fnms(const T& a, const T& b, const T& c) noexcept
+ {
+ return -std::fma(a, b, c);
+ }
+
+ namespace detail
+ {
+ template <class C>
+ inline C fnms_complex_scalar_impl(const C& a, const C& b, const C& c) noexcept
+ {
+ return { fms(a.imag(), b.imag(), fma(a.real(), b.real(), c.real())),
+ -fma(a.real(), b.imag(), fma(a.imag(), b.real(), c.imag())) };
+ }
+ }
+
+ template <class T>
+ inline std::complex<T> fnms(const std::complex<T>& a, const std::complex<T>& b, const std::complex<T>& c) noexcept
+ {
+ return detail::fnms_complex_scalar_impl(a, b, c);
+ }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <class T, bool i3ec>
+ inline xtl::xcomplex<T, T, i3ec> fnms(const xtl::xcomplex<T, T, i3ec>& a, const xtl::xcomplex<T, T, i3ec>& b, const xtl::xcomplex<T, T, i3ec>& c) noexcept
+ {
+ return detail::fnms_complex_scalar_impl(a, b, c);
+ }
+#endif
+
+ namespace detail
+ {
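+ // SFINAE detection of a sincos-style function taking (T, T*, T*): if
+ // neither sincos/sincosf nor the __sincos/__sincosf builtins are
+ // visible, fall back to separate std::sin and std::cos calls.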
+#define XSIMD_HASSINCOS_TRAIT(func) \
+ template <class S> \
+ struct has##func \
+ { \
+ template <class T> \
+ static auto get(T* ptr) -> decltype(func(std::declval<T>(), std::declval<T*>(), std::declval<T*>()), std::true_type {}); \
+ static std::false_type get(...); \
+ static constexpr bool value = decltype(get((S*)nullptr))::value; \
+ }
+
+#define XSIMD_HASSINCOS(func, T) has##func<T>::value
+
+ XSIMD_HASSINCOS_TRAIT(sincos);
+ XSIMD_HASSINCOS_TRAIT(sincosf);
+ XSIMD_HASSINCOS_TRAIT(__sincos);
+ XSIMD_HASSINCOS_TRAIT(__sincosf);
+
+ struct generic_sincosf
+ {
+ template <class T>
+ typename std::enable_if<XSIMD_HASSINCOS(sincosf, T), void>::type
+ operator()(float val, T& s, T& c)
+ {
+ sincosf(val, &s, &c);
+ }
+
+ template <class T>
+ typename std::enable_if<!XSIMD_HASSINCOS(sincosf, T) && XSIMD_HASSINCOS(__sincosf, T), void>::type
+ operator()(float val, T& s, T& c)
+ {
+ __sincosf(val, &s, &c);
+ }
+
+ template <class T>
+ typename std::enable_if<!XSIMD_HASSINCOS(sincosf, T) && !XSIMD_HASSINCOS(__sincosf, T), void>::type
+ operator()(float val, T& s, T& c)
+ {
+ s = std::sin(val);
+ c = std::cos(val);
+ }
+ };
+
+ struct generic_sincos
+ {
+ template <class T>
+ typename std::enable_if<XSIMD_HASSINCOS(sincos, T), void>::type
+ operator()(double val, T& s, T& c)
+ {
+ sincos(val, &s, &c);
+ }
+
+ template <class T>
+ typename std::enable_if<!XSIMD_HASSINCOS(sincos, T) && XSIMD_HASSINCOS(__sincos, T), void>::type
+ operator()(double val, T& s, T& c)
+ {
+ __sincos(val, &s, &c);
+ }
+
+ template <class T>
+ typename std::enable_if<!XSIMD_HASSINCOS(sincos, T) && !XSIMD_HASSINCOS(__sincos, T), void>::type
+ operator()(double val, T& s, T& c)
+ {
+ s = std::sin(val);
+ c = std::cos(val);
+ }
+ };
+
+#undef XSIMD_HASSINCOS_TRAIT
+#undef XSIMD_HASSINCOS
+ }
+
+ inline std::pair<float, float> sincos(float val) noexcept
+ {
+ float s, c;
+ detail::generic_sincosf {}(val, s, c);
+ return std::make_pair(s, c);
+ }
+
+ inline std::pair<double, double> sincos(double val) noexcept
+ {
+ double s, c;
+ detail::generic_sincos {}(val, s, c);
+ return std::make_pair(s, c);
+ }
+
+ template <class T>
+ inline std::pair<std::complex<T>, std::complex<T>>
+ sincos(const std::complex<T>& val) noexcept
+ {
+ return std::make_pair(std::sin(val), std::cos(val));
+ }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+ template <class T>
+ inline std::pair<xtl::xcomplex<T>, xtl::xcomplex<T>> sincos(const xtl::xcomplex<T>& val) noexcept
+ {
+ return std::make_pair(sin(val), cos(val));
+ }
+#endif
+
+ template <class T, class _ = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+ inline T frexp(T const& val, int& exp) noexcept
+ {
+ return std::frexp(val, &exp);
+ }
+
+ template <class T>
+ inline T select(bool cond, T const& true_br, T const& false_br) noexcept
+ {
+ return cond ? true_br : false_br;
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp
new file mode 100644
index 0000000000..e4949523ca
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp
@@ -0,0 +1,1695 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE2_HPP
+#define XSIMD_SSE2_HPP
+
+#include <complex>
+#include <limits>
+#include <type_traits>
+
+#include "../types/xsimd_sse2_register.hpp"
+
+namespace xsimd
+{
+ template <class batch_type, bool... Values>
+ struct batch_bool_constant;
+
+ template <class T_out, class T_in, class A>
+ inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept;
+
+ template <class batch_type, typename batch_type::value_type... Values>
+ struct batch_constant;
+
+ namespace kernel
+ {
+ using namespace types;
+
+ namespace detail
+ {
+ constexpr uint32_t shuffle(uint32_t w, uint32_t x, uint32_t y, uint32_t z)
+ {
+ return (z << 6) | (y << 4) | (x << 2) | w;
+ }
+ constexpr uint32_t shuffle(uint32_t x, uint32_t y)
+ {
+ return (y << 1) | x;
+ }
+ }
+
+ // fwd
+ template <class A, class T, size_t I>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+
+ // abs
+ template <class A>
+ inline batch<double, A> abs(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ __m128d sign_mask = _mm_set1_pd(-0.); // -0. sets only the sign bit (bit 63)
+ return _mm_andnot_pd(sign_mask, self);
+ }
+ template <class A>
+ inline batch<float, A> abs(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f sets only the sign bit (bit 31)
+ return _mm_andnot_ps(sign_mask, self);
+ }
+
+ // add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_add_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_add_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_add_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_add_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ template <class A>
+ inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_add_ps(self, other);
+ }
+
+ template <class A>
+ inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_add_pd(self, other);
+ }
+
+ // all
+ template <class A>
+ inline bool all(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_ps(self) == 0x0F;
+ }
+ template <class A>
+ inline bool all(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_pd(self) == 0x03;
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline bool all(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_epi8(self) == 0xFFFF;
+ }
+
+ // any
+ template <class A>
+ inline bool any(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_ps(self) != 0;
+ }
+ template <class A>
+ inline bool any(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_pd(self) != 0;
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline bool any(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_epi8(self) != 0;
+ }
+
+ // batch_bool_cast
+ template <class A, class T_out, class T_in>
+ inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
+ {
+ return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
+ }
+
+ // bitwise_and
+ template <class A>
+ inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_and_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_and_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_and_si128(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_and_si128(self, other);
+ }
+
+ template <class A>
+ batch<double, A> inline bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_and_pd(self, other);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_and_pd(self, other);
+ }
+
+ // bitwise_andnot
+ template <class A>
+ inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_andnot_ps(other, self);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_andnot_ps(other, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_andnot_si128(other, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_andnot_si128(other, self);
+ }
+
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_andnot_pd(other, self);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_andnot_pd(other, self);
+ }
+
+ // bitwise_lshift
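+ // SSE2 has no 8-bit shifts: for 1-byte lanes, shift the 32-bit lanes
+ // and mask off the bits that crossed a byte boundary.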
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_and_si128(_mm_set1_epi8(0xFF << other), _mm_slli_epi32(self, other));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_slli_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_slli_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_slli_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ // bitwise_not
+ template <class A>
+ inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(-1)));
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_si128(self, _mm_set1_epi32(-1));
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_si128(self, _mm_set1_epi32(-1));
+ }
+ template <class A>
+ inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_pd(self, _mm_castsi128_pd(_mm_set1_epi32(-1)));
+ }
+
+ // bitwise_or
+ template <class A>
+ inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_si128(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_si128(self, other);
+ }
+
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_pd(self, other);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_pd(self, other);
+ }
+
+ // bitwise_rshift
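+ // SSE2 only provides arithmetic right shifts for 16- and 32-bit lanes;
+ // the 8-bit and signed 64-bit cases are synthesized from wider shifts
+ // plus sign-mask fixups.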
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ __m128i sign_mask = _mm_set1_epi16((0xFF00 >> other) & 0x00FF);
+ __m128i cmp_is_negative = _mm_cmpgt_epi8(_mm_setzero_si128(), self);
+ __m128i res = _mm_srai_epi16(self, other);
+ return _mm_or_si128(_mm_and_si128(sign_mask, cmp_is_negative), _mm_andnot_si128(sign_mask, res));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_srai_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_srai_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ // from https://github.com/samyvilar/vect/blob/master/vect_128.h
+ return _mm_or_si128(
+ _mm_srli_epi64(self, other),
+ _mm_slli_epi64(
+ _mm_srai_epi32(_mm_shuffle_epi32(self, _MM_SHUFFLE(3, 3, 1, 1)), 32),
+ 64 - other));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_and_si128(_mm_set1_epi8(0xFF >> other), _mm_srli_epi32(self, other));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_srli_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_srli_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_srli_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ }
+
+ // bitwise_xor
+ template <class A>
+ inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_si128(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_pd(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_si128(self, other);
+ }
+
+ // bitwise_cast
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
+ {
+ return _mm_castsi128_ps(self);
+ }
+ template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
+ inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<sse2>) noexcept
+ {
+ return batch<Tp, A>(self.data);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
+ {
+ return _mm_castps_si128(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
+ {
+ return _mm_castsi128_pd(self);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<sse2>) noexcept
+ {
+ return _mm_castps_pd(self);
+ }
+ template <class A>
+ inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
+ {
+ return _mm_castpd_ps(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<sse2>) noexcept
+ {
+ return _mm_castpd_si128(self);
+ }
+
+ // broadcast
+ template <class A>
+ batch<float, A> inline broadcast(float val, requires_arch<sse2>) noexcept
+ {
+ return _mm_set1_ps(val);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> broadcast(T val, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_set1_epi8(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_set1_epi16(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_set1_epi32(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_set1_epi64x(val);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<double, A> broadcast(double val, requires_arch<sse2>) noexcept
+ {
+ return _mm_set1_pd(val);
+ }
+
+ // store_complex
+ namespace detail
+ {
+ // Override these helpers in SSE-based archs; store_aligned / store_unaligned do not need overriding
+ // complex_low
+ template <class A>
+ inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpacklo_ps(self.real(), self.imag());
+ }
+ // complex_high
+ template <class A>
+ inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpackhi_ps(self.real(), self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpacklo_pd(self.real(), self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpackhi_pd(self.real(), self.imag());
+ }
+ }
+
+ // div
+ template <class A>
+ inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_div_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_div_pd(self, other);
+ }
+
+ // fast_cast
+ namespace detail
+ {
+ template <class A>
+ inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<sse2>) noexcept
+ {
+ return _mm_cvtepi32_ps(self);
+ }
+
+ template <class A>
+ inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<sse2>) noexcept
+ {
+ // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+ __m128i msk_lo = _mm_set1_epi32(0xFFFF);
+ __m128 cnst65536f = _mm_set1_ps(65536.0f);
+
+ __m128i v_lo = _mm_and_si128(v, msk_lo); /* extract the 16 least significant bits of v */
+ __m128i v_hi = _mm_srli_epi32(v, 16); /* 16 most significant bits of v */
+ __m128 v_lo_flt = _mm_cvtepi32_ps(v_lo); /* No rounding */
+ __m128 v_hi_flt = _mm_cvtepi32_ps(v_hi); /* No rounding */
+ v_hi_flt = _mm_mul_ps(cnst65536f, v_hi_flt); /* No rounding */
+ return _mm_add_ps(v_hi_flt, v_lo_flt); /* Rounding may occur here, mul and add may fuse to fma for haswell and newer */
+ }
+
+ template <class A>
+ inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
+ {
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+ // adapted to sse2
+ __m128i xH = _mm_srli_epi64(x, 32);
+ xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
+ __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000);
+ __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
+ __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
+ return _mm_add_pd(f, _mm_castsi128_pd(xL));
+ }
+
+ template <class A>
+ inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse2>) noexcept
+ {
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+ // adapted to sse2
+ __m128i xH = _mm_srai_epi32(x, 16);
+ xH = _mm_and_si128(xH, _mm_setr_epi16(0x0000, 0x0000, 0xFFFF, 0xFFFF, 0x0000, 0x0000, 0xFFFF, 0xFFFF));
+ xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
+ __m128i mask = _mm_setr_epi16(0xFFFF, 0xFFFF, 0xFFFF, 0x0000, 0xFFFF, 0xFFFF, 0xFFFF, 0x0000);
+ __m128i xL = _mm_or_si128(_mm_and_si128(mask, x), _mm_andnot_si128(mask, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)))); // 2^52
+ __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
+ return _mm_add_pd(f, _mm_castsi128_pd(xL));
+ }
+
+ template <class A>
+ inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<sse2>) noexcept
+ {
+ return _mm_cvttps_epi32(self);
+ }
+
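+ // cvttps only covers the signed int32 range: values >= 2^31 are shifted
+ // down by 2^31 before conversion, the high bit is restored with a xor,
+ // and the two candidates are blended on the >= 2^31 mask.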
+ template <class A>
+ inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse2>) noexcept
+ {
+ __m128 mask = _mm_cmpge_ps(self, _mm_set1_ps(1u << 31));
+ __m128 lhs = _mm_castsi128_ps(_mm_cvttps_epi32(self));
+ __m128 rhs = _mm_castsi128_ps(_mm_xor_si128(
+ _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
+ _mm_set1_epi32(1u << 31)));
+ return _mm_castps_si128(_mm_or_ps(_mm_and_ps(mask, rhs), _mm_andnot_ps(mask, lhs)));
+ }
+
+ }
+
+ // eq
+ template <class A>
+ inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpeq_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_castsi128_ps(_mm_cmpeq_epi32(_mm_castps_si128(self), _mm_castps_si128(other)));
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_cmpeq_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_cmpeq_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_cmpeq_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
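+ // no 64-bit compare in SSE2: compare the 32-bit lanes, AND with the
+ // pair-swapped result so both halves must match, then broadcast each
+ // upper lane's sign across its 64-bit lane.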
+ __m128i tmp1 = _mm_cmpeq_epi32(self, other);
+ __m128i tmp2 = _mm_shuffle_epi32(tmp1, 0xB1);
+ __m128i tmp3 = _mm_and_si128(tmp1, tmp2);
+ __m128i tmp4 = _mm_srai_epi32(tmp3, 31);
+ return _mm_shuffle_epi32(tmp4, 0xF5);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return ~(self != other);
+ }
+ template <class A>
+ inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpeq_pd(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_castsi128_pd(_mm_cmpeq_epi32(_mm_castpd_si128(self), _mm_castpd_si128(other)));
+ }
+
+ // from_mask
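+ // Turn a bit mask into a batch_bool by table lookup: entry i of each
+ // LUT is the lane pattern whose all-ones lanes match the set bits of i.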
+ template <class A>
+ inline batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+ {
+ alignas(A::alignment()) static const uint32_t lut[][4] = {
+ { 0x00000000, 0x00000000, 0x00000000, 0x00000000 },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0x00000000 },
+ { 0x00000000, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0x00000000 },
+ { 0x00000000, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+ { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000 },
+ { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000 },
+ { 0x00000000, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0x00000000, 0xFFFFFFFF },
+ { 0x00000000, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF },
+ { 0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ { 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF },
+ };
+ assert(!(mask & ~0xFul) && "inbound mask");
+ return _mm_castsi128_ps(_mm_load_si128((const __m128i*)lut[mask]));
+ }
+ template <class A>
+ inline batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut[][2] = {
+ { 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ };
+ assert(!(mask & ~0x3ul) && "inbound mask");
+ return _mm_castsi128_pd(_mm_load_si128((const __m128i*)lut[mask]));
+ }
+ template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<sse2>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut64[] = {
+ 0x0000000000000000,
+ 0x000000000000FFFF,
+ 0x00000000FFFF0000,
+ 0x00000000FFFFFFFF,
+ 0x0000FFFF00000000,
+ 0x0000FFFF0000FFFF,
+ 0x0000FFFFFFFF0000,
+ 0x0000FFFFFFFFFFFF,
+ 0xFFFF000000000000,
+ 0xFFFF00000000FFFF,
+ 0xFFFF0000FFFF0000,
+ 0xFFFF0000FFFFFFFF,
+ 0xFFFFFFFF00000000,
+ 0xFFFFFFFF0000FFFF,
+ 0xFFFFFFFFFFFF0000,
+ 0xFFFFFFFFFFFFFFFF,
+ };
+ alignas(A::alignment()) static const uint32_t lut32[] = {
+ 0x00000000,
+ 0x000000FF,
+ 0x0000FF00,
+ 0x0000FFFF,
+ 0x00FF0000,
+ 0x00FF00FF,
+ 0x00FFFF00,
+ 0x00FFFFFF,
+ 0xFF000000,
+ 0xFF0000FF,
+ 0xFF00FF00,
+ 0xFF00FFFF,
+ 0xFFFF0000,
+ 0xFFFF00FF,
+ 0xFFFFFF00,
+ 0xFFFFFFFF,
+ };
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ assert(!(mask & ~0xFFFF) && "inbound mask");
+ return _mm_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF], lut32[(mask >> 8) & 0xF], lut32[mask >> 12]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ assert(!(mask & ~0xFF) && "inbound mask");
+ return _mm_set_epi64x(lut64[mask >> 4], lut64[mask & 0xF]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_castps_si128(from_mask(batch_bool<float, A> {}, mask, sse2 {}));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_castpd_si128(from_mask(batch_bool<double, A> {}, mask, sse2 {}));
+ }
+ }
+
+ // ge
+ template <class A>
+ inline batch_bool<float, A> ge(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpge_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> ge(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpge_pd(self, other);
+ }
+
+ // gt
+ template <class A>
+ inline batch_bool<float, A> gt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpgt_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_cmpgt_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_cmpgt_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_cmpgt_epi32(self, other);
+ }
+ else
+ {
+ return gt(self, other, generic {});
+ }
+ }
+ else
+ {
+ return gt(self, other, generic {});
+ }
+ }
+
+ template <class A>
+ inline batch_bool<double, A> gt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpgt_pd(self, other);
+ }
+
+ // haddp
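+ // Transpose-and-add: after the unpack/movelh shuffles, output lane i
+ // holds the horizontal sum of row[i].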
+ template <class A>
+ inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse2>) noexcept
+ {
+ __m128 tmp0 = _mm_unpacklo_ps(row[0], row[1]);
+ __m128 tmp1 = _mm_unpackhi_ps(row[0], row[1]);
+ __m128 tmp2 = _mm_unpackhi_ps(row[2], row[3]);
+ tmp0 = _mm_add_ps(tmp0, tmp1);
+ tmp1 = _mm_unpacklo_ps(row[2], row[3]);
+ tmp1 = _mm_add_ps(tmp1, tmp2);
+ tmp2 = _mm_movehl_ps(tmp1, tmp0);
+ tmp0 = _mm_movelh_ps(tmp0, tmp1);
+ return _mm_add_ps(tmp0, tmp2);
+ }
+ template <class A>
+ inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse2>) noexcept
+ {
+ return _mm_add_pd(_mm_unpacklo_pd(row[0], row[1]),
+ _mm_unpackhi_pd(row[0], row[1]));
+ }
+
+ // insert
+ template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_insert_epi16(self, val, I);
+ }
+ else
+ {
+ return insert(self, val, pos, generic {});
+ }
+ }
+
+ // isnan
+ template <class A>
+ inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpunord_ps(self, self);
+ }
+ template <class A>
+ inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpunord_pd(self, self);
+ }
+
+ // load_aligned
+ template <class A>
+ inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
+ {
+ return _mm_load_ps(mem);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
+ {
+ return _mm_load_si128((__m128i const*)mem);
+ }
+ template <class A>
+ inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
+ {
+ return _mm_load_pd(mem);
+ }
+
+ // load_unaligned
+ template <class A>
+ inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<sse2>) noexcept
+ {
+ return _mm_loadu_ps(mem);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse2>) noexcept
+ {
+ return _mm_loadu_si128((__m128i const*)mem);
+ }
+ template <class A>
+ inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<sse2>) noexcept
+ {
+ return _mm_loadu_pd(mem);
+ }
+
+ // load_complex
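+ // Deinterleave the (real, imag) pairs held in two loaded registers into
+ // separate real and imaginary batches.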
+ namespace detail
+ {
+ // Redefine these methods in the SSE-based archs if required
+ template <class A>
+ inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<sse2>) noexcept
+ {
+ return { _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)), _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)) };
+ }
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<sse2>) noexcept
+ {
+ return { _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(0, 0)), _mm_shuffle_pd(hi, lo, _MM_SHUFFLE2(1, 1)) };
+ }
+ }
+
+ // le
+ template <class A>
+ inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmple_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmple_pd(self, other);
+ }
+
+ // lt
+ template <class A>
+ inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmplt_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_cmplt_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_cmplt_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_cmplt_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
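+ // 64-bit signed compare from 32-bit pieces: when the signs agree, the
+ // sign of (self - other) is exact (kept by andnot(tmp2, tmp1)); when
+ // they differ, self < other iff self is negative (tmp3).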
+ __m128i tmp1 = _mm_sub_epi64(self, other);
+ __m128i tmp2 = _mm_xor_si128(self, other);
+ __m128i tmp3 = _mm_andnot_si128(other, self);
+ __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
+ __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
+ __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
+ return _mm_shuffle_epi32(tmp6, 0xF5);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_cmplt_epi8(_mm_xor_si128(self, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi8(std::numeric_limits<int8_t>::lowest())));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_cmplt_epi16(_mm_xor_si128(self, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi16(std::numeric_limits<int16_t>::lowest())));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_cmplt_epi32(_mm_xor_si128(self, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())), _mm_xor_si128(other, _mm_set1_epi32(std::numeric_limits<int32_t>::lowest())));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
+ auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
+ __m128i tmp1 = _mm_sub_epi64(xself, xother);
+ __m128i tmp2 = _mm_xor_si128(xself, xother);
+ __m128i tmp3 = _mm_andnot_si128(xother, xself);
+ __m128i tmp4 = _mm_andnot_si128(tmp2, tmp1);
+ __m128i tmp5 = _mm_or_si128(tmp3, tmp4);
+ __m128i tmp6 = _mm_srai_epi32(tmp5, 31);
+ return _mm_shuffle_epi32(tmp6, 0xF5);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ }
+
+ template <class A>
+ inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmplt_pd(self, other);
+ }
+
+ /* Compression table: keeps every other bit of a byte-level movemask,
+ * turning 0b10 into 0b1, 0b100010 into 0b101, etc.
+ */
+ namespace detail
+ {
+ inline int mask_lut(int mask)
+ {
+ // clang-format off
+ static const int mask_lut[256] = {
+ 0x0, 0x0, 0x1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x2, 0x0, 0x3, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x4, 0x0, 0x5, 0x0, 0x0, 0x0, 0x0, 0x0, 0x6, 0x0, 0x7, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x8, 0x0, 0x9, 0x0, 0x0, 0x0, 0x0, 0x0, 0xA, 0x0, 0xB, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0xC, 0x0, 0xD, 0x0, 0x0, 0x0, 0x0, 0x0, 0xE, 0x0, 0xF, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0,
+ };
+ // clang-format on
+ return mask_lut[mask & 0xAA];
+ }
+ }
+
+ // mask
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_movemask_epi8(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ uint64_t mask8 = _mm_movemask_epi8(self);
+ return detail::mask_lut(mask8) | (detail::mask_lut(mask8 >> 8) << 4);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_movemask_ps(_mm_castsi128_ps(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_movemask_pd(_mm_castsi128_pd(self));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline uint64_t mask(batch_bool<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_ps(self);
+ }
+
+ template <class A>
+ inline uint64_t mask(batch_bool<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_movemask_pd(self);
+ }
+
+ // max
+ template <class A>
+ inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_max_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return select(self > other, self, other);
+ }
+ template <class A>
+ inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_max_pd(self, other);
+ }
+
+ // min
+ template <class A>
+ inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_min_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return select(self <= other, self, other);
+ }
+ template <class A>
+ inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_min_pd(self, other);
+ }
+
+ // mul
+ template <class A>
+ inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_mul_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_mul_pd(self, other);
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<sse2>) noexcept
+ {
+ return _mm_cvtps_epi32(self);
+ }
+
+ // neg
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> neg(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return 0 - self;
+ }
+ template <class A>
+ inline batch<float, A> neg(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_ps(self, _mm_castsi128_ps(_mm_set1_epi32(0x80000000)));
+ }
+
+ template <class A>
+ inline batch<double, A> neg(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_xor_pd(
+ self, _mm_castsi128_pd(_mm_setr_epi32(0, 0x80000000, 0, 0x80000000)));
+ }
+
+ // neq
+ template <class A>
+ inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpneq_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return ~(self == other);
+ }
+ template <class A>
+ inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpneq_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_castps_si128(_mm_xor_ps(_mm_castsi128_ps(self.data), _mm_castsi128_ps(other.data)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpneq_pd(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_cmpneq_pd(self, other);
+ }
+
+ // reciprocal
+ template <class A>
+ inline batch<float, A> reciprocal(batch<float, A> const& self,
+ kernel::requires_arch<sse2>)
+ {
+ return _mm_rcp_ps(self);
+ }
+
+ // reduce_add
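+ // Horizontal sum: fold the upper half onto the lower half, then fold
+ // lane 1 onto lane 0 and extract the scalar.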
+ template <class A>
+ inline float reduce_add(batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ __m128 tmp0 = _mm_add_ps(self, _mm_movehl_ps(self, self));
+ __m128 tmp1 = _mm_add_ss(tmp0, _mm_shuffle_ps(tmp0, tmp0, 1));
+ return _mm_cvtss_f32(tmp1);
+ }
+
+ // reduce_max
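+ // Tree reduction for 8/16-bit lanes: max-combine the two 64-bit halves,
+ // then the 32-bit halves, then the 16-bit halves; 8-bit lanes need one
+ // extra byte-shift round.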
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ inline T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
+ batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
+ batch<T, A> acc0 = max(self, step0);
+
+ constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
+ batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
+ batch<T, A> acc1 = max(acc0, step1);
+
+ constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
+ batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
+ batch<T, A> acc2 = max(acc1, step2);
+ if (sizeof(T) == 2)
+ return acc2.get(0);
+ batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+ batch<T, A> acc3 = max(acc2, step3);
+ return acc3.get(0);
+ }
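+ // Illustrative note (editorial sketch): the shuffle/max ladder above is a
+ // log2(N) tree reduction, folding the upper half onto the lower half
+ // until one lane remains:
+ //
+ //     for (size_t w = N / 2; w > 0; w /= 2)
+ //         for (size_t i = 0; i < w; ++i)
+ //             v[i] = v[i] < v[i + w] ? v[i + w] : v[i];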
+
+ // reduce_min
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ inline T reduce_min(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ constexpr auto mask0 = detail::shuffle(2, 3, 0, 0);
+ batch<T, A> step0 = _mm_shuffle_epi32(self, mask0);
+ batch<T, A> acc0 = min(self, step0);
+
+ constexpr auto mask1 = detail::shuffle(1, 0, 0, 0);
+ batch<T, A> step1 = _mm_shuffle_epi32(acc0, mask1);
+ batch<T, A> acc1 = min(acc0, step1);
+
+ constexpr auto mask2 = detail::shuffle(1, 0, 0, 0);
+ batch<T, A> step2 = _mm_shufflelo_epi16(acc1, mask2);
+ batch<T, A> acc2 = min(acc1, step2);
+ if (sizeof(T) == 2)
+ return acc2.get(0);
+ batch<T, A> step3 = bitwise_cast<T>(bitwise_cast<uint16_t>(acc2) >> 8);
+ batch<T, A> acc3 = min(acc2, step3);
+ return acc3.get(0);
+ }
+ // TODO: move this into xsimd_generic
+ namespace detail
+ {
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline T hadd_default(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ alignas(A::alignment()) T buffer[batch<T, A>::size];
+ self.store_aligned(buffer);
+ T res = 0;
+ for (T val : buffer)
+ {
+ res += val;
+ }
+ return res;
+ }
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
+ __m128i tmp2 = _mm_add_epi32(self, tmp1);
+ __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01);
+ __m128i tmp4 = _mm_add_epi32(tmp2, tmp3);
+ return _mm_cvtsi128_si32(tmp4);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E);
+ __m128i tmp2 = _mm_add_epi64(self, tmp1);
+#if defined(__x86_64__)
+ return _mm_cvtsi128_si64(tmp2);
+#else
+ __m128i m;
+ _mm_storel_epi64(&m, tmp2);
+ int64_t i;
+ std::memcpy(&i, &m, sizeof(i));
+ return i;
+#endif
+ }
+ else
+ {
+ return detail::hadd_default(self, A {});
+ }
+ }
+ template <class A>
+ inline double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self)));
+ }
+
+ // rsqrt
+ template <class A>
+ inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
+ {
+ return _mm_rsqrt_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
+ {
+ return _mm_cvtps_pd(_mm_rsqrt_ps(_mm_cvtpd_ps(val)));
+ }
+
+ // select
+ template <class A>
+ inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_ps(_mm_and_ps(cond, true_br), _mm_andnot_ps(cond, false_br));
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br));
+ }
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept
+ {
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {});
+ }
+ template <class A>
+ inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse2>) noexcept
+ {
+ return _mm_or_pd(_mm_and_pd(cond, true_br), _mm_andnot_pd(cond, false_br));
+ }
+
+ // sqrt
+ template <class A>
+ inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept
+ {
+ return _mm_sqrt_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<sse2>) noexcept
+ {
+ return _mm_sqrt_pd(val);
+ }
+
+ // slide_left
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<sse2>) noexcept
+ {
+ return _mm_slli_si128(x, N);
+ }
+
+ // slide_right
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<sse2>) noexcept
+ {
+ return _mm_srli_si128(x, N);
+ }
+
+ // sadd
+
+ // TODO: move this into xsimd_generic
+ namespace detail
+ {
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sadd_default(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ auto mask = (other >> (8 * sizeof(T) - 1));
+ auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
+ auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
+ return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
+ }
+ else
+ {
+ const auto diffmax = std::numeric_limits<T>::max() - self;
+ const auto mindiff = min(diffmax, other);
+ return self + mindiff;
+ }
+ }
+ }
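+ // Illustrative scalar model (editorial sketch, assumes <algorithm> and
+ // <limits>) of the branchless saturation above: clamp self so the final
+ // add cannot overflow, picking the clamp side from the sign of other:
+ //
+ //     inline int32_t sadd_model(int32_t self, int32_t other) noexcept
+ //     {
+ //         if (other >= 0) // self may not exceed max() - other
+ //             return other + std::min(std::numeric_limits<int32_t>::max() - other, self);
+ //         else            // self may not go below min() - other
+ //             return other + std::max(std::numeric_limits<int32_t>::min() - other, self);
+ //     }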
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_adds_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_adds_epi16(self, other);
+ }
+ else
+ {
+ return detail::sadd_default(self, other, A {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_adds_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_adds_epu16(self, other);
+ }
+ else
+ {
+ return detail::sadd_default(self, other, A {});
+ }
+ }
+ }
+
+ // set
+ template <class A, class... Values>
+ inline batch<float, A> set(batch<float, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
+ return _mm_setr_ps(values...);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1) noexcept
+ {
+ return _mm_set_epi64x(v1, v0);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3) noexcept
+ {
+ return _mm_setr_epi32(v0, v1, v2, v3);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ {
+ return _mm_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<sse2>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ {
+ return _mm_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+
+ template <class A, class... Values>
+ inline batch<double, A> set(batch<double, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
+ return _mm_setr_pd(values...);
+ }
+
+ template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
+ }
+
+ template <class A, class... Values>
+ inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
+ return _mm_castsi128_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ template <class A, class... Values>
+ inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<sse2>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
+ return _mm_castsi128_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
+ }
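+ // Illustrative note: an sse2 batch_bool lane is a full-width bit mask,
+ // all ones for true and all zeros for false (hence -1LL / 0LL above);
+ // this is what the cmp* intrinsics produce and what select's and/andnot
+ // blending consumes:
+ //
+ //     // batch_bool<int32_t, sse2>(true, false, true, false) holds
+ //     // {0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000}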
+
+ // ssub
+ // TODO: move this into xsimd_generic
+ namespace detail
+ {
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> ssub_default(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ return sadd(self, -other);
+ }
+ else
+ {
+ const auto diff = min(self, other);
+ return self - diff;
+ }
+ }
+ }
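+ // Illustrative note (editorial sketch): for unsigned lanes,
+ // self - min(self, other) is the saturating difference, i.e. self - other
+ // when self >= other and 0 otherwise:
+ //
+ //     inline uint32_t ssub_model(uint32_t self, uint32_t other) noexcept
+ //     {
+ //         return self - (self < other ? self : other); // clamps at 0
+ //     }
+ //
+ // The signed branch reuses sadd on -other; negating the most negative
+ // value wraps, an edge case the 8/16-bit subs intrinsics below avoid.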
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_subs_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_subs_epi16(self, other);
+ }
+ else
+ {
+ return detail::ssub_default(self, other, A {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_subs_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_subs_epu16(self, other);
+ }
+ else
+ {
+ return detail::ssub_default(self, other, A {});
+ }
+ }
+ }
+
+ // store_aligned
+ template <class A>
+ inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_store_ps(mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_store_si128((__m128i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_store_si128((__m128i*)mem, self);
+ }
+ template <class A>
+ inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_store_pd(mem, self);
+ }
+
+ // store_unaligned
+ template <class A>
+ inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_storeu_ps(mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_storeu_si128((__m128i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_storeu_si128((__m128i*)mem, self);
+ }
+ template <class A>
+ inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<sse2>) noexcept
+ {
+ return _mm_storeu_pd(mem, self);
+ }
+
+ // sub
+ template <class A>
+ inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_sub_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_sub_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_sub_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_sub_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_sub_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_sub_pd(self, other);
+ }
+
+ // swizzle
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
+ return _mm_shuffle_ps(self, self, index);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t index = detail::shuffle(V0, V1);
+ return _mm_shuffle_pd(self, self, index);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1);
+ return _mm_shuffle_epi32(self, index);
+ }
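+ // Illustrative note: SSE2 has no 64-bit shuffle, so each 64-bit lane
+ // index V expands to the 32-bit lane pair {2V, 2V + 1}; e.g. a usage
+ // sketch reversing a batch<uint64_t, sse2> b:
+ //
+ //     // indices (1, 0) become the epi32 shuffle (2, 3, 0, 1)
+ //     auto r = swizzle(b, batch_constant<batch<uint64_t, sse2>, 1, 0> {});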
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<sse2>) noexcept
+ {
+ return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {}));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept
+ {
+ constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3);
+ return _mm_shuffle_epi32(self, index);
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept
+ {
+ return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {}));
+ }
+
+ // zip_hi
+ template <class A>
+ inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpackhi_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_unpackhi_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_unpackhi_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_unpackhi_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_unpackhi_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpackhi_pd(self, other);
+ }
+
+ // zip_lo
+ template <class A>
+ inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpacklo_ps(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_unpacklo_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_unpacklo_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_unpacklo_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_unpacklo_epi64(self, other);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<sse2>) noexcept
+ {
+ return _mm_unpacklo_pd(self, other);
+ }
+ }
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp
new file mode 100644
index 0000000000..ccc049795c
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp
@@ -0,0 +1,64 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE3_HPP
+#define XSIMD_SSE3_HPP
+
+#include "../types/xsimd_sse3_register.hpp"
+#include <type_traits>
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // haddp
+ template <class A>
+ inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
+ {
+ return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
+ _mm_hadd_ps(row[2], row[3]));
+ }
+ template <class A>
+ inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
+ {
+ return _mm_hadd_pd(row[0], row[1]);
+ }
+
+ // load_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
+ {
+ return _mm_lddqu_si128((__m128i const*)mem);
+ }
+
+ // reduce_add
+ template <class A>
+ inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
+ {
+ __m128 tmp0 = _mm_hadd_ps(self, self);
+ __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
+ return _mm_cvtss_f32(tmp1);
+ }
+ template <class A>
+ inline double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
+ {
+ __m128d tmp0 = _mm_hadd_pd(self, self);
+ return _mm_cvtsd_f64(tmp0);
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp
new file mode 100644
index 0000000000..c0e2878ef9
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp
@@ -0,0 +1,350 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE4_1_HPP
+#define XSIMD_SSE4_1_HPP
+
+#include <cassert>
+#include <cstring>
+#include <type_traits>
+
+#include "../types/xsimd_sse4_1_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+ // any
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline bool any(batch<T, A> const& self, requires_arch<sse4_1>) noexcept
+ {
+ return !_mm_testz_si128(self, self);
+ }
+ // ceil
+ template <class A>
+ inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_ceil_ps(self);
+ }
+ template <class A>
+ inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_ceil_pd(self);
+ }
+
+ // fast_cast
+ namespace detail
+ {
+ template <class A>
+ inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
+ {
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+ __m128i xH = _mm_srai_epi32(x, 16);
+ xH = _mm_blend_epi16(xH, _mm_setzero_si128(), 0x33);
+ xH = _mm_add_epi64(xH, _mm_castpd_si128(_mm_set1_pd(442721857769029238784.))); // 3*2^67
+ __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0x88); // 2^52
+ __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(442726361368656609280.)); // 3*2^67 + 2^52
+ return _mm_add_pd(f, _mm_castsi128_pd(xL));
+ }
+
+ template <class A>
+ inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<sse4_1>) noexcept
+ {
+ // from https://stackoverflow.com/questions/41144668/how-to-efficiently-perform-double-int64-conversions-with-sse-avx
+ __m128i xH = _mm_srli_epi64(x, 32);
+ xH = _mm_or_si128(xH, _mm_castpd_si128(_mm_set1_pd(19342813113834066795298816.))); // 2^84
+ __m128i xL = _mm_blend_epi16(x, _mm_castpd_si128(_mm_set1_pd(0x0010000000000000)), 0xcc); // 2^52
+ __m128d f = _mm_sub_pd(_mm_castsi128_pd(xH), _mm_set1_pd(19342813118337666422669312.)); // 2^84 + 2^52
+ return _mm_add_pd(f, _mm_castsi128_pd(xL));
+ }
+
+ template <class A>
+ inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_castps_si128(
+ _mm_blendv_ps(_mm_castsi128_ps(_mm_cvttps_epi32(self)),
+ _mm_castsi128_ps(_mm_xor_si128(
+ _mm_cvttps_epi32(_mm_sub_ps(self, _mm_set1_ps(1u << 31))),
+ _mm_set1_epi32(1u << 31))),
+ _mm_cmpge_ps(self, _mm_set1_ps(1u << 31))));
+ }
+ }
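+ // Illustrative scalar model (editorial sketch, C++17 hex-float literals)
+ // of the uint64 -> double trick above: 2^84 and 2^52 act as exponent
+ // anchors, so or-ing raw integer bits into their mantissas yields
+ // hi * 2^32 + 2^84 and lo + 2^52 exactly; subtracting the combined anchor
+ // and re-adding the two halves rounds only once:
+ //
+ //     inline double u64_to_f64_model(uint64_t x) noexcept
+ //     {
+ //         auto as_f64 = [](uint64_t b) { double d; std::memcpy(&d, &b, 8); return d; };
+ //         auto as_u64 = [](double d) { uint64_t b; std::memcpy(&b, &d, 8); return b; };
+ //         double hi = as_f64((x >> 32) | as_u64(0x1.0p84));
+ //         double lo = as_f64((x & 0xFFFFFFFFu) | as_u64(0x1.0p52));
+ //         return (hi - (0x1.0p84 + 0x1.0p52)) + lo;
+ //     }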
+
+ // eq
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_cmpeq_epi64(self, other);
+ }
+ else
+ {
+ return eq(self, other, ssse3 {});
+ }
+ }
+
+ // floor
+ template <class A>
+ inline batch<float, A> floor(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_floor_ps(self);
+ }
+ template <class A>
+ inline batch<double, A> floor(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_floor_pd(self);
+ }
+
+ // insert
+ template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<sse4_1>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_insert_epi8(self, val, I);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_insert_epi32(self, val, I);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+#if (!defined(_MSC_VER) && __x86_64__) || (_MSC_VER > 1900 && defined(_M_X64))
+ return _mm_insert_epi64(self, val, I);
+#else
+ uint32_t lo, hi;
+ memcpy(&lo, (reinterpret_cast<uint32_t*>(&val)), sizeof(lo));
+ memcpy(&hi, (reinterpret_cast<uint32_t*>(&val)) + 1, sizeof(hi));
+ return _mm_insert_epi32(_mm_insert_epi32(self, lo, 2 * I), hi, 2 * I + 1);
+#endif
+ }
+ else
+ {
+ return insert(self, val, pos, ssse3 {});
+ }
+ }
+
+ // max
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_max_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_max_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_max_epi32(self, other);
+ }
+ else
+ {
+ return max(self, other, ssse3 {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_max_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_max_epu16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_max_epu32(self, other);
+ }
+ else
+ {
+ return max(self, other, ssse3 {});
+ }
+ }
+ }
+
+ // min
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_min_epi8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_min_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_min_epi32(self, other);
+ }
+ else
+ {
+ return min(self, other, ssse3 {});
+ }
+ }
+ else
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_min_epu8(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_min_epu16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_min_epu32(self, other);
+ }
+ else
+ {
+ return min(self, other, ssse3 {});
+ }
+ }
+ }
+
+ // mul
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse4_1>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_or_si128(
+ _mm_and_si128(_mm_mullo_epi16(self, other), _mm_srli_epi16(_mm_cmpeq_epi8(self, self), 8)),
+ _mm_slli_epi16(_mm_mullo_epi16(_mm_srli_epi16(self, 8), _mm_srli_epi16(other, 8)), 8));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_mullo_epi16(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_mullo_epi32(self, other);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm_add_epi64(
+ _mm_mul_epu32(self, other),
+ _mm_slli_epi64(
+ _mm_add_epi64(
+ _mm_mul_epu32(other, _mm_shuffle_epi32(self, _MM_SHUFFLE(2, 3, 0, 1))),
+ _mm_mul_epu32(self, _mm_shuffle_epi32(other, _MM_SHUFFLE(2, 3, 0, 1)))),
+ 32));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ // nearbyint
+ template <class A>
+ inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+ template <class A>
+ inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+
+ // select
+ namespace detail
+ {
+ template <class T>
+ inline constexpr T interleave(T const& cond) noexcept
+ {
+ return (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 49) & 0x5555)
+ | (((cond * 0x0101010101010101ULL & 0x8040201008040201ULL) * 0x0102040810204081ULL >> 48) & 0xAAAA);
+ }
+ }
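+ // Illustrative note (editorial sketch): interleave duplicates each low
+ // bit of cond into a pair of adjacent bits (bit i -> bits 2i and 2i + 1),
+ // widening a 4-lane (32-bit) or, applied twice, a 2-lane (64-bit) blend
+ // mask into the 8-lane 16-bit mask _mm_blend_epi16 expects:
+ //
+ //     inline unsigned interleave_model(unsigned cond) noexcept
+ //     {
+ //         unsigned out = 0;
+ //         for (unsigned i = 0; i < 8; ++i)
+ //             if (cond & (1u << i))
+ //                 out |= 3u << (2 * i);
+ //         return out; // e.g. 0b1010 -> 0b11001100
+ //     }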
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_blendv_epi8(false_br, true_br, cond);
+ }
+ template <class A>
+ inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_blendv_ps(false_br, true_br, cond);
+ }
+ template <class A>
+ inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_blendv_pd(false_br, true_br, cond);
+ }
+
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept
+ {
+ constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask();
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_blend_epi16(false_br, true_br, mask);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ constexpr int imask = detail::interleave(mask);
+ return _mm_blend_epi16(false_br, true_br, imask);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ constexpr int imask = detail::interleave(mask);
+ constexpr int imask2 = detail::interleave(imask);
+ return _mm_blend_epi16(false_br, true_br, imask2);
+ }
+ else
+ {
+ return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {});
+ }
+ }
+ template <class A, bool... Values>
+ inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept
+ {
+ constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+ return _mm_blend_ps(false_br, true_br, mask);
+ }
+ template <class A, bool... Values>
+ inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept
+ {
+ constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+ return _mm_blend_pd(false_br, true_br, mask);
+ }
+
+ // trunc
+ template <class A>
+ inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_round_ps(self, _MM_FROUND_TO_ZERO);
+ }
+ template <class A>
+ inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<sse4_1>) noexcept
+ {
+ return _mm_round_pd(self, _MM_FROUND_TO_ZERO);
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp
new file mode 100644
index 0000000000..8f9b7a76e6
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_2.hpp
@@ -0,0 +1,44 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSE4_2_HPP
+#define XSIMD_SSE4_2_HPP
+
+#include <limits>
+
+#include "../types/xsimd_sse4_2_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // lt
+ template <class A>
+ inline batch_bool<int64_t, A> lt(batch<int64_t, A> const& self, batch<int64_t, A> const& other, requires_arch<sse4_2>) noexcept
+ {
+ return _mm_cmpgt_epi64(other, self);
+ }
+ template <class A>
+ inline batch_bool<uint64_t, A> lt(batch<uint64_t, A> const& self, batch<uint64_t, A> const& other, requires_arch<sse4_2>) noexcept
+ {
+ auto xself = _mm_xor_si128(self, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
+ auto xother = _mm_xor_si128(other, _mm_set1_epi64x(std::numeric_limits<int64_t>::lowest()));
+ return _mm_cmpgt_epi64(xother, xself);
+ }
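+ // Illustrative note: SSE4.2 only offers a signed 64-bit compare, so the
+ // unsigned overload above first flips the sign bit of both operands,
+ // which maps unsigned order onto signed order:
+ //
+ //     inline bool ult_model(uint64_t a, uint64_t b) noexcept
+ //     {
+ //         const uint64_t bias = 1ull << 63;
+ //         return int64_t(a ^ bias) < int64_t(b ^ bias);
+ //     }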
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp
new file mode 100644
index 0000000000..0aa1b2552d
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp
@@ -0,0 +1,142 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SSSE3_HPP
+#define XSIMD_SSSE3_HPP
+
+#include <cassert>
+#include <cstddef>
+#include <type_traits>
+
+#include "../types/xsimd_ssse3_register.hpp"
+#include "../types/xsimd_utils.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // abs
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
+ inline batch<T, A> abs(batch<T, A> const& self, requires_arch<ssse3>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_abs_epi8(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_abs_epi16(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm_abs_epi32(self);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
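+ // note: _mm_abs_epi64 requires AVX-512F/VL in addition to ssse3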
+ return _mm_abs_epi64(self);
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ // extract_pair
+ namespace detail
+ {
+
+ template <class T, class A>
+ inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& other, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
+ {
+ return other;
+ }
+
+ template <class T, class A, std::size_t I, std::size_t... Is>
+ inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (i == I)
+ {
+ return _mm_alignr_epi8(self, other, sizeof(T) * I);
+ }
+ else
+ return extract_pair(self, other, i, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, class _ = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<ssse3>) noexcept
+ {
+ constexpr std::size_t size = batch<T, A>::size;
+ assert(0 <= i && i < size && "index in bounds");
+ return detail::extract_pair(self, other, i, ::xsimd::detail::make_index_sequence<size>());
+ }
+
+ // reduce_add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline T reduce_add(batch<T, A> const& self, requires_arch<ssse3>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ __m128i tmp1 = _mm_hadd_epi16(self, self);
+ __m128i tmp2 = _mm_hadd_epi16(tmp1, tmp1);
+ __m128i tmp3 = _mm_hadd_epi16(tmp2, tmp2);
+ return _mm_cvtsi128_si32(tmp3) & 0xFFFF;
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ __m128i tmp1 = _mm_hadd_epi32(self, self);
+ __m128i tmp2 = _mm_hadd_epi32(tmp1, tmp1);
+ return _mm_cvtsi128_si32(tmp2);
+ }
+ else
+ {
+ return reduce_add(self, sse3 {});
+ }
+ }
+
+ // swizzle
+ template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+ inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept
+ {
+ constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1,
+ 2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1>
+ mask8;
+ return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8);
+ }
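+ // Illustrative note: _mm_shuffle_epi8 selects bytes, so each 16-bit lane
+ // index V expands to the byte pair {2V, 2V + 1} in mask8; a scalar model
+ // of the pshufb step (a set high mask bit would zero the byte):
+ //
+ //     for (int i = 0; i < 16; ++i)
+ //         dst[i] = (mask8[i] & 0x80) ? 0 : src[mask8[i] & 0x0F];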
+
+ template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+ inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept
+ {
+ return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {}));
+ }
+
+ template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+ uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+ inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
+ {
+ return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask);
+ }
+
+ template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+ uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+ inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept
+ {
+ return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, ssse3 {}));
+ }
+
+ }
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp
new file mode 100644
index 0000000000..fa6e44e316
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp
@@ -0,0 +1,1126 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * Copyright (c) Yibo Cai *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_SVE_HPP
+#define XSIMD_SVE_HPP
+
+#include <cassert>
+#include <complex>
+#include <type_traits>
+
+#include "../types/xsimd_sve_register.hpp"
+
+namespace xsimd
+{
+ template <class batch_type, typename batch_type::value_type... Values>
+ struct batch_constant;
+
+ namespace kernel
+ {
+ namespace detail
+ {
+ using xsimd::index;
+ using xsimd::types::detail::sve_vector_type;
+
+ // predicate creation
+ inline svbool_t sve_ptrue_impl(index<1>) noexcept { return svptrue_b8(); }
+ inline svbool_t sve_ptrue_impl(index<2>) noexcept { return svptrue_b16(); }
+ inline svbool_t sve_ptrue_impl(index<4>) noexcept { return svptrue_b32(); }
+ inline svbool_t sve_ptrue_impl(index<8>) noexcept { return svptrue_b64(); }
+
+ template <class T>
+ svbool_t sve_ptrue() noexcept { return sve_ptrue_impl(index<sizeof(T)> {}); }
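+ // Illustrative note: SVE kernels are all predicated; the helpers above
+ // build an all-lanes-true predicate for a given element width, which the
+ // kernels below pass to the _x ("inactive lanes undefined") or _z
+ // ("inactive lanes zeroed") forms of the ACLE intrinsics:
+ //
+ //     svbool_t pg = sve_ptrue<float>();  // every 32-bit lane active
+ //     svfloat32_t r = svadd_x(pg, a, b); // lanewise a + b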
+
+ // count active lanes in a predicate
+ inline uint64_t sve_pcount_impl(svbool_t p, index<1>) noexcept { return svcntp_b8(p, p); }
+ inline uint64_t sve_pcount_impl(svbool_t p, index<2>) noexcept { return svcntp_b16(p, p); }
+ inline uint64_t sve_pcount_impl(svbool_t p, index<4>) noexcept { return svcntp_b32(p, p); }
+ inline uint64_t sve_pcount_impl(svbool_t p, index<8>) noexcept { return svcntp_b64(p, p); }
+
+ template <class T>
+ inline uint64_t sve_pcount(svbool_t p) noexcept { return sve_pcount_impl(p, index<sizeof(T)> {}); }
+
+ // enable for signed integers
+ template <class T>
+ using sve_enable_signed_int_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, int>::type;
+
+ // enable for unsigned integers
+ template <class T>
+ using sve_enable_unsigned_int_t = typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value, int>::type;
+
+ // enable for floating points
+ template <class T>
+ using sve_enable_floating_point_t = typename std::enable_if<std::is_floating_point<T>::value, int>::type;
+
+ // enable for signed integers or floating points
+ template <class T>
+ using sve_enable_signed_int_or_floating_point_t = typename std::enable_if<std::is_signed<T>::value, int>::type;
+
+ // enable for all SVE supported types
+ template <class T>
+ using sve_enable_all_t = typename std::enable_if<std::is_arithmetic<T>::value, int>::type;
+ } // namespace detail
+
+ /*********
+ * Load *
+ *********/
+
+ namespace detail
+ {
+ // "char" is not allowed in SVE load/store operations
+ using sve_fix_char_t_impl = typename std::conditional<std::is_signed<char>::value, int8_t, uint8_t>::type;
+
+ template <class T>
+ using sve_fix_char_t = typename std::conditional<std::is_same<char, typename std::decay<T>::type>::value,
+ sve_fix_char_t_impl, T>::type;
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+ {
+ return svld1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T> const*>(src));
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<sve>) noexcept
+ {
+ return load_aligned<A>(src, convert<T>(), sve {});
+ }
+
+ // load_complex
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch<std::complex<T>, A> load_complex_aligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
+ {
+ const T* buf = reinterpret_cast<const T*>(mem);
+ const auto tmp = svld2(detail::sve_ptrue<T>(), buf);
+ const auto real = svget2(tmp, 0);
+ const auto imag = svget2(tmp, 1);
+ return batch<std::complex<T>, A> { real, imag };
+ }
+
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch<std::complex<T>, A> load_complex_unaligned(std::complex<T> const* mem, convert<std::complex<T>>, requires_arch<sve>) noexcept
+ {
+ return load_complex_aligned<A>(mem, convert<std::complex<T>> {}, sve {});
+ }
+
+ /*********
+ * Store *
+ *********/
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
+ {
+ svst1(detail::sve_ptrue<T>(), reinterpret_cast<detail::sve_fix_char_t<T>*>(dst), src);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<sve>) noexcept
+ {
+ store_aligned<A>(dst, src, sve {});
+ }
+
+ // store_complex
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline void store_complex_aligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
+ {
+ using v2type = typename std::conditional<(sizeof(T) == 4), svfloat32x2_t, svfloat64x2_t>::type;
+ v2type tmp {};
+ tmp = svset2(tmp, 0, src.real());
+ tmp = svset2(tmp, 1, src.imag());
+ T* buf = reinterpret_cast<T*>(dst);
+ svst2(detail::sve_ptrue<T>(), buf, tmp);
+ }
+
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline void store_complex_unaligned(std::complex<T>* dst, batch<std::complex<T>, A> const& src, requires_arch<sve>) noexcept
+ {
+ store_complex_aligned(dst, src, sve {});
+ }
+
+ /******************
+ * scatter/gather *
+ ******************/
+
+ namespace detail
+ {
+ template <class T, class U>
+ using sve_enable_sg_t = typename std::enable_if<(sizeof(T) == sizeof(U) && (sizeof(T) == 4 || sizeof(T) == 8)), int>::type;
+ }
+
+ // scatter
+ template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
+ inline void scatter(batch<T, A> const& src, T* dst, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
+ {
+ svst1_scatter_index(detail::sve_ptrue<T>(), dst, index.data, src.data);
+ }
+
+ // gather
+ template <class A, class T, class U, detail::sve_enable_sg_t<T, U> = 0>
+ inline batch<T, A> gather(batch<T, A> const&, T const* src, batch<U, A> const& index, kernel::requires_arch<sve>) noexcept
+ {
+ return svld1_gather_index(detail::sve_ptrue<T>(), src, index.data);
+ }
+
+ /********************
+ * Scalar to vector *
+ ********************/
+
+ // broadcast
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_u8(uint8_t(arg));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_s8(int8_t(arg));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_u16(uint16_t(arg));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_s16(int16_t(arg));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_u32(uint32_t(arg));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_s32(int32_t(arg));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_u64(uint64_t(arg));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> broadcast(T arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_s64(int64_t(arg));
+ }
+
+ template <class A>
+ inline batch<float, A> broadcast(float arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_f32(arg);
+ }
+
+ template <class A>
+ inline batch<double, A> broadcast(double arg, requires_arch<sve>) noexcept
+ {
+ return svdup_n_f64(arg);
+ }
+
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch<T, A> broadcast(T val, requires_arch<sve>) noexcept
+ {
+ return broadcast<sve>(val, sve {});
+ }
+
+ /**************
+ * Arithmetic *
+ **************/
+
+ // add
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svadd_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // sadd
+ template <class A, class T, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svqadd(lhs, rhs);
+ }
+
+ // sub
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svsub_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // ssub
+ template <class A, class T, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svqsub(lhs, rhs);
+ }
+
+ // mul
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svmul_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // div
+ template <class A, class T, typename std::enable_if<sizeof(T) >= 4, int>::type = 0>
+ inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svdiv_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // max
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svmax_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // min
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svmin_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // neg
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0>
+ inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_u8(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s8(arg)));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_u16(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s16(arg)));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_u32(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s32(arg)));
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_u64(svneg_x(detail::sve_ptrue<T>(), svreinterpret_s64(arg)));
+ }
+
+ template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0>
+ inline batch<T, A> neg(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svneg_x(detail::sve_ptrue<T>(), arg);
+ }
+
+ // abs
+ template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
+ inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return arg;
+ }
+
+ template <class A, class T, detail::sve_enable_signed_int_or_floating_point_t<T> = 0>
+ inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svabs_x(detail::sve_ptrue<T>(), arg);
+ }
+
+ // fma: x * y + z
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+ {
+ return svmad_x(detail::sve_ptrue<T>(), x, y, z);
+ }
+
+ // fnma: z - x * y
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+ {
+ return svmsb_x(detail::sve_ptrue<T>(), x, y, z);
+ }
+
+ // fms: x * y - z
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+ {
+ return -fnma(x, y, z, sve {});
+ }
+
+ // fnms: - x * y - z
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z, requires_arch<sve>) noexcept
+ {
+ return -fma(x, y, z, sve {});
+ }
+
+ /**********************
+ * Logical operations *
+ **********************/
+
+ // bitwise_and
+ template <class A, class T, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svand_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ template <class A>
+ inline batch<float, A> bitwise_and(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ const auto lhs_bits = svreinterpret_u32(lhs);
+ const auto rhs_bits = svreinterpret_u32(rhs);
+ const auto result_bits = svand_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
+ return svreinterpret_f32(result_bits);
+ }
+
+ template <class A>
+ inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ const auto lhs_bits = svreinterpret_u64(lhs);
+ const auto rhs_bits = svreinterpret_u64(rhs);
+ const auto result_bits = svand_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
+ return svreinterpret_f64(result_bits);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svand_z(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // bitwise_andnot
+ template <class A, class T, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svbic_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ template <class A>
+ inline batch<float, A> bitwise_andnot(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ const auto lhs_bits = svreinterpret_u32(lhs);
+ const auto rhs_bits = svreinterpret_u32(rhs);
+ const auto result_bits = svbic_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
+ return svreinterpret_f32(result_bits);
+ }
+
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ const auto lhs_bits = svreinterpret_u64(lhs);
+ const auto rhs_bits = svreinterpret_u64(rhs);
+ const auto result_bits = svbic_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
+ return svreinterpret_f64(result_bits);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svbic_z(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // bitwise_or
+ template <class A, class T, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svorr_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ template <class A>
+ inline batch<float, A> bitwise_or(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ const auto lhs_bits = svreinterpret_u32(lhs);
+ const auto rhs_bits = svreinterpret_u32(rhs);
+ const auto result_bits = svorr_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
+ return svreinterpret_f32(result_bits);
+ }
+
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ const auto lhs_bits = svreinterpret_u64(lhs);
+ const auto rhs_bits = svreinterpret_u64(rhs);
+ const auto result_bits = svorr_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
+ return svreinterpret_f64(result_bits);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svorr_z(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // bitwise_xor
+ template <class A, class T, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return sveor_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ template <class A>
+ inline batch<float, A> bitwise_xor(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ const auto lhs_bits = svreinterpret_u32(lhs);
+ const auto rhs_bits = svreinterpret_u32(rhs);
+ const auto result_bits = sveor_x(detail::sve_ptrue<float>(), lhs_bits, rhs_bits);
+ return svreinterpret_f32(result_bits);
+ }
+
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ const auto lhs_bits = svreinterpret_u64(lhs);
+ const auto rhs_bits = svreinterpret_u64(rhs);
+ const auto result_bits = sveor_x(detail::sve_ptrue<double>(), lhs_bits, rhs_bits);
+ return svreinterpret_f64(result_bits);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // bitwise_not
+ template <class A, class T, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svnot_x(detail::sve_ptrue<T>(), arg);
+ }
+
+ template <class A>
+ inline batch<float, A> bitwise_not(batch<float, A> const& arg, requires_arch<sve>) noexcept
+ {
+ const auto arg_bits = svreinterpret_u32(arg);
+ const auto result_bits = svnot_x(detail::sve_ptrue<float>(), arg_bits);
+ return svreinterpret_f32(result_bits);
+ }
+
+ template <class A>
+ inline batch<double, A> bitwise_not(batch<double, A> const& arg, requires_arch<sve>) noexcept
+ {
+ const auto arg_bits = svreinterpret_u64(arg);
+ const auto result_bits = svnot_x(detail::sve_ptrue<double>(), arg_bits);
+ return svreinterpret_f64(result_bits);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svnot_z(detail::sve_ptrue<T>(), arg);
+ }
+
+ /**********
+ * Shifts *
+ **********/
+
+ namespace detail
+ {
+ template <class A, class T, class U>
+ inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<1>) noexcept
+ {
+ return svreinterpret_u8(arg);
+ }
+
+ template <class A, class T, class U>
+ inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<2>) noexcept
+ {
+ return svreinterpret_u16(arg);
+ }
+
+ template <class A, class T, class U>
+ inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<4>) noexcept
+ {
+ return svreinterpret_u32(arg);
+ }
+
+ template <class A, class T, class U>
+ inline batch<U, A> sve_to_unsigned_batch_impl(batch<T, A> const& arg, index<8>) noexcept
+ {
+ return svreinterpret_u64(arg);
+ }
+
+ template <class A, class T, class U = as_unsigned_integer_t<T>>
+ inline batch<U, A> sve_to_unsigned_batch(batch<T, A> const& arg) noexcept
+ {
+ return sve_to_unsigned_batch_impl<A, T, U>(arg, index<sizeof(T)> {});
+ }
+ } // namespace detail
+
+ // bitwise_lshift
+ template <class A, class T, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
+ {
+ constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
+        assert(0 <= n && static_cast<std::size_t>(n) < size && "shift amount in bounds");
+ return svlsl_x(detail::sve_ptrue<T>(), arg, n);
+ }
+
+ template <class A, class T, detail::enable_integral_t<T> = 0>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svlsl_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs));
+ }
+
+ // bitwise_rshift
+ template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
+ {
+ constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
+        assert(0 <= n && static_cast<std::size_t>(n) < size && "shift amount in bounds");
+ return svlsr_x(detail::sve_ptrue<T>(), arg, static_cast<T>(n));
+ }
+
+ template <class A, class T, detail::sve_enable_unsigned_int_t<T> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svlsr_x(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ template <class A, class T, detail::sve_enable_signed_int_t<T> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& arg, int n, requires_arch<sve>) noexcept
+ {
+ constexpr std::size_t size = sizeof(typename batch<T, A>::value_type) * 8;
+        assert(0 <= n && static_cast<std::size_t>(n) < size && "shift amount in bounds");
+ return svasr_x(detail::sve_ptrue<T>(), arg, static_cast<as_unsigned_integer_t<T>>(n));
+ }
+
+ template <class A, class T, detail::sve_enable_signed_int_t<T> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svasr_x(detail::sve_ptrue<T>(), lhs, detail::sve_to_unsigned_batch<A, T>(rhs));
+ }
+
+ /**************
+ * Reductions *
+ **************/
+
+ // reduce_add
+ template <class A, class T, class V = typename batch<T, A>::value_type, detail::sve_enable_all_t<T> = 0>
+ inline V reduce_add(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+        // SVE integer reductions are computed and returned in 64 bits; narrow back to the value type
+ return static_cast<V>(svaddv(detail::sve_ptrue<T>(), arg));
+ }
+
+ // reduce_max
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline T reduce_max(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svmaxv(detail::sve_ptrue<T>(), arg);
+ }
+
+ // reduce_min
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline T reduce_min(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svminv(detail::sve_ptrue<T>(), arg);
+ }
+
+ // haddp
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch<T, A> haddp(const batch<T, A>* row, requires_arch<sve>) noexcept
+ {
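+            // SVE has no cross-row pairwise add: reduce each row to a scalar,
+            // then reload the sums as a single vector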
+ constexpr std::size_t size = batch<T, A>::size;
+ T sums[size];
+ for (std::size_t i = 0; i < size; ++i)
+ {
+ sums[i] = reduce_add(row[i], sve {});
+ }
+ return svld1(detail::sve_ptrue<T>(), sums);
+ }
+
+ /***************
+ * Comparisons *
+ ***************/
+
+ // eq
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svcmpeq(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
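+            // predicate equality as NOT(XOR): lanes agree exactly where the exclusive OR is false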
+ const auto neq_result = sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
+ return svnot_z(detail::sve_ptrue<T>(), neq_result);
+ }
+
+ // neq
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> neq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svcmpne(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return sveor_z(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // lt
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svcmplt(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // le
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svcmple(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // gt
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svcmpgt(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ // ge
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svcmpge(detail::sve_ptrue<T>(), lhs, rhs);
+ }
+
+ /***************
+ * Permutation *
+ ***************/
+
+ // swizzle
+ template <class A, class T, class I, I... idx>
+ inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...>, requires_arch<sve>) noexcept
+ {
+            static_assert(batch<T, A>::size == sizeof...(idx), "invalid number of swizzle indices");
+ const batch<I, A> indices { idx... };
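+            // svtbl gathers arg[indices[i]] per lane; out-of-range indices yield zero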
+ return svtbl(arg, indices);
+ }
+
+ template <class A, class T, class I, I... idx>
+ inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self,
+ batch_constant<batch<I, A>, idx...>,
+ requires_arch<sve>) noexcept
+ {
+ const auto real = swizzle(self.real(), batch_constant<batch<I, A>, idx...> {}, sve {});
+ const auto imag = swizzle(self.imag(), batch_constant<batch<I, A>, idx...> {}, sve {});
+            return batch<std::complex<T>, A>(real, imag);
+ }
+
+ /*************
+ * Selection *
+ *************/
+
+ // extract_pair
+ namespace detail
+ {
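+            // svext takes the element offset as a compile-time immediate, so the runtime
+            // index n is dispatched to a constant by recursing over an index_sequence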
+ template <class A, class T>
+            inline batch<T, A> sve_extract_pair(batch<T, A> const& /*lhs*/, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept
+ {
+ assert(false && "extract_pair out of bounds");
+ return batch<T, A> {};
+ }
+
+ template <class A, class T, size_t I, size_t... Is>
+ inline batch<T, A> sve_extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return svext(rhs, lhs, I);
+ }
+ else
+ {
+ return sve_extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+
+ template <class A, class T, size_t... Is>
+ inline batch<T, A> sve_extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept
+ {
+ if (n == 0)
+ {
+ return rhs;
+ }
+ else
+ {
+ return sve_extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+        } // namespace detail
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<sve>) noexcept
+ {
+ constexpr std::size_t size = batch<T, A>::size;
+ assert(n < size && "index in bounds");
+ return detail::sve_extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
+ }
+
+ // select
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<sve>) noexcept
+ {
+ return svsel(cond, a, b);
+ }
+
+ template <class A, class T, bool... b>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept
+ {
+ return select(batch_bool<T, A> { b... }, true_br, false_br, sve {});
+ }
+
+ // zip_lo
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svzip1(lhs, rhs);
+ }
+
+ // zip_hi
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<sve>) noexcept
+ {
+ return svzip2(lhs, rhs);
+ }
+
+ /*****************************
+ * Floating-point arithmetic *
+ *****************************/
+
+ // rsqrt
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch<T, A> rsqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
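+            // note: svrsqrte only yields a low-precision initial estimate; no refinement step here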
+ return svrsqrte(arg);
+ }
+
+ // sqrt
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch<T, A> sqrt(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svsqrt_x(detail::sve_ptrue<T>(), arg);
+ }
+
+ // reciprocal
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch<T, A> reciprocal(const batch<T, A>& arg, requires_arch<sve>) noexcept
+ {
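+            // likewise a raw reciprocal estimate from svrecpe, not a full-precision division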
+ return svrecpe(arg);
+ }
+
+ /******************************
+ * Floating-point conversions *
+ ******************************/
+
+ // fast_cast
+ namespace detail
+ {
+ template <class A, class T, detail::enable_sized_integral_t<T, 4> = 0>
+ inline batch<float, A> fast_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+ {
+ return svcvt_f32_x(detail::sve_ptrue<T>(), arg);
+ }
+
+ template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch<double, A> fast_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
+ {
+ return svcvt_f64_x(detail::sve_ptrue<T>(), arg);
+ }
+
+ template <class A>
+ inline batch<int32_t, A> fast_cast(batch<float, A> const& arg, batch<int32_t, A> const&, requires_arch<sve>) noexcept
+ {
+ return svcvt_s32_x(detail::sve_ptrue<float>(), arg);
+ }
+
+ template <class A>
+ inline batch<uint32_t, A> fast_cast(batch<float, A> const& arg, batch<uint32_t, A> const&, requires_arch<sve>) noexcept
+ {
+ return svcvt_u32_x(detail::sve_ptrue<float>(), arg);
+ }
+
+ template <class A>
+ inline batch<int64_t, A> fast_cast(batch<double, A> const& arg, batch<int64_t, A> const&, requires_arch<sve>) noexcept
+ {
+ return svcvt_s64_x(detail::sve_ptrue<double>(), arg);
+ }
+
+ template <class A>
+ inline batch<uint64_t, A> fast_cast(batch<double, A> const& arg, batch<uint64_t, A> const&, requires_arch<sve>) noexcept
+ {
+ return svcvt_u64_x(detail::sve_ptrue<double>(), arg);
+ }
+        } // namespace detail
+
+        /*****************
+         * Miscellaneous *
+         *****************/
+
+ // set
+ template <class A, class T, class... Args>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<sve>, Args... args) noexcept
+ {
+ return detail::sve_vector_type<T> { args... };
+ }
+
+ template <class A, class T, class... Args>
+ inline batch<std::complex<T>, A> set(batch<std::complex<T>, A> const&, requires_arch<sve>,
+ Args... args_complex) noexcept
+ {
+            return batch<std::complex<T>, A>(detail::sve_vector_type<T> { args_complex.real()... },
+                                             detail::sve_vector_type<T> { args_complex.imag()... });
+ }
+
+ template <class A, class T, class... Args>
+ inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<sve>, Args... args) noexcept
+ {
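+            // materialize the bools as an unsigned vector and compare against zero to form the predicate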
+ using U = as_unsigned_integer_t<T>;
+ const auto values = detail::sve_vector_type<U> { static_cast<U>(args)... };
+ const auto zero = broadcast<A, U>(static_cast<U>(0), sve {});
+ return svcmpne(detail::sve_ptrue<T>(), values, zero);
+ }
+
+ // insert
+ namespace detail
+ {
+ // generate index sequence (iota)
+ inline svuint8_t sve_iota_impl(index<1>) noexcept { return svindex_u8(0, 1); }
+ inline svuint16_t sve_iota_impl(index<2>) noexcept { return svindex_u16(0, 1); }
+ inline svuint32_t sve_iota_impl(index<4>) noexcept { return svindex_u32(0, 1); }
+ inline svuint64_t sve_iota_impl(index<8>) noexcept { return svindex_u64(0, 1); }
+
+ template <class T, class V = sve_vector_type<as_unsigned_integer_t<T>>>
+ inline V sve_iota() noexcept { return sve_iota_impl(index<sizeof(T)> {}); }
+ } // namespace detail
+
+ template <class A, class T, size_t I, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> insert(batch<T, A> const& arg, T val, index<I>, requires_arch<sve>) noexcept
+ {
+ // create a predicate with only the I-th lane activated
+ const auto iota = detail::sve_iota<T>();
+ const auto index_predicate = svcmpeq(detail::sve_ptrue<T>(), iota, static_cast<as_unsigned_integer_t<T>>(I));
+ return svsel(index_predicate, broadcast<A, T>(val, sve {}), arg);
+ }
+
+ // all
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return detail::sve_pcount<T>(arg) == batch_bool<T, A>::size;
+ }
+
+ // any
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return svptest_any(arg, arg);
+ }
+
+ // bitwise_cast
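+        // one overload per destination width and signedness, each lowering to a single svreinterpret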
+ template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 1> = 0>
+ inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_u8(arg);
+ }
+
+ template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 1> = 0>
+ inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_s8(arg);
+ }
+
+ template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 2> = 0>
+ inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_u16(arg);
+ }
+
+ template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 2> = 0>
+ inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_s16(arg);
+ }
+
+ template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 4> = 0>
+ inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_u32(arg);
+ }
+
+ template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 4> = 0>
+ inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_s32(arg);
+ }
+
+ template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_unsigned_t<R, 8> = 0>
+ inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_u64(arg);
+ }
+
+ template <class A, class T, class R, detail::sve_enable_all_t<T> = 0, detail::enable_sized_signed_t<R, 8> = 0>
+ inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_s64(arg);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<float, A> bitwise_cast(batch<T, A> const& arg, batch<float, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_f32(arg);
+ }
+
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<sve>) noexcept
+ {
+ return svreinterpret_f64(arg);
+ }
+
+ // batch_bool_cast
+ template <class A, class T_out, class T_in, detail::sve_enable_all_t<T_in> = 0>
+ inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& arg, batch_bool<T_out, A> const&, requires_arch<sve>) noexcept
+ {
+ return arg.data;
+ }
+
+ // from_bool
+ template <class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return select(arg, batch<T, A>(1), batch<T, A>(0));
+ }
+
+ // slide_left
+ namespace detail
+ {
+ template <size_t N>
+ struct sve_slider_left
+ {
+ template <class A, class T>
+ inline batch<T, A> operator()(batch<T, A> const& arg) noexcept
+ {
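+                    // work on bytes: svext on (zero || arg) at offset size - N yields
+                    // N zero bytes followed by the first size - N bytes of arg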
+ using u8_vector = batch<uint8_t, A>;
+ const auto left = svdup_n_u8(0);
+ const auto right = bitwise_cast(arg, u8_vector {}, sve {}).data;
+ const u8_vector result(svext(left, right, u8_vector::size - N));
+ return bitwise_cast(result, batch<T, A> {}, sve {});
+ }
+ };
+
+ template <>
+ struct sve_slider_left<0>
+ {
+ template <class A, class T>
+ inline batch<T, A> operator()(batch<T, A> const& arg) noexcept
+ {
+ return arg;
+ }
+ };
+ } // namespace detail
+
+ template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> slide_left(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return detail::sve_slider_left<N>()(arg);
+ }
+
+ // slide_right
+ namespace detail
+ {
+ template <size_t N>
+ struct sve_slider_right
+ {
+ template <class A, class T>
+ inline batch<T, A> operator()(batch<T, A> const& arg) noexcept
+ {
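+                    // svext on (arg || zero) at offset N drops the first N bytes of arg
+                    // and shifts zero bytes in from the high end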
+ using u8_vector = batch<uint8_t, A>;
+ const auto left = bitwise_cast(arg, u8_vector {}, sve {}).data;
+ const auto right = svdup_n_u8(0);
+ const u8_vector result(svext(left, right, N));
+ return bitwise_cast(result, batch<T, A> {}, sve {});
+ }
+ };
+
+ template <>
+ struct sve_slider_right<batch<uint8_t, sve>::size>
+ {
+ template <class A, class T>
+ inline batch<T, A> operator()(batch<T, A> const&) noexcept
+ {
+ return batch<T, A> {};
+ }
+ };
+ } // namespace detail
+
+ template <size_t N, class A, class T, detail::sve_enable_all_t<T> = 0>
+ inline batch<T, A> slide_right(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
+ return detail::sve_slider_right<N>()(arg);
+ }
+
+ // isnan
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch_bool<T, A> isnan(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
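+            // NaN is the only value that compares unequal to itself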
+ return !(arg == arg);
+ }
+
+ // nearbyint
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch<T, A> nearbyint(batch<T, A> const& arg, requires_arch<sve>) noexcept
+ {
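+            // svrintx rounds using the current rounding mode, raising inexact (rint semantics)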
+ return svrintx_x(detail::sve_ptrue<T>(), arg);
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& arg, requires_arch<sve>) noexcept
+ {
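+            // round to nearest first (svrinta: ties away from zero), then convert to integer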
+ const auto nearest = svrinta_x(detail::sve_ptrue<float>(), arg);
+ return svcvt_s32_x(detail::sve_ptrue<float>(), nearest);
+ }
+
+ template <class A>
+ inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& arg, requires_arch<sve>) noexcept
+ {
+ const auto nearest = svrinta_x(detail::sve_ptrue<double>(), arg);
+ return svcvt_s64_x(detail::sve_ptrue<double>(), nearest);
+ }
+
+ // ldexp
+ template <class A, class T, detail::sve_enable_floating_point_t<T> = 0>
+ inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& exp, requires_arch<sve>) noexcept
+ {
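+            // svscale computes x * 2^exp per lane in a single instruction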
+ return svscale_x(detail::sve_ptrue<T>(), x, exp);
+ }
+
+ } // namespace kernel
+} // namespace xsimd
+
+#endif