Diffstat (limited to 'third_party/xsimd/include/xsimd/types')
36 files changed, 7381 insertions, 0 deletions
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp
new file mode 100644
index 0000000000..4350ca0a28
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#include "xsimd_fma3_sse_register.hpp"
+#include "xsimd_fma4_register.hpp"
+#include "xsimd_sse2_register.hpp"
+#include "xsimd_sse3_register.hpp"
+#include "xsimd_sse4_1_register.hpp"
+#include "xsimd_sse4_2_register.hpp"
+
+#include "xsimd_avx2_register.hpp"
+#include "xsimd_avx_register.hpp"
+#include "xsimd_avxvnni_register.hpp"
+#include "xsimd_fma3_avx2_register.hpp"
+#include "xsimd_fma3_avx_register.hpp"
+
+#include "xsimd_avx512vnni_avx512bw_register.hpp"
+#include "xsimd_avx512vnni_avx512vbmi_register.hpp"
+
+#include "xsimd_avx512ifma_register.hpp"
+#include "xsimd_avx512vbmi_register.hpp"
+
+#include "xsimd_avx512er_register.hpp"
+#include "xsimd_avx512pf_register.hpp"
+
+#include "xsimd_avx512bw_register.hpp"
+#include "xsimd_avx512cd_register.hpp"
+#include "xsimd_avx512dq_register.hpp"
+#include "xsimd_avx512f_register.hpp"
+
+#include "xsimd_neon64_register.hpp"
+#include "xsimd_neon_register.hpp"
+
+#include "xsimd_sve_register.hpp"
+
+#include "xsimd_rvv_register.hpp"
+
+#include "xsimd_wasm_register.hpp"
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp
new file mode 100644
index 0000000000..0420f0a09d
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp
@@ -0,0 +1,2599 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_API_HPP
+#define XSIMD_API_HPP
+
+#include <complex>
+#include <cstddef>
+#include <limits>
+#include <ostream>
+
+#include "../arch/xsimd_isa.hpp"
+#include "../types/xsimd_batch.hpp"
+#include "../types/xsimd_traits.hpp"
+
+namespace xsimd
+{
+    /**
+     * high level free functions
+     *
+     * @defgroup batch_arithmetic Arithmetic operators
+     * @defgroup batch_constant Constant batches
+     * @defgroup batch_data_transfer Memory operators
+     * @defgroup batch_math Basic math operators
+     * @defgroup batch_math_extra Extra math operators
+     * @defgroup batch_fp Floating point manipulation
+     * @defgroup batch_rounding Rounding operators
+     * @defgroup batch_conversion Conversion operators
+     * @defgroup batch_complex_op Complex operators
+     * @defgroup batch_logical Logical operators
+     * @defgroup batch_bitwise Bitwise operators
+     * @defgroup batch_reducers Reducers
+     * @defgroup batch_miscellaneous Miscellaneous
+     * @defgroup batch_trigo Trigonometry
+     *
+     * @defgroup batch_bool_logical Boolean logical operators
+     * @defgroup batch_bool_reducers Boolean reducers
+     */
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the absolute values of each scalar in the batch \c x.
+     * @param x batch of integer or floating point values.
+     * @return the absolute values of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> abs(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::abs<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the absolute values of each complex in the batch \c z.
+     * @param z batch of complex values.
+     * @return the absolute values of \c z.
+     */
+    template <class T, class A>
+    inline batch<T, A> abs(batch<std::complex<T>, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::abs<A>(z, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the sum of the batches \c x and \c y.
+     * @param x batch or scalar involved in the addition.
+     * @param y batch or scalar involved in the addition.
+     * @return the sum of \c x and \c y
+     */
+    template <class T, class A>
+    inline auto add(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x + y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x + y;
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc cosine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the arc cosine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> acos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::acos<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the inverse hyperbolic cosine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the inverse hyperbolic cosine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> acosh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::acosh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the argument of the batch \c z.
+     * @param z batch of complex or real values.
+     * @return the argument of \c z.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::arg<A>(z, A {});
+    }
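+
+    /*
+     * Editor's usage sketch: comment blocks of this form are illustrations
+     * added during editing and are not part of the upstream xsimd header.
+     * This one exercises the arithmetic helpers above, assuming a 4-lane
+     * float batch on the default architecture:
+     *
+     * \code{.cpp}
+     * xsimd::batch<float> a(-1.5f); // broadcasts -1.5f to every lane
+     * xsimd::batch<float> b(2.0f);
+     * auto s = xsimd::add(a, b);    // same as a + b: {0.5, 0.5, 0.5, 0.5}
+     * auto m = xsimd::abs(a);       // {1.5, 1.5, 1.5, 1.5}
+     * \endcode
+     */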
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc sine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the arc sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> asin(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::asin<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the inverse hyperbolic sine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the inverse hyperbolic sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> asinh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::asinh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc tangent of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the arc tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> atan(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::atan<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc tangent of the batch \c x/y, using the signs of the
+     * arguments to determine the correct quadrant.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return the arc tangent of \c x/y.
+     */
+    template <class T, class A>
+    inline batch<T, A> atan2(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::atan2<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the inverse hyperbolic tangent of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the inverse hyperbolic tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> atanh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::atanh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Perform a static_cast from \c T_in to \c T_out on \c x.
+     * @param x batch_bool of \c T_in
+     * @return \c x cast to \c T_out
+     */
+    template <class T_out, class T_in, class A>
+    inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T_out, A>();
+        detail::static_check_supported_config<T_in, A>();
+        static_assert(batch_bool<T_out, A>::size == batch_bool<T_in, A>::size, "Casting between incompatible batch_bool types.");
+        return kernel::batch_bool_cast<A>(x, batch_bool<T_out, A> {}, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Perform a static_cast from \c T_in to \c T_out on \c x.
+     * @param x batch of \c T_in
+     * @return \c x cast to \c T_out
+     */
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> batch_cast(batch<T_in, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T_out, A>();
+        detail::static_check_supported_config<T_in, A>();
+        return kernel::batch_cast<A>(x, batch<T_out, A> {}, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes the sign bit of \c x
+     * @param x batch of scalars
+     * @return the sign bit of \c x
+     */
+    template <class T, class A>
+    inline batch<T, A> bitofsign(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitofsign<A>(x, A {});
+    }
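+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): atan2 uses
+     * the signs of both arguments to resolve the quadrant, which a plain
+     * atan of the quotient cannot do:
+     *
+     * \code{.cpp}
+     * xsimd::batch<float> num(-1.0f), den(-1.0f);
+     * auto a = xsimd::atan2(num, den); // -3*pi/4 per lane
+     * auto b = xsimd::atan(num / den); // pi/4 per lane
+     * \endcode
+     */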
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise and of the batches \c x and \c y.
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return the result of the bitwise and.
+     */
+    template <class T, class A>
+    inline auto bitwise_and(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x & y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x & y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise and of the batches \c x and \c y.
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return the result of the bitwise and.
+     */
+    template <class T, class A>
+    inline auto bitwise_and(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x & y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x & y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise and not of batches \c x and \c y.
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return the result of the bitwise and not.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_andnot(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_andnot<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_bool_logical
+     *
+     * Computes the bitwise and not of batches \c x and \c y.
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return the result of the bitwise and not.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_andnot<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Perform a reinterpret_cast from \c T_in to \c T_out on \c x.
+     * @param x batch of \c T_in
+     * @return \c x reinterpreted as \c T_out
+     */
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T_in, A>();
+        detail::static_check_supported_config<T_out, A>();
+        return kernel::bitwise_cast<A>(x, batch<T_out, A> {}, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Perform a bitwise shift to the left.
+     * @param x batch of \c T
+     * @param shift scalar amount to shift
+     * @return shifted \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_lshift(batch<T, A> const& x, int shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_lshift<A>(x, shift, A {});
+    }
+    template <class T, class A>
+    inline batch<T, A> bitwise_lshift(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_lshift<A>(x, shift, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise not of batch \c x.
+     * @param x batch involved in the operation.
+     * @return the result of the bitwise not.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_not(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_not<A>(x, A {});
+    }
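+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): bitwise_cast
+     * reinterprets the lane bits, while batch_cast above converts values:
+     *
+     * \code{.cpp}
+     * xsimd::batch<float> x(1.0f);
+     * auto bits = xsimd::bitwise_cast<uint32_t>(x); // 0x3f800000 per lane
+     * auto ints = xsimd::batch_cast<int32_t>(x);    // 1 per lane
+     * \endcode
+     */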
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise not of batch \c x.
+     * @param x batch involved in the operation.
+     * @return the result of the bitwise not.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_not<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise or of the batches \c x and \c y.
+     * @param x scalar or batch of scalars
+     * @param y scalar or batch of scalars
+     * @return the result of the bitwise or.
+     */
+    template <class T, class A>
+    inline auto bitwise_or(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x | y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x | y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise or of the batches \c x and \c y.
+     * @param x scalar or batch of scalars
+     * @param y scalar or batch of scalars
+     * @return the result of the bitwise or.
+     */
+    template <class T, class A>
+    inline auto bitwise_or(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x | y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x | y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Perform a bitwise shift to the right.
+     * @param x batch of \c T
+     * @param shift scalar amount to shift
+     * @return shifted \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_rshift(batch<T, A> const& x, int shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_rshift<A>(x, shift, A {});
+    }
+    template <class T, class A>
+    inline batch<T, A> bitwise_rshift(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_rshift<A>(x, shift, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise xor of the batches \c x and \c y.
+     * @param x scalar or batch of scalars
+     * @param y scalar or batch of scalars
+     * @return the result of the bitwise xor.
+     */
+    template <class T, class A>
+    inline auto bitwise_xor(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x ^ y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x ^ y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise xor of the batches \c x and \c y.
+     * @param x scalar or batch of scalars
+     * @param y scalar or batch of scalars
+     * @return the result of the bitwise xor.
+     */
+    template <class T, class A>
+    inline auto bitwise_xor(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x ^ y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x ^ y;
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the single value \c v.
+     * @param v the value used to initialize the batch
+     * @return a new batch instance
+     */
+    template <class T, class A = default_arch>
+    inline batch<T, A> broadcast(T v) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::broadcast(v);
+    }
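+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): broadcasting
+     * a scalar and shifting each lane:
+     *
+     * \code{.cpp}
+     * auto v = xsimd::broadcast<int32_t>(5); // 5 in every lane
+     * auto l = xsimd::bitwise_lshift(v, 1);  // 10 in every lane
+     * auto r = xsimd::bitwise_rshift(v, 2);  // 1 in every lane
+     * \endcode
+     */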
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the single value \c v and
+     * the specified batch value type \c To.
+     * @param v the value used to initialize the batch
+     * @return a new batch instance
+     */
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<From, To, A> broadcast_as(From v) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        using value_type = typename std::conditional<std::is_same<From, bool>::value,
+                                                     bool,
+                                                     batch_value_type>::type;
+        return simd_return_type<From, To, A>(value_type(v));
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the cubic root of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the cubic root of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> cbrt(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::cbrt<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes the batch of smallest integer values not less than
+     * scalars in \c x.
+     * @param x batch of floating point values.
+     * @return the batch of smallest integer values not less than \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> ceil(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::ceil<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Clips the values of the batch \c x between those of the batches \c lo and \c hi.
+     * @param x batch of scalar values.
+     * @param lo batch of scalar values.
+     * @param hi batch of scalar values.
+     * @return the result of the clipping.
+     */
+    template <class T, class A>
+    inline batch<T, A> clip(batch<T, A> const& x, batch<T, A> const& lo, batch<T, A> const& hi) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::clip(x, lo, hi, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Picks elements from \c x selected by \c mask, and appends them to the
+     * resulting vector, zeroing the remaining slots.
+     */
+    template <class T, class A>
+    inline batch<T, A> compress(batch<T, A> const& x, batch_bool<T, A> const& mask) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::compress<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the conjugate of the batch \c z.
+     * @param z batch of complex values.
+     * @return the conjugate of \c z.
+     */
+    template <class A, class T>
+    inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& z) noexcept
+    {
+        return kernel::conj(z, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes a value whose absolute value matches
+     * that of \c x, but whose sign bit matches that of \c y.
+     * @param x batch of scalars
+     * @param y batch of scalars
+     * @return batch whose absolute value matches that of \c x, but whose sign bit
+     * matches that of \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> copysign(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::copysign<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the cosine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the cosine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> cos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::cos<A>(x, A {});
+    }
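+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code; assumes a
+     * 4-lane batch): compress packs the lanes selected by the mask to the
+     * front and zero-fills the rest:
+     *
+     * \code{.cpp}
+     * xsimd::batch<float> x { 1.f, 2.f, 3.f, 4.f };
+     * xsimd::batch_bool<float> m { true, false, true, false };
+     * auto packed = xsimd::compress(x, m); // {1, 3, 0, 0}
+     * \endcode
+     */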
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the hyperbolic cosine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the hyperbolic cosine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> cosh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::cosh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Subtracts 1 from the batch \c x.
+     * @param x batch involved in the decrement.
+     * @return \c x minus 1.
+     */
+    template <class T, class A>
+    inline batch<T, A> decr(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::decr<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Subtracts 1 from the batch \c x for each element where \c mask is true.
+     * @param x batch involved in the decrement.
+     * @param mask whether to perform the decrement or not. Can be a \c
+     * batch_bool or a \c batch_bool_constant.
+     * @return \c x minus 1 when \c mask is true, \c x otherwise.
+     */
+    template <class T, class A, class Mask>
+    inline batch<T, A> decr_if(batch<T, A> const& x, Mask const& mask) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::decr_if<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the division of the batch \c x by the batch \c y.
+     * @param x scalar or batch of scalars
+     * @param y scalar or batch of scalars
+     * @return the result of the division.
+     */
+    template <class T, class A>
+    inline auto div(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x / y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x / y;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise equality comparison of batches \c x and \c y.
+     * @param x batch of scalars
+     * @param y batch of scalars
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline auto eq(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x == y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x == y;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise equality comparison of batches of boolean values \c x and \c y.
+     * @param x batch of booleans involved in the comparison.
+     * @param y batch of booleans involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline auto eq(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x == y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x == y;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the natural exponential of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the natural exponential of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> exp(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::exp<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the base 10 exponential of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the base 10 exponential of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> exp10(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::exp10<A>(x, A {});
+    }
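+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code; assumes a
+     * 4-lane batch): a comparison mask drives a conditional decrement:
+     *
+     * \code{.cpp}
+     * xsimd::batch<int32_t> x { 3, 7, 3, 9 };
+     * auto big = x > xsimd::batch<int32_t>(5);
+     * auto y = xsimd::decr_if(x, big); // {3, 6, 3, 8}
+     * \endcode
+     */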
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the base 2 exponential of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the base 2 exponential of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> exp2(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::exp2<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Load contiguous elements from \c x and place them in slots selected by \c
+     * mask, zeroing the other slots.
+     */
+    template <class T, class A>
+    inline batch<T, A> expand(batch<T, A> const& x, batch_bool<T, A> const& mask) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::expand<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the natural exponential of the batch \c x, minus one.
+     * @param x batch of floating point values.
+     * @return the natural exponential of \c x, minus one.
+     */
+    template <class T, class A>
+    inline batch<T, A> expm1(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::expm1<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the error function of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the error function of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> erf(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::erf<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the complementary error function of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the complementary error function of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> erfc(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::erfc<A>(x, A {});
+    }
+
+    /**
+     * Extract vector from a pair of vectors.
+     * Extracts the lowest vector elements from the second source \c x
+     * and the highest vector elements from the first source \c y,
+     * and concatenates the results into the return value.
+     * @param x batch of integer or floating point values.
+     * @param y batch of integer or floating point values.
+     * @param i integer specifying the lowest vector element to extract from the first source register
+     * @return the concatenated result.
+     */
+    template <class T, class A>
+    inline batch<T, A> extract_pair(batch<T, A> const& x, batch<T, A> const& y, std::size_t i) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::extract_pair<A>(x, y, i, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the absolute values of each scalar in the batch \c x.
+     * @param x batch of floating point values.
+     * @return the absolute values of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> fabs(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::abs<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the positive difference between \c x and \c y, that is,
+     * <tt>max(0, x-y)</tt>.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return the positive difference.
+     */
+    template <class T, class A>
+    inline batch<T, A> fdim(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fdim<A>(x, y, A {});
+    }
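+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code; assumes a
+     * 4-lane batch and the element order described above): extract_pair
+     * reads one full batch out of the concatenation of its two sources:
+     *
+     * \code{.cpp}
+     * xsimd::batch<int32_t> x { 0, 1, 2, 3 };
+     * xsimd::batch<int32_t> y { 4, 5, 6, 7 };
+     * auto z = xsimd::extract_pair(x, y, 1); // {5, 6, 7, 0}
+     * \endcode
+     */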
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes the batch of largest integer values not greater than
+     * scalars in \c x.
+     * @param x batch of floating point values.
+     * @return the batch of largest integer values not greater than \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> floor(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::floor<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes <tt>(x*y) + z</tt> in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return the result of the fused multiply-add operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fma<A>(x, y, z, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the larger values of the batches \c x and \c y.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the larger values.
+     */
+    template <class T, class A>
+    inline batch<T, A> fmax(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::max<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the smaller values of the batches \c x and \c y.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the smaller values.
+     */
+    template <class T, class A>
+    inline batch<T, A> fmin(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::min<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the modulo of the batch \c x by the batch \c y.
+     * @param x batch involved in the modulo.
+     * @param y batch involved in the modulo.
+     * @return the result of the modulo.
+     */
+    template <class T, class A>
+    inline batch<T, A> fmod(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fmod<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes <tt>(x*y) - z</tt> in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return the result of the fused multiply-sub operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fms<A>(x, y, z, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes <tt>-(x*y) + z</tt> in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return the result of the fused negated multiply-add operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fnma<A>(x, y, z, A {});
+    }
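+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): the fused
+     * operations above differ only in the signs they apply:
+     *
+     * \code{.cpp}
+     * xsimd::batch<float> x(2.f), y(3.f), z(1.f);
+     * auto a = xsimd::fma(x, y, z);  // (x * y) + z  ->  7 per lane
+     * auto b = xsimd::fms(x, y, z);  // (x * y) - z  ->  5 per lane
+     * auto c = xsimd::fnma(x, y, z); // -(x * y) + z -> -5 per lane
+     * \endcode
+     */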
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes <tt>-(x*y) - z</tt> in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return the result of the fused negated multiply-sub operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fnms<A>(x, y, z, A {});
+    }
+
+    /**
+     * @ingroup batch_fp
+     *
+     * Splits the number \c x into a normalized fraction and an exponent which is stored in \c y.
+     * @param x a batch of floating point values.
+     * @param y a batch of integer values receiving the exponents.
+     * @return the normalized fraction of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> frexp(const batch<T, A>& x, batch<as_integer_t<T>, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::frexp<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise greater or equal comparison of batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> ge(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return x >= y;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise greater than comparison of batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> gt(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return x > y;
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Parallel horizontal addition: adds the scalars of each batch
+     * in the array pointed to by \c row and stores them in the returned
+     * batch.
+     * @param row an array of \c N batches
+     * @return the result of the reduction.
+     */
+    template <class T, class A>
+    inline batch<T, A> haddp(batch<T, A> const* row) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::haddp<A>(row, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the square root of the sum of the squares of the batches
+     * \c x and \c y.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return the square root of the sum of the squares of \c x and \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> hypot(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::hypot<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the imaginary part of the batch \c x.
+     * @param x batch of complex or real values.
+     * @return the imaginary part of \c x.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> imag(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::imag<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Adds 1 to the batch \c x.
+     * @param x batch involved in the increment.
+     * @return the sum of \c x and 1.
+     */
+    template <class T, class A>
+    inline batch<T, A> incr(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::incr<A>(x, A {});
+    }
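+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): haddp turns
+     * an array of batch<float>::size batches into one batch whose i-th lane
+     * holds the sum of all lanes of row[i]:
+     *
+     * \code{.cpp}
+     * xsimd::batch<float> row[xsimd::batch<float>::size];
+     * // ... fill each row[i] ...
+     * auto sums = xsimd::haddp(row); // sums.get(i) == horizontal sum of row[i]
+     * \endcode
+     */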
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Adds 1 to the batch \c x for each element where \c mask is true.
+     * @param x batch involved in the increment.
+     * @param mask whether to perform the increment or not. Can be a \c
+     * batch_bool or a \c batch_bool_constant.
+     * @return the sum of \c x and 1 when \c mask is true.
+     */
+    template <class T, class A, class Mask>
+    inline batch<T, A> incr_if(batch<T, A> const& x, Mask const& mask) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::incr_if<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_constant
+     *
+     * Return a batch of scalars representing positive infinity
+     * @return a batch of positive infinity
+     */
+    template <class B>
+    inline B infinity()
+    {
+        using T = typename B::value_type;
+        using A = typename B::arch_type;
+        detail::static_check_supported_config<T, A>();
+        return B(std::numeric_limits<T>::infinity());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Create a new batch equivalent to \c x but with element \c val set at position \c pos
+     * @param x batch
+     * @param val value to set
+     * @param pos index of the updated slot
+     * @return copy of \c x with position \c pos set to \c val
+     */
+    template <class T, class A, size_t I>
+    inline batch<T, A> insert(batch<T, A> const& x, T val, index<I> pos) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::insert<A>(x, val, pos, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x represent an even integer value
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> is_even(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::is_even<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the floating-point scalars in the given batch \c x represent an integer value
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> is_flint(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::is_flint<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x represent an odd integer value
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> is_odd(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::is_odd<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x are inf values.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline typename batch<T, A>::batch_bool_type isinf(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::isinf<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x are finite values.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline typename batch<T, A>::batch_bool_type isfinite(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::isfinite<A>(x, A {});
+    }
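+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): insert
+     * updates a single lane through a compile-time index:
+     *
+     * \code{.cpp}
+     * xsimd::batch<float> x(0.f);
+     * auto y = xsimd::insert(x, 1.f, xsimd::index<2> {}); // lane 2 becomes 1
+     * \endcode
+     */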
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x are NaN values.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <class T, class A>
+    inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::isnan<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the multiplication of the floating point number \c x by 2 raised to the power \c y.
+     * @param x batch of floating point values.
+     * @param y batch of integer values.
+     * @return a batch of floating point values.
+     */
+    template <class T, class A>
+    inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::ldexp<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise lesser or equal to comparison of batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> le(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return x <= y;
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the natural logarithm of the gamma function of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the natural logarithm of the gamma function of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> lgamma(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::lgamma<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr and the specified
+     * batch value type \c To. The memory needs to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<From, To, A> load_as(From const* ptr, aligned_mode) noexcept
+    {
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        detail::static_check_supported_config<From, A>();
+        detail::static_check_supported_config<To, A>();
+        return kernel::load_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
+    template <class To, class A = default_arch>
+    inline simd_return_type<bool, To, A> load_as(bool const* ptr, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        return simd_return_type<bool, To, A>::load_aligned(ptr);
+    }
+
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
+        return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
+    }
+#endif
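+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): an aligned
+     * load that also widens float data to a batch of double:
+     *
+     * \code{.cpp}
+     * alignas(xsimd::default_arch::alignment())
+     *     float buf[xsimd::batch<double>::size] = { 1.f, 2.f };
+     * auto d = xsimd::load_as<double>(buf, xsimd::aligned_mode {});
+     * \endcode
+     */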
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr and the specified
+     * batch value type \c To. The memory does not need to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<From, To, A> load_as(From const* ptr, unaligned_mode) noexcept
+    {
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        return kernel::load_unaligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
+    template <class To, class A = default_arch>
+    inline simd_return_type<bool, To, A> load_as(bool const* ptr, unaligned_mode) noexcept
+    {
+        return simd_return_type<bool, To, A>::load_unaligned(ptr);
+    }
+
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
+        return kernel::load_complex_unaligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        return load_as<To>(reinterpret_cast<std::complex<From> const*>(ptr), unaligned_mode());
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory needs to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class A = default_arch, class From>
+    inline batch<From, A> load(From const* ptr, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, aligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory does not need to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class A = default_arch, class From>
+    inline batch<From, A> load(From const* ptr, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, unaligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory needs to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class A = default_arch, class From>
+    inline batch<From, A> load_aligned(From const* ptr) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, aligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory does not need to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <class A = default_arch, class From>
+    inline batch<From, A> load_unaligned(From const* ptr) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, unaligned_mode {});
+    }
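+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): the
+     * tag-dispatched load and the named variants are interchangeable:
+     *
+     * \code{.cpp}
+     * float buf[2 * xsimd::batch<float>::size] = {};
+     * auto a = xsimd::load_unaligned(buf + 1);
+     * auto b = xsimd::load(buf + 1, xsimd::unaligned_mode {}); // same load
+     * \endcode
+     */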
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the natural logarithm of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the natural logarithm of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> log(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::log<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     * Computes the base 2 logarithm of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the base 2 logarithm of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> log2(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::log2<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     * Computes the base 10 logarithm of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the base 10 logarithm of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> log10(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::log10<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     * Computes the natural logarithm of one plus the batch \c x.
+     * @param x batch of floating point values.
+     * @return the natural logarithm of one plus \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> log1p(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::log1p<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise lesser than comparison of batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> lt(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return x < y;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the larger values of the batches \c x and \c y.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the larger values.
+     */
+    template <class T, class A>
+    inline batch<T, A> max(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::max<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the smaller values of the batches \c x and \c y.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the smaller values.
+     */
+    template <class T, class A>
+    inline batch<T, A> min(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::min<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_constant
+     *
+     * Return a batch of scalars representing negative infinity
+     * @return a batch of negative infinity
+     */
+    template <class B>
+    inline B minusinfinity() noexcept
+    {
+        using T = typename B::value_type;
+        using A = typename B::arch_type;
+        detail::static_check_supported_config<T, A>();
+        return B(-std::numeric_limits<T>::infinity());
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the integer modulo of the batch \c x by the batch \c y.
+     * @param x batch involved in the modulo.
+     * @param y batch involved in the modulo.
+     * @return the result of the modulo.
+     */
+    template <class T, class A>
+    inline auto mod(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x % y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x % y;
+    }
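+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): mod is the
+     * integer counterpart of fmod above:
+     *
+     * \code{.cpp}
+     * xsimd::batch<int32_t> n(7), d(3);
+     * auto r = xsimd::mod(n, d); // 1 per lane
+     * \endcode
+     */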
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the product of the batches \c x and \c y.
+     * @param x batch involved in the product.
+     * @param y batch involved in the product.
+     * @return the result of the product.
+     */
+    template <class T, class A>
+    inline auto mul(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x * y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x * y;
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars in \c x to integer values (in floating point format), using
+     * the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     */
+    template <class T, class A>
+    inline batch<T, A> nearbyint(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::nearbyint<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars in \c x to integer values (in integer format) using
+     * the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     *
+     * @warning For very large values the conversion to int silently overflows.
+     */
+    template <class T, class A>
+    inline batch<as_integer_t<T>, A>
+    nearbyint_as_int(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::nearbyint_as_int(x, A {});
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise inequality comparison of batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline auto neq(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x != y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x != y;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise inequality comparison of batches of boolean values \c x and \c y.
+     * @param x batch of booleans involved in the comparison.
+     * @param y batch of booleans involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <class T, class A>
+    inline auto neq(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x != y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x != y;
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the opposite of the batch \c x.
+     * @param x batch involved in the operation.
+     * @return the opposite of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> neg(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return -x;
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the next representable floating-point
+     * value following \c x in the direction of \c y.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return the next representable value after \c x in the direction of \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> nextafter(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::nextafter<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the norm of the batch \c x.
+     * @param x batch of complex or real values.
+     * @return the norm of \c x.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> norm(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::norm(x, A {});
+    }
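+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): rounding
+     * straight into an integer batch; note the overflow caveat above:
+     *
+     * \code{.cpp}
+     * xsimd::batch<float> x(2.5f);
+     * auto i = xsimd::nearbyint_as_int(x); // batch<int32_t>, ties-to-even: 2
+     * \endcode
+     */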
+
+    /**
+     * @ingroup batch_math
+     *
+     * Returns a complex batch with magnitude \c r and phase angle \c theta.
+     * @param r The magnitude of the desired complex result.
+     * @param theta The phase angle of the desired complex result.
+     * @return \c r exp(i * \c theta).
+     */
+    template <class T, class A>
+    inline complex_batch_type_t<batch<T, A>> polar(batch<T, A> const& r, batch<T, A> const& theta = batch<T, A> {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::polar<A>(r, theta, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * No-op on \c x.
+     * @param x batch involved in the operation.
+     * @return \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> pos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return +x;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the value of the batch \c x raised to the power
+     * \c y.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return \c x raised to the power \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> pow(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::pow<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the value of the batch \c x raised to the power
+     * \c y.
+     * @param x batch of scalar values.
+     * @param y integral scalar exponent.
+     * @return \c x raised to the power \c y.
+     */
+    template <class T, class ITy, class A, class = typename std::enable_if<std::is_integral<ITy>::value, void>::type>
+    inline batch<T, A> pow(batch<T, A> const& x, ITy y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::ipow<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the projection of the batch \c z.
+     * @param z batch of complex or real values.
+     * @return the projection of \c z.
+     */
+    template <class T, class A>
+    inline complex_batch_type_t<batch<T, A>> proj(batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::proj(z, A {});
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the real part of the batch \c z.
+     * @param z batch of complex or real values.
+     * @return the real part of \c z.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> real(batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::real<A>(z, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the approximate reciprocal of the batch \c x.
+     * The maximum relative error for this approximation is
+     * less than 1.5*2^-12.
+     * @param x batch of floating point numbers.
+     * @return the reciprocal.
+     */
+    template <class T, class A, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+    inline batch<T, A> reciprocal(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reciprocal(x, A {});
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Generic reducer using only batch operations
+     * @param f reducing function, accepting `batch ()(batch, batch)`
+     * @param x batch involved in the reduction
+     * @return the result of the reduction, as a scalar.
+     */
+    template <class T, class A, class F>
+    inline T reduce(F&& f, batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::detail::reduce(std::forward<F>(f), x, std::integral_constant<unsigned, batch<T, A>::size>());
+    }
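+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code): reduce builds
+     * a full reduction from a batch-level combiner:
+     *
+     * \code{.cpp}
+     * xsimd::batch<float> x(2.0f);
+     * float product = xsimd::reduce(
+     *     [](xsimd::batch<float> a, xsimd::batch<float> b) { return a * b; },
+     *     x); // product of all lanes
+     * \endcode
+     */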
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Adds all the scalars of the batch \c x.
+     * @param x batch involved in the reduction
+     * @return the result of the reduction.
+     */
+    template <class T, class A>
+    inline T reduce_add(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reduce_add<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Max of all the scalars of the batch \c x.
+     * @param x batch involved in the reduction
+     * @return the result of the reduction.
+     */
+    template <class T, class A>
+    inline T reduce_max(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reduce_max<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Min of all the scalars of the batch \c x.
+     * @param x batch involved in the reduction
+     * @return the result of the reduction.
+     */
+    template <class T, class A>
+    inline T reduce_min(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reduce_min<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the remainder of dividing \c x by \c y
+     * @param x batch of scalar values
+     * @param y batch of scalar values
+     * @return the remainder of the division.
+     */
+    template <class T, class A>
+    inline batch<T, A> remainder(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::remainder<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars in \c x to integer values (in floating point format), using
+     * the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of rounded values.
+     */
+    template <class T, class A>
+    inline batch<T, A> rint(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return nearbyint(x);
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Slide the whole batch to the left by \c N bytes, and reintroduce the
+     * slid-out elements from the right. This is different from
+     * \c rotl that rotates each batch element to the left.
+     *
+     * @tparam N Amount of bytes to rotate to the left.
+     * @param x batch of integer values.
+     * @return rotated batch.
+     */
+    template <size_t N, class T, class A>
+    inline batch<T, A> rotate_left(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotate_left<N, A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Slide the whole batch to the right by \c N bytes, and reintroduce the
+     * slid-out elements from the left. This is different from
+     * \c rotr that rotates each batch element to the right.
+     *
+     * @tparam N Amount of bytes to rotate to the right.
+     * @param x batch of integer values.
+     * @return rotated batch.
+     */
+    template <size_t N, class T, class A>
+    inline batch<T, A> rotate_right(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotate_right<N, A>(x, A {});
+    }
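+
+    /*
+     * Editor's usage sketch (illustrative, not upstream code; assumes 4
+     * uint32_t lanes): rotate_left moves whole bytes around the batch,
+     * while rotl below rotates the bits inside each lane:
+     *
+     * \code{.cpp}
+     * xsimd::batch<uint32_t> x { 1, 2, 3, 4 };
+     * auto lanes = xsimd::rotate_left<4>(x); // rotate the batch by 4 bytes
+     * auto bits = xsimd::rotl(x, 1);         // {2, 4, 6, 8}
+     * \endcode
+     */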
+     */
+    template <class T, class A>
+    inline batch<T, A> rotl(batch<T, A> const& x, int shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotl<A>(x, shift, A {});
+    }
+    template <class T, class A>
+    inline batch<T, A> rotl(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotl<A>(x, shift, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Performs a bitwise shift to the right, reintroducing the shifted-out bits
+     * to the left.
+     * @param x batch to rotate
+     * @param shift scalar amount to shift
+     * @return rotated \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> rotr(batch<T, A> const& x, int shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotr<A>(x, shift, A {});
+    }
+    template <class T, class A>
+    inline batch<T, A> rotr(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotr<A>(x, shift, A {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes the batch of nearest integer values to scalars in \c x (in
+     * floating point format), rounding halfway cases away from zero, regardless
+     * of the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     */
+    template <class T, class A>
+    inline batch<T, A> round(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::round<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes an estimate of the inverse square root of the batch \c x.
+     *
+     * @warning Unlike most xsimd functions, this does not return the same result as the
+     * equivalent scalar operation, trading accuracy for speed.
+     *
+     * @param x batch of floating point values.
+     * @return the inverse square root of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> rsqrt(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rsqrt<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the saturated sum of the batch \c x and the batch \c y.
+     * @param x batch involved in the saturated addition.
+     * @param y batch involved in the saturated addition.
+     * @return the result of the saturated addition.
+     */
+    template <class T, class A>
+    inline batch<T, A> sadd(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sadd<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Ternary operator for batches: selects values from the batches \c true_br or \c false_br
+     * depending on the boolean values in the batch \c cond. Equivalent to
+     * \code{.cpp}
+     * for(std::size_t i = 0; i < N; ++i)
+     *     res[i] = cond[i] ? true_br[i] : false_br[i];
+     * \endcode
+     * @param cond batch condition.
+     * @param true_br batch values for truthy condition.
+     * @param false_br batch values for falsy condition.
+     * @return the result of the selection.
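+     * A minimal usage sketch (illustrative):
+     * \code{.cpp}
+     * xsimd::batch<float> a(1.f), b(2.f);
+     * auto m = xsimd::select(a > b, a, b); // element-wise max of a and b
+     * \endcode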
+     */
+    template <class T, class A>
+    inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::select<A>(cond, true_br, false_br, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Ternary operator for batches: selects values from the batches \c true_br or \c false_br
+     * depending on the boolean values in the batch \c cond. Equivalent to
+     * \code{.cpp}
+     * for(std::size_t i = 0; i < N; ++i)
+     *     res[i] = cond[i] ? true_br[i] : false_br[i];
+     * \endcode
+     * @param cond batch condition.
+     * @param true_br batch values for truthy condition.
+     * @param false_br batch values for falsy condition.
+     * @return the result of the selection.
+     */
+    template <class T, class A>
+    inline batch<std::complex<T>, A> select(batch_bool<T, A> const& cond, batch<std::complex<T>, A> const& true_br, batch<std::complex<T>, A> const& false_br) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::select<A>(cond, true_br, false_br, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Ternary operator for batches: selects values from the batches \c true_br or \c false_br
+     * depending on the boolean values in the constant batch \c cond. Equivalent to
+     * \code{.cpp}
+     * for(std::size_t i = 0; i < N; ++i)
+     *     res[i] = cond[i] ? true_br[i] : false_br[i];
+     * \endcode
+     * @param cond constant batch condition.
+     * @param true_br batch values for truthy condition.
+     * @param false_br batch values for falsy condition.
+     * @return the result of the selection.
+     */
+    template <class T, class A, bool... Values>
+    inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::select<A>(cond, true_br, false_br, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Combines elements from \c x and \c y according to selector \c mask.
+     * @param x batch
+     * @param y batch
+     * @param mask constant batch mask of integer elements of the same size as
+     * the elements of \c x and \c y. Each element of the mask indexes into the
+     * vector formed by the concatenation of \c x and \c y. For instance
+     * \code{.cpp}
+     * batch_constant<batch<uint32_t, sse2>, 0, 4, 3, 7>
+     * \endcode
+     * picks \c x[0], \c y[0], \c x[3], \c y[3].
+     *
+     * @return combined batch
+     */
+    template <class T, class A, class Vt, Vt... Values>
+    inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+    shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        return kernel::shuffle<A>(x, y, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes the sign of \c x.
+     * @param x batch
+     * @return -1 for each negative element, 0 for each zero element and +1 for
+     * each positive element
+     */
+    template <class T, class A>
+    inline batch<T, A> sign(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sign<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes the sign of \c x, assuming \c x contains no zero.
+     * @param x batch
+     * @return -1 for each negative element, +1 for each positive element, and
+     * -1 or +1 for each zero element, depending on its sign bit
+     */
+    template <class T, class A>
+    inline batch<T, A> signnz(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::signnz<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the sine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> sin(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sin<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the sine and the cosine of the batch \c x. This method is faster
+     * than calling sine and cosine independently.
+     * @param x batch of floating point values.
+     * @return a pair containing the sine then the cosine of batch \c x
+     */
+    template <class T, class A>
+    inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sincos<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the hyperbolic sine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the hyperbolic sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> sinh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sinh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Slides the whole batch to the left by \c N bytes. This is different from
+     * \c bitwise_lshift, which shifts each batch element to the left.
+     *
+     * @tparam N Amount of bytes to slide to the left.
+     * @param x batch of integer values.
+     * @return the slid batch.
+     */
+    template <size_t N, class T, class A>
+    inline batch<T, A> slide_left(batch<T, A> const& x) noexcept
+    {
+        static_assert(std::is_integral<T>::value, "can only slide batch of integers");
+        detail::static_check_supported_config<T, A>();
+        return kernel::slide_left<N, A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Slides the whole batch to the right by \c N bytes. This is different from
+     * \c bitwise_rshift, which shifts each batch element to the right.
+     *
+     * @tparam N Amount of bytes to slide to the right.
+     * @param x batch of integer values.
+     * @return the slid batch.
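+     * A minimal usage sketch (illustrative; assumes a 4-element batch of
+     * uint32_t, i.e. a 16-byte register, with an SSE-style lane layout):
+     * \code{.cpp}
+     * xsimd::batch<uint32_t> v { 1u, 2u, 3u, 4u };
+     * auto r = xsimd::slide_right<4>(v); // {2u, 3u, 4u, 0u}
+     * \endcode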
+     */
+    template <size_t N, class T, class A>
+    inline batch<T, A> slide_right(batch<T, A> const& x) noexcept
+    {
+        static_assert(std::is_integral<T>::value, "can only slide batch of integers");
+        detail::static_check_supported_config<T, A>();
+        return kernel::slide_right<N, A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the square root of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the square root of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> sqrt(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sqrt<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the saturated difference of the batch \c x and the batch \c y.
+     * @param x batch involved in the saturated difference.
+     * @param y batch involved in the saturated difference.
+     * @return the result of the saturated difference.
+     */
+    template <class T, class A>
+    inline batch<T, A> ssub(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::ssub<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copies the content of batch \c src to the buffer \c dst. The
+     * memory needs to be aligned.
+     * @param dst the memory buffer to write to
+     * @param src the batch to copy
+     */
+    template <class To, class A = default_arch, class From>
+    inline void store_as(To* dst, batch<From, A> const& src, aligned_mode) noexcept
+    {
+        kernel::store_aligned(dst, src, A {});
+    }
+
+    template <class A = default_arch, class From>
+    inline void store_as(bool* dst, batch_bool<From, A> const& src, aligned_mode) noexcept
+    {
+        kernel::store(src, dst, A {});
+    }
+
+    template <class To, class A = default_arch, class From>
+    inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
+    {
+        kernel::store_complex_aligned(dst, src, A {});
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
+    {
+        store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode());
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copies the content of batch \c src to the buffer \c dst. The
+     * memory does not need to be aligned.
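+     * A minimal usage sketch (illustrative; `buf` may live at any address):
+     * \code{.cpp}
+     * xsimd::batch<float> v(1.5f);
+     * float buf[xsimd::batch<float>::size];
+     * xsimd::store_as(buf, v, xsimd::unaligned_mode {});
+     * \endcode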
+     * @param dst the memory buffer to write to
+     * @param src the batch to copy
+     */
+    template <class To, class A = default_arch, class From>
+    inline void store_as(To* dst, batch<From, A> const& src, unaligned_mode) noexcept
+    {
+        kernel::store_unaligned(dst, src, A {});
+    }
+
+    template <class A = default_arch, class From>
+    inline void store_as(bool* dst, batch_bool<From, A> const& src, unaligned_mode) noexcept
+    {
+        kernel::store(src, dst, A {});
+    }
+
+    template <class To, class A = default_arch, class From>
+    inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
+    {
+        kernel::store_complex_unaligned(dst, src, A {});
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
+    {
+        store_as(reinterpret_cast<std::complex<To>*>(dst), src, unaligned_mode());
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copies the content of batch \c val to the buffer \c mem. The
+     * memory needs to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store(T* mem, batch<T, A> const& val, aligned_mode = {}) noexcept
+    {
+        store_as<T, A>(mem, val, aligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copies the content of batch \c val to the buffer \c mem. The
+     * memory does not need to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store(T* mem, batch<T, A> const& val, unaligned_mode) noexcept
+    {
+        store_as<T, A>(mem, val, unaligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copies the content of batch \c val to the buffer \c mem. The
+     * memory needs to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store_aligned(T* mem, batch<T, A> const& val) noexcept
+    {
+        store_as<T, A>(mem, val, aligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copies the content of batch \c val to the buffer \c mem. The
+     * memory does not need to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store_unaligned(T* mem, batch<T, A> const& val) noexcept
+    {
+        store_as<T, A>(mem, val, unaligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the difference between \c x and \c y.
+     * @param x scalar or batch of scalars
+     * @param y scalar or batch of scalars
+     * @return the difference between \c x and \c y
+     */
+    template <class T, class A>
+    inline auto sub(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x - y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x - y;
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Rearranges elements from \c x according to constant mask \c mask.
+     * @param x batch
+     * @param mask constant batch mask of integer elements of the same size as
+     * the elements of \c x
+     * @return swizzled batch
+     */
+    template <class T, class A, class Vt, Vt... Values>
+    inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+    swizzle(batch<T, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        return kernel::swizzle<A>(x, mask, A {});
+    }
+    template <class T, class A, class Vt, Vt... Values>
+    inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        return kernel::swizzle<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Rearranges elements from \c x according to mask \c mask.
+     * @param x batch
+     * @param mask batch mask of integer elements of the same size as
+     * the elements of \c x
+     * @return swizzled batch
+     */
+    template <class T, class A, class Vt>
+    inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+    swizzle(batch<T, A> const& x, batch<Vt, A> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        return kernel::swizzle<A>(x, mask, A {});
+    }
+
+    template <class T, class A, class Vt>
+    inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch<Vt, A> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        return kernel::swizzle<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the tangent of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> tan(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::tan<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the hyperbolic tangent of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the hyperbolic tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> tanh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::tanh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the gamma function of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the gamma function of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> tgamma(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::tgamma<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Performs a conversion from \c i to a value of a floating point type of the same size as \c T.
+     * This is equivalent to \c batch_cast<as_float_t<T>>(i).
+     * @param i batch of integers.
+     * @return \c i converted to a value of a floating point type of the same size as \c T
+     */
+    template <class T, class A>
+    inline batch<as_float_t<T>, A> to_float(batch<T, A> const& i) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch_cast<as_float_t<T>>(i);
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Performs a conversion from \c x to a value of an integer type of the same size as \c T.
+     * This is equivalent to \c batch_cast<as_integer_t<T>>(x).
+     * @param x batch.
+     * @return \c x converted to a value of an integer type of the same size as \c T
+     */
+    template <class T, class A>
+    inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch_cast<as_integer_t<T>>(x);
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes the batch of nearest integer values not greater in magnitude
+     * than scalars in \c x.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values not greater in magnitude than \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> trunc(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::trunc<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Unpacks and interleaves data from the HIGH half of batches \c x and \c y,
+     * and stores the result in the returned batch.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the interleaved high halves.
+     */
+    template <class T, class A>
+    inline batch<T, A> zip_hi(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::zip_hi<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Unpacks and interleaves data from the LOW half of batches \c x and \c y,
+     * and stores the result in the returned batch.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the interleaved low halves.
+     */
+    template <class T, class A>
+    inline batch<T, A> zip_lo(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::zip_lo<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Casts a \c batch_bool of \c T into a \c batch of the same type, using the
+     * following rule: if an element of \c self is true, it maps to -1 in the
+     * returned integral batch, otherwise it maps to 0.
+     *
+     * @param self batch_bool of \c T
+     * @return \c self cast to a \c batch of \c T
+     */
+    template <class T, class A, typename std::enable_if<std::is_integral<T>::value, int>::type = 3>
+    inline batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
+    {
+        T z(0);
+        detail::static_check_supported_config<T, A>();
+        return select(self, batch<T, A>(T(~z)), batch<T, A>(z));
+    }
+
+    template <class T, class A, typename std::enable_if<std::is_floating_point<T>::value, int>::type = 3>
+    inline batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
+    {
+        T z0(0), z1(0);
+        using int_type = as_unsigned_integer_t<T>;
+        int_type value(~int_type(0));
+        std::memcpy(&z1, &value, sizeof(int_type));
+        detail::static_check_supported_config<T, A>();
+        return select(self, batch<T, A>(z1), batch<T, A>(z0));
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Returns true if all the boolean values in the batch are true,
+     * false otherwise.
+     * @param x the batch to reduce.
+     * @return a boolean scalar.
+     */
+    template <class T, class A>
+    inline bool all(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::all<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Return true if any of the boolean values in the batch is true,
+     * false otherwise.
+     * @param x the batch to reduce.
+ * @return a boolean scalar. + */ + template <class T, class A> + inline bool any(batch_bool<T, A> const& x) noexcept + { + detail::static_check_supported_config<T, A>(); + return kernel::any<A>(x, A {}); + } + + /** + * @ingroup batch_bool_reducers + * + * Return true if none of the boolean values in the batch is true, + * false otherwise. + * @param x the batch to reduce. + * @return a boolean scalar. + */ + template <class T, class A> + inline bool none(batch_bool<T, A> const& x) noexcept + { + detail::static_check_supported_config<T, A>(); + return !xsimd::any(x); + } + + /** + * @ingroup batch_miscellaneous + * + * Dump the content of batch \c x to stream \c o + * @param o the stream where the batch is dumped + * @param x batch to dump. + * @return a reference to \c o + */ + template <class T, class A> + inline std::ostream& operator<<(std::ostream& o, batch<T, A> const& x) noexcept + { + detail::static_check_supported_config<T, A>(); + constexpr auto size = batch<T, A>::size; + alignas(A::alignment()) T buffer[size]; + x.store_aligned(&buffer[0]); + o << '('; + for (std::size_t i = 0; i < size - 1; ++i) + o << buffer[i] << ", "; + return o << buffer[size - 1] << ')'; + } + + /** + * @ingroup batch_miscellaneous + * + * Dump the content of batch \c x to stream \c o + * @param o the stream where the batch is dumped + * @param x batch to dump. + * @return a reference to \c o + */ + template <class T, class A> + inline std::ostream& operator<<(std::ostream& o, batch_bool<T, A> const& x) noexcept + { + detail::static_check_supported_config<T, A>(); + constexpr auto size = batch_bool<T, A>::size; + alignas(A::alignment()) bool buffer[size]; + x.store_aligned(&buffer[0]); + o << '('; + for (std::size_t i = 0; i < size - 1; ++i) + o << buffer[i] << ", "; + return o << buffer[size - 1] << ')'; + } +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx2_register.hpp new file mode 100644 index 0000000000..cd10383e2b --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx2_register.hpp @@ -0,0 +1,40 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX2_REGISTER_HPP +#define XSIMD_AVX2_REGISTER_HPP + +#include "./xsimd_avx_register.hpp" + +namespace xsimd +{ + /** + * @ingroup architectures + * + * AVX2 instructions + */ + struct avx2 : avx + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX2; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 2, 0); } + static constexpr char const* name() noexcept { return "avx2"; } + }; + +#if XSIMD_WITH_AVX2 + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx2, avx); + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp new file mode 100644 index 0000000000..15c19832ae --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512bw_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512BW_REGISTER_HPP +#define XSIMD_AVX512BW_REGISTER_HPP + +#include "./xsimd_avx512dq_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512BW instructions + */ + struct avx512bw : avx512dq + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512BW; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 4, 0); } + static constexpr char const* name() noexcept { return "avx512bw"; } + }; + +#if XSIMD_WITH_AVX512BW + + namespace types + { + template <class T> + struct get_bool_simd_register<T, avx512bw> + { + using type = simd_avx512_bool_register<T>; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512bw, avx512dq); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp new file mode 100644 index 0000000000..29efca368c --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512cd_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512CD_REGISTER_HPP +#define XSIMD_AVX512CD_REGISTER_HPP + +#include "./xsimd_avx512f_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512CD instructions + */ + struct avx512cd : avx512f + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512CD; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 2, 0); } + static constexpr char const* name() noexcept { return "avx512cd"; } + }; + +#if XSIMD_WITH_AVX512CD + + namespace types + { + template <class T> + struct get_bool_simd_register<T, avx512cd> + { + using type = simd_avx512_bool_register<T>; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512cd, avx512f); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp new file mode 100644 index 0000000000..25a255ec15 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512dq_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512DQ_REGISTER_HPP +#define XSIMD_AVX512DQ_REGISTER_HPP + +#include "./xsimd_avx512cd_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512DQ instructions + */ + struct avx512dq : avx512cd + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512DQ; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 3, 0); } + static constexpr char const* name() noexcept { return "avx512dq"; } + }; + +#if XSIMD_WITH_AVX512DQ + + namespace types + { + template <class T> + struct get_bool_simd_register<T, avx512dq> + { + using type = simd_avx512_bool_register<T>; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512dq, avx512cd); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp new file mode 100644 index 0000000000..a99157cf37 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512er_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512ER_REGISTER_HPP +#define XSIMD_AVX512ER_REGISTER_HPP + +#include "./xsimd_avx512dq_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512ER instructions + */ + struct avx512er : avx512cd + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512ER; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 3, 1); } + static constexpr char const* name() noexcept { return "avx512er"; } + }; + +#if XSIMD_WITH_AVX512ER + + namespace types + { + template <class T> + struct get_bool_simd_register<T, avx512er> + { + using type = simd_avx512_bool_register<T>; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512er, avx512cd); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp new file mode 100644 index 0000000000..c1f80a122d --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512f_register.hpp @@ -0,0 +1,74 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512F_REGISTER_HPP +#define XSIMD_AVX512F_REGISTER_HPP + +#include "./xsimd_generic_arch.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512F instructions + */ + struct avx512f : generic + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512F; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 1, 0); } + static constexpr std::size_t alignment() noexcept { return 64; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr char const* name() noexcept { return "avx512f"; } + }; + +#if XSIMD_WITH_AVX512F + + namespace types + { + template <class T> + struct simd_avx512_bool_register + { + using register_type = typename std::conditional< + (sizeof(T) < 4), std::conditional<(sizeof(T) == 1), __mmask64, __mmask32>, + std::conditional<(sizeof(T) == 4), __mmask16, __mmask8>>::type::type; + register_type data; + simd_avx512_bool_register() = default; + simd_avx512_bool_register(register_type r) { data = r; } + operator register_type() const noexcept { return data; } + }; + template <class T> + struct get_bool_simd_register<T, avx512f> + { + using type = simd_avx512_bool_register<T>; + }; + + XSIMD_DECLARE_SIMD_REGISTER(signed char, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(char, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(short, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(long int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx512f, __m512i); + 
XSIMD_DECLARE_SIMD_REGISTER(long long int, avx512f, __m512i); + XSIMD_DECLARE_SIMD_REGISTER(float, avx512f, __m512); + XSIMD_DECLARE_SIMD_REGISTER(double, avx512f, __m512d); + + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp new file mode 100644 index 0000000000..ba76ea147b --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512ifma_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512IFMA_REGISTER_HPP +#define XSIMD_AVX512IFMA_REGISTER_HPP + +#include "./xsimd_avx512bw_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512IFMA instructions + */ + struct avx512ifma : avx512bw + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512IFMA; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 5, 0); } + static constexpr char const* name() noexcept { return "avx512ifma"; } + }; + +#if XSIMD_WITH_AVX512IFMA + + namespace types + { + template <class T> + struct get_bool_simd_register<T, avx512ifma> + { + using type = simd_avx512_bool_register<T>; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512ifma, avx512bw); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp new file mode 100644 index 0000000000..38a10f0227 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512pf_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
*
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512PF_REGISTER_HPP
+#define XSIMD_AVX512PF_REGISTER_HPP
+
+#include "./xsimd_avx512er_register.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX512PF instructions
+     */
+    struct avx512pf : avx512er
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512PF; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr unsigned version() noexcept { return generic::version(3, 4, 1); }
+        static constexpr char const* name() noexcept { return "avx512pf"; }
+    };
+
+#if XSIMD_WITH_AVX512PF
+
+    namespace types
+    {
+        template <class T>
+        struct get_bool_simd_register<T, avx512pf>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512pf, avx512er);
+
+    }
+#endif
+}
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp
new file mode 100644
index 0000000000..19ff744d72
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vbmi_register.hpp
@@ -0,0 +1,48 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX512VBMI_REGISTER_HPP
+#define XSIMD_AVX512VBMI_REGISTER_HPP
+
+#include "./xsimd_avx512ifma_register.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * @ingroup architectures
+     *
+     * AVX512VBMI instructions
+     */
+    struct avx512vbmi : avx512ifma
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VBMI; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr unsigned version() noexcept { return generic::version(3, 6, 0); }
+        static constexpr char const* name() noexcept { return "avx512vbmi"; }
+    };
+
+#if XSIMD_WITH_AVX512VBMI
+
+    namespace types
+    {
+        template <class T>
+        struct get_bool_simd_register<T, avx512vbmi>
+        {
+            using type = simd_avx512_bool_register<T>;
+        };
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vbmi, avx512ifma);
+
+    }
+#endif
+}
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp
new file mode 100644
index 0000000000..85edbdf230
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512bw_register.hpp
@@ -0,0 +1,51 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software.
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512BW_REGISTER_HPP +#define XSIMD_AVX512VNNI_AVX512BW_REGISTER_HPP + +#include "./xsimd_avx512bw_register.hpp" + +namespace xsimd +{ + template <typename arch> + struct avx512vnni; + + /** + * @ingroup architectures + * + * AVX512VNNI instructions + */ + template <> + struct avx512vnni<avx512bw> : avx512bw + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512BW; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 4, 1); } + static constexpr char const* name() noexcept { return "avx512vnni+avx512bw"; } + }; + +#if XSIMD_WITH_AVX512VNNI_AVX512BW + + namespace types + { + template <class T> + struct get_bool_simd_register<T, avx512vnni<avx512bw>> + { + using type = simd_avx512_bool_register<T>; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni<avx512bw>, avx512bw); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp new file mode 100644 index 0000000000..232b19a5cb --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_avx512vbmi_register.hpp @@ -0,0 +1,51 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_AVX512VBMI_REGISTER_HPP +#define XSIMD_AVX512VNNI_AVX512VBMI_REGISTER_HPP + +#include "./xsimd_avx512vbmi_register.hpp" + +namespace xsimd +{ + template <typename arch> + struct avx512vnni; + + /** + * @ingroup architectures + * + * AVX512VNNI instructions + */ + template <> + struct avx512vnni<avx512vbmi> : avx512vbmi + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI_AVX512VBMI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 6, 1); } + static constexpr char const* name() noexcept { return "avx512vnni+avx512vbmi"; } + }; + +#if XSIMD_WITH_AVX512VNNI_AVX512VBMI + + namespace types + { + template <class T> + struct get_bool_simd_register<T, avx512vnni<avx512vbmi>> + { + using type = simd_avx512_bool_register<T>; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni<avx512vbmi>, avx512vbmi); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_register.hpp new file mode 100644 index 0000000000..c276fb0079 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx512vnni_register.hpp @@ -0,0 +1,48 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_AVX512VNNI_REGISTER_HPP +#define XSIMD_AVX512VNNI_REGISTER_HPP + +#include "./xsimd_avx512vbmi_register.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX512VNNI instructions + */ + struct avx512vnni : avx512vbmi + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX512VNNI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(3, 7, 0); } + static constexpr char const* name() noexcept { return "avx512vnni"; } + }; + +#if XSIMD_WITH_AVX512VNNI + + namespace types + { + template <class T> + struct get_bool_simd_register<T, avx512vnni> + { + using type = simd_avx512_bool_register<T>; + }; + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avx512vnni, avx512vbmi); + + } +#endif +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp new file mode 100644 index 0000000000..6b1951f964 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avx_register.hpp @@ -0,0 +1,61 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVX_REGISTER_HPP +#define XSIMD_AVX_REGISTER_HPP + +#include "./xsimd_generic_arch.hpp" + +namespace xsimd +{ + + /** + * @ingroup architectures + * + * AVX instructions + */ + struct avx : generic + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVX; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 1, 0); } + static constexpr std::size_t alignment() noexcept { return 32; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr char const* name() noexcept { return "avx"; } + }; +} + +#if XSIMD_WITH_AVX + +#include <immintrin.h> + +namespace xsimd +{ + namespace types + { + + XSIMD_DECLARE_SIMD_REGISTER(signed char, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(char, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(short, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(long int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(long long int, avx, __m256i); + XSIMD_DECLARE_SIMD_REGISTER(float, avx, __m256); + XSIMD_DECLARE_SIMD_REGISTER(double, avx, __m256d); + } +} +#endif +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp new file mode 100644 index 0000000000..f68fe16bad --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_avxvnni_register.hpp @@ -0,0 +1,40 @@ +/*************************************************************************** + * Copyright 
(c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_AVXVNNI_REGISTER_HPP +#define XSIMD_AVXVNNI_REGISTER_HPP + +#include "./xsimd_avx2_register.hpp" + +namespace xsimd +{ + /** + * @ingroup architectures + * + * AVXVNNI instructions + */ + struct avxvnni : avx2 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_AVXVNNI; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 3, 0); } + static constexpr char const* name() noexcept { return "avxvnni"; } + }; + +#if XSIMD_WITH_AVXVNNI + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(avxvnni, avx2); + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp b/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp new file mode 100644 index 0000000000..b4989fc88d --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_batch.hpp @@ -0,0 +1,1492 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_BATCH_HPP +#define XSIMD_BATCH_HPP + +#include <cassert> +#include <complex> + +#include "../config/xsimd_arch.hpp" +#include "../memory/xsimd_alignment.hpp" +#include "./xsimd_utils.hpp" + +namespace xsimd +{ + template <class T, class A = default_arch> + class batch; + + namespace types + { + template <class T, class A> + struct integral_only_operators + { + inline batch<T, A>& operator%=(batch<T, A> const& other) noexcept; + inline batch<T, A>& operator>>=(int32_t other) noexcept; + inline batch<T, A>& operator>>=(batch<T, A> const& other) noexcept; + inline batch<T, A>& operator<<=(int32_t other) noexcept; + inline batch<T, A>& operator<<=(batch<T, A> const& other) noexcept; + + /** Shorthand for xsimd::mod() */ + friend inline batch<T, A> operator%(batch<T, A> const& self, batch<T, A> const& other) noexcept + { + return batch<T, A>(self) %= other; + } + + /** Shorthand for xsimd::bitwise_rshift() */ + friend inline batch<T, A> operator>>(batch<T, A> const& self, batch<T, A> const& other) noexcept + { + return batch<T, A>(self) >>= other; + } + + /** Shorthand for xsimd::bitwise_lshift() */ + friend inline batch<T, A> operator<<(batch<T, A> const& self, batch<T, A> const& other) noexcept + { + return batch<T, A>(self) <<= other; + } + + /** Shorthand for xsimd::bitwise_rshift() */ + friend inline batch<T, A> operator>>(batch<T, A> const& self, int32_t other) noexcept + { + return batch<T, A>(self) >>= other; + } + + /** Shorthand for xsimd::bitwise_lshift() */ + friend inline batch<T, A> operator<<(batch<T, A> const& self, int32_t other) noexcept + { + return batch<T, A>(self) <<= other; + } + }; + template <class A> + struct integral_only_operators<float, A> + { + }; + template <class A> + struct integral_only_operators<double, A> + { + }; + 
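+        // The empty float and double specializations above deliberately strip
+        // the modulo and shift operators from floating point batches.
+        // Illustrative sketch (assumes 32-bit integer and float batches):
+        //     xsimd::batch<int32_t> i(7);
+        //     auto r = i % xsimd::batch<int32_t>(3); // well-formed
+        //     xsimd::batch<float> f(7.f);            // f % f would not compile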
+    }
+
+    namespace details
+    {
+        // These functions are forward-declared here so that they can be used by friend functions
+        // with batch<T, A>. Their implementations must appear only after the
+        // kernel implementations have been included.
+        template <class T, class A>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+
+        template <class T, class A>
+        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+
+        template <class T, class A>
+        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+
+        template <class T, class A>
+        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+
+        template <class T, class A>
+        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+
+        template <class T, class A>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+    }
+
+    /**
+     * @brief batch of integer or floating point values.
+     *
+     * Abstract representation of a SIMD register for floating point or integral
+     * values.
+     *
+     * @tparam T the type of the underlying values.
+     * @tparam A the architecture this batch is tied to.
+     **/
+    template <class T, class A>
+    class batch : public types::simd_register<T, A>, public types::integral_only_operators<T, A>
+    {
+        static_assert(!std::is_same<T, bool>::value, "use xsimd::batch_bool<T, A> instead of xsimd::batch<bool, A>");
+
+    public:
+        static constexpr std::size_t size = sizeof(types::simd_register<T, A>) / sizeof(T); ///< Number of scalar elements in this batch.
+
+        using value_type = T; ///< Type of the scalar elements within this batch.
+        using arch_type = A; ///< SIMD Architecture abstracted by this batch.
+        using register_type = typename types::simd_register<T, A>::register_type; ///< SIMD register type abstracted by this batch.
+        using batch_bool_type = batch_bool<T, A>; ///< Associated batch type used to represent logical operations on this batch.
+
+        // constructors
+        inline batch() = default; ///< Create a batch initialized with undefined values.
+        inline batch(T val) noexcept;
+        template <class... Ts>
+        inline batch(T val0, T val1, Ts... vals) noexcept;
+        inline explicit batch(batch_bool_type const& b) noexcept;
+        inline batch(register_type reg) noexcept;
+
+        template <class U>
+        XSIMD_NO_DISCARD static inline batch broadcast(U val) noexcept;
+
+        // memory operators
+        template <class U>
+        inline void store_aligned(U* mem) const noexcept;
+        template <class U>
+        inline void store_unaligned(U* mem) const noexcept;
+        template <class U>
+        inline void store(U* mem, aligned_mode) const noexcept;
+        template <class U>
+        inline void store(U* mem, unaligned_mode) const noexcept;
+
+        template <class U>
+        XSIMD_NO_DISCARD static inline batch load_aligned(U const* mem) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static inline batch load_unaligned(U const* mem) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static inline batch load(U const* mem, aligned_mode) noexcept;
+        template <class U>
+        XSIMD_NO_DISCARD static inline batch load(U const* mem, unaligned_mode) noexcept;
+
+        template <class U, class V>
+        XSIMD_NO_DISCARD static inline batch gather(U const* src, batch<V, arch_type> const& index) noexcept;
+        template <class U, class V>
+        inline void scatter(U* dst, batch<V, arch_type> const& index) const noexcept;
+
+        inline T get(std::size_t i) const noexcept;
+
+        // comparison operators.
Defined as friend to enable automatic + // conversion of parameters from scalar to batch, at the cost of using a + // proxy implementation from details::. + friend inline batch_bool<T, A> operator==(batch const& self, batch const& other) noexcept + { + return details::eq<T, A>(self, other); + } + friend inline batch_bool<T, A> operator!=(batch const& self, batch const& other) noexcept + { + return details::neq<T, A>(self, other); + } + friend inline batch_bool<T, A> operator>=(batch const& self, batch const& other) noexcept + { + return details::ge<T, A>(self, other); + } + friend inline batch_bool<T, A> operator<=(batch const& self, batch const& other) noexcept + { + return details::le<T, A>(self, other); + } + friend inline batch_bool<T, A> operator>(batch const& self, batch const& other) noexcept + { + return details::gt<T, A>(self, other); + } + friend inline batch_bool<T, A> operator<(batch const& self, batch const& other) noexcept + { + return details::lt<T, A>(self, other); + } + + // Update operators + inline batch& operator+=(batch const& other) noexcept; + inline batch& operator-=(batch const& other) noexcept; + inline batch& operator*=(batch const& other) noexcept; + inline batch& operator/=(batch const& other) noexcept; + inline batch& operator&=(batch const& other) noexcept; + inline batch& operator|=(batch const& other) noexcept; + inline batch& operator^=(batch const& other) noexcept; + + // incr/decr operators + inline batch& operator++() noexcept; + inline batch& operator--() noexcept; + inline batch operator++(int) noexcept; + inline batch operator--(int) noexcept; + + // unary operators + inline batch_bool_type operator!() const noexcept; + inline batch operator~() const noexcept; + inline batch operator-() const noexcept; + inline batch operator+() const noexcept; + + // arithmetic operators. They are defined as friend to enable automatic + // conversion of parameters from scalar to batch. Inline implementation + // is required to avoid warnings. 
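+        // Illustrative sketch of what these friend operators enable: a scalar
+        // operand is implicitly converted to a batch, so mixed expressions work:
+        //     xsimd::batch<float> x(1.f);
+        //     auto y = x + 2.f; // 2.f is broadcast to a batch, then added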
+
+        /** Shorthand for xsimd::add() */
+        friend inline batch operator+(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) += other;
+        }
+
+        /** Shorthand for xsimd::sub() */
+        friend inline batch operator-(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) -= other;
+        }
+
+        /** Shorthand for xsimd::mul() */
+        friend inline batch operator*(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) *= other;
+        }
+
+        /** Shorthand for xsimd::div() */
+        friend inline batch operator/(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) /= other;
+        }
+
+        /** Shorthand for xsimd::bitwise_and() */
+        friend inline batch operator&(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) &= other;
+        }
+
+        /** Shorthand for xsimd::bitwise_or() */
+        friend inline batch operator|(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) |= other;
+        }
+
+        /** Shorthand for xsimd::bitwise_xor() */
+        friend inline batch operator^(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) ^= other;
+        }
+
+        /** Shorthand for xsimd::logical_and() */
+        friend inline batch operator&&(batch const& self, batch const& other) noexcept
+        {
+            return batch(self).logical_and(other);
+        }
+
+        /** Shorthand for xsimd::logical_or() */
+        friend inline batch operator||(batch const& self, batch const& other) noexcept
+        {
+            return batch(self).logical_or(other);
+        }
+
+    private:
+        inline batch logical_and(batch const& other) const noexcept;
+        inline batch logical_or(batch const& other) const noexcept;
+    };
+
+    template <class T, class A>
+    constexpr std::size_t batch<T, A>::size;
+
+    /**
+     * @brief batch of predicates over scalar or complex values.
+     *
+     * Abstract representation of a predicate over a SIMD register for scalar or
+     * complex values.
+     *
+     * @tparam T the type of the predicated values.
+     * @tparam A the architecture this batch is tied to.
+     **/
+    template <class T, class A = default_arch>
+    class batch_bool : public types::get_bool_simd_register_t<T, A>
+    {
+        using base_type = types::get_bool_simd_register_t<T, A>;
+
+    public:
+        static constexpr std::size_t size = sizeof(types::simd_register<T, A>) / sizeof(T); ///< Number of scalar elements in this batch.
+
+        using value_type = bool; ///< Type of the scalar elements within this batch.
+        using arch_type = A; ///< SIMD Architecture abstracted by this batch.
+        using register_type = typename base_type::register_type; ///< SIMD register type abstracted by this batch.
+        using batch_type = batch<T, A>; ///< Associated batch type this batch represents logical operations for.
+
+        // constructors
+        inline batch_bool() = default; ///< Create a batch initialized with undefined values.
+        inline batch_bool(bool val) noexcept;
+        inline batch_bool(register_type reg) noexcept;
+        template <class... Ts>
+        inline batch_bool(bool val0, bool val1, Ts... vals) noexcept;
vals) noexcept; + + template <class Tp> + inline batch_bool(Tp const*) = delete; + + // memory operators + inline void store_aligned(bool* mem) const noexcept; + inline void store_unaligned(bool* mem) const noexcept; + XSIMD_NO_DISCARD static inline batch_bool load_aligned(bool const* mem) noexcept; + XSIMD_NO_DISCARD static inline batch_bool load_unaligned(bool const* mem) noexcept; + + inline bool get(std::size_t i) const noexcept; + + // mask operations + inline uint64_t mask() const noexcept; + inline static batch_bool from_mask(uint64_t mask) noexcept; + + // comparison operators + inline batch_bool operator==(batch_bool const& other) const noexcept; + inline batch_bool operator!=(batch_bool const& other) const noexcept; + + // logical operators + inline batch_bool operator~() const noexcept; + inline batch_bool operator!() const noexcept; + inline batch_bool operator&(batch_bool const& other) const noexcept; + inline batch_bool operator|(batch_bool const& other) const noexcept; + inline batch_bool operator^(batch_bool const& other) const noexcept; + inline batch_bool operator&&(batch_bool const& other) const noexcept; + inline batch_bool operator||(batch_bool const& other) const noexcept; + + // update operators + inline batch_bool& operator&=(batch_bool const& other) noexcept { return (*this) = (*this) & other; } + inline batch_bool& operator|=(batch_bool const& other) noexcept { return (*this) = (*this) | other; } + inline batch_bool& operator^=(batch_bool const& other) noexcept { return (*this) = (*this) ^ other; } + + private: + template <class U, class... V, size_t I, size_t... Is> + static inline register_type make_register(detail::index_sequence<I, Is...>, U u, V... v) noexcept; + + template <class... V> + static inline register_type make_register(detail::index_sequence<>, V... v) noexcept; + }; + + template <class T, class A> + constexpr std::size_t batch_bool<T, A>::size; + + /** + * @brief batch of complex values. + * + * Abstract representation of an SIMD register for complex values. + * + * @tparam T the type of the underlying values. + * @tparam A the architecture this batch is tied too. + **/ + template <class T, class A> + class batch<std::complex<T>, A> + { + public: + using value_type = std::complex<T>; ///< Type of the complex elements within this batch. + using real_batch = batch<T, A>; ///< Type of the scalar elements within this batch. + using arch_type = A; ///< SIMD Architecture abstracted by this batch. + using batch_bool_type = batch_bool<T, A>; ///< Associated batch type used to represented logical operations on this batch. + + static constexpr std::size_t size = real_batch::size; ///< Number of complex elements in this batch. + + // constructors + inline batch() = default; ///< Create a batch initialized with undefined values. + inline batch(value_type const& val) noexcept; + inline batch(real_batch const& real, real_batch const& imag) noexcept; + + inline batch(real_batch const& real) noexcept; + inline batch(T val) noexcept; + template <class... Ts> + inline batch(value_type val0, value_type val1, Ts... 
vals) noexcept; + inline explicit batch(batch_bool_type const& b) noexcept; + + template <class U> + XSIMD_NO_DISCARD static inline batch broadcast(U val) noexcept; + + // memory operators + XSIMD_NO_DISCARD static inline batch load_aligned(const T* real_src, const T* imag_src = nullptr) noexcept; + XSIMD_NO_DISCARD static inline batch load_unaligned(const T* real_src, const T* imag_src = nullptr) noexcept; + inline void store_aligned(T* real_dst, T* imag_dst) const noexcept; + inline void store_unaligned(T* real_dst, T* imag_dst) const noexcept; + + XSIMD_NO_DISCARD static inline batch load_aligned(const value_type* src) noexcept; + XSIMD_NO_DISCARD static inline batch load_unaligned(const value_type* src) noexcept; + inline void store_aligned(value_type* dst) const noexcept; + inline void store_unaligned(value_type* dst) const noexcept; + + template <class U> + XSIMD_NO_DISCARD static inline batch load(U const* mem, aligned_mode) noexcept; + template <class U> + XSIMD_NO_DISCARD static inline batch load(U const* mem, unaligned_mode) noexcept; + template <class U> + inline void store(U* mem, aligned_mode) const noexcept; + template <class U> + inline void store(U* mem, unaligned_mode) const noexcept; + + inline real_batch real() const noexcept; + inline real_batch imag() const noexcept; + + inline value_type get(std::size_t i) const noexcept; + +#ifdef XSIMD_ENABLE_XTL_COMPLEX + // xtl-related methods + template <bool i3ec> + inline batch(xtl::xcomplex<T, T, i3ec> const& val) noexcept; + template <bool i3ec, class... Ts> + inline batch(xtl::xcomplex<T, T, i3ec> val0, xtl::xcomplex<T, T, i3ec> val1, Ts... vals) noexcept; + + template <bool i3ec> + XSIMD_NO_DISCARD static inline batch load_aligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept; + template <bool i3ec> + XSIMD_NO_DISCARD static inline batch load_unaligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept; + template <bool i3ec> + inline void store_aligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept; + template <bool i3ec> + inline void store_unaligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept; +#endif + + // comparison operators + inline batch_bool<T, A> operator==(batch const& other) const noexcept; + inline batch_bool<T, A> operator!=(batch const& other) const noexcept; + + // Update operators + inline batch& operator+=(batch const& other) noexcept; + inline batch& operator-=(batch const& other) noexcept; + inline batch& operator*=(batch const& other) noexcept; + inline batch& operator/=(batch const& other) noexcept; + + // incr/decr operators + inline batch& operator++() noexcept; + inline batch& operator--() noexcept; + inline batch operator++(int) noexcept; + inline batch operator--(int) noexcept; + + // unary operators + inline batch_bool_type operator!() const noexcept; + inline batch operator~() const noexcept; + inline batch operator-() const noexcept; + inline batch operator+() const noexcept; + + // arithmetic operators. 
They are defined as friend to enable automatic
+        // conversion of parameters from scalar to batch
+
+        /** Shorthand for xsimd::add() */
+        friend inline batch operator+(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) += other;
+        }
+
+        /** Shorthand for xsimd::sub() */
+        friend inline batch operator-(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) -= other;
+        }
+
+        /** Shorthand for xsimd::mul() */
+        friend inline batch operator*(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) *= other;
+        }
+
+        /** Shorthand for xsimd::div() */
+        friend inline batch operator/(batch const& self, batch const& other) noexcept
+        {
+            return batch(self) /= other;
+        }
+
+    private:
+        real_batch m_real;
+        real_batch m_imag;
+    };
+
+    template <class T, class A>
+    constexpr std::size_t batch<std::complex<T>, A>::size;
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <typename T, bool i3ec, typename A>
+    struct batch<xtl::xcomplex<T, T, i3ec>, A>
+    {
+        static_assert(std::is_same<T, void>::value,
+                      "Please use batch<std::complex<T>, A> initialized from xtl::xcomplex instead");
+    };
+#endif
+}
+
+#include "../arch/xsimd_isa.hpp"
+#include "./xsimd_batch_constant.hpp"
+#include "./xsimd_traits.hpp"
+
+namespace xsimd
+{
+
+    /**
+     * Create a batch with all elements initialized to \c val.
+     */
+    template <class T, class A>
+    inline batch<T, A>::batch(T val) noexcept
+        : types::simd_register<T, A>(kernel::broadcast<A>(val, A {}))
+    {
+        detail::static_check_supported_config<T, A>();
+    }
+
+    /**
+     * Create a batch with elements initialized from \c val0, \c val1, \c vals...
+     * There must be exactly \c size elements in total.
+     */
+    template <class T, class A>
+    template <class... Ts>
+    inline batch<T, A>::batch(T val0, T val1, Ts... vals) noexcept
+        : batch(kernel::set<A>(batch {}, A {}, val0, val1, static_cast<T>(vals)...))
+    {
+        detail::static_check_supported_config<T, A>();
+        static_assert(sizeof...(Ts) + 2 == size, "The constructor requires as many arguments as batch elements.");
+    }
+
+    /**
+     * Converts a \c batch_bool to a \c batch where each element is
+     * set to 1 (resp. 0) if the corresponding element is `true`
+     * (resp. `false`).
+     */
+    template <class T, class A>
+    inline batch<T, A>::batch(batch_bool<T, A> const& b) noexcept
+        : batch(kernel::from_bool(b, A {}))
+    {
+    }
+
+    /**
+     * Wraps a compatible native SIMD register as a \c batch. This is generally not needed but
+     * becomes handy when doing architecture-specific operations.
+     */
+    template <class T, class A>
+    inline batch<T, A>::batch(register_type reg) noexcept
+        : types::simd_register<T, A>({ reg })
+    {
+        detail::static_check_supported_config<T, A>();
+    }
+
+    /**
+     * Equivalent to batch::batch(T val).
+     */
+    template <class T, class A>
+    template <class U>
+    XSIMD_NO_DISCARD inline batch<T, A> batch<T, A>::broadcast(U val) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch(static_cast<T>(val));
+    }
+
+    /**************************
+     * batch memory operators *
+     **************************/
+
+    /**
+     * Copy content of this batch to the buffer \c mem. The
+     * memory needs to be aligned.
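+     *
+     * A minimal usage sketch (illustrative only; it assumes an SSE2 build,
+     * where a float batch holds four lanes):
+     * @code
+     * alignas(16) float buf[4];
+     * xsimd::batch<float, xsimd::sse2> v(1.f, 2.f, 3.f, 4.f);
+     * v.store_aligned(buf); // buf now holds {1, 2, 3, 4}
+     * @endcode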
+ */ + template <class T, class A> + template <class U> + inline void batch<T, A>::store_aligned(U* mem) const noexcept + { + detail::static_check_supported_config<T, A>(); + assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0) + && "store location is not properly aligned"); + kernel::store_aligned<A>(mem, *this, A {}); + } + + /** + * Copy content of this batch to the buffer \c mem. The + * memory does not need to be aligned. + */ + template <class T, class A> + template <class U> + inline void batch<T, A>::store_unaligned(U* mem) const noexcept + { + detail::static_check_supported_config<T, A>(); + kernel::store_unaligned<A>(mem, *this, A {}); + } + + /** + * Equivalent to batch::store_aligned() + */ + template <class T, class A> + template <class U> + inline void batch<T, A>::store(U* mem, aligned_mode) const noexcept + { + detail::static_check_supported_config<T, A>(); + return store_aligned(mem); + } + + /** + * Equivalent to batch::store_unaligned() + */ + template <class T, class A> + template <class U> + inline void batch<T, A>::store(U* mem, unaligned_mode) const noexcept + { + detail::static_check_supported_config<T, A>(); + return store_unaligned(mem); + } + + /** + * Loading from aligned memory. May involve a conversion if \c U is different + * from \c T. + */ + template <class T, class A> + template <class U> + inline batch<T, A> batch<T, A>::load_aligned(U const* mem) noexcept + { + assert(((reinterpret_cast<uintptr_t>(mem) % A::alignment()) == 0) + && "loaded pointer is not properly aligned"); + detail::static_check_supported_config<T, A>(); + return kernel::load_aligned<A>(mem, kernel::convert<T> {}, A {}); + } + + /** + * Loading from unaligned memory. May involve a conversion if \c U is different + * from \c T. + */ + template <class T, class A> + template <class U> + inline batch<T, A> batch<T, A>::load_unaligned(U const* mem) noexcept + { + detail::static_check_supported_config<T, A>(); + return kernel::load_unaligned<A>(mem, kernel::convert<T> {}, A {}); + } + + /** + * Equivalent to batch::load_aligned() + */ + template <class T, class A> + template <class U> + inline batch<T, A> batch<T, A>::load(U const* mem, aligned_mode) noexcept + { + detail::static_check_supported_config<T, A>(); + return load_aligned(mem); + } + + /** + * Equivalent to batch::load_unaligned() + */ + template <class T, class A> + template <class U> + inline batch<T, A> batch<T, A>::load(U const* mem, unaligned_mode) noexcept + { + detail::static_check_supported_config<T, A>(); + return load_unaligned(mem); + } + + /** + * Create a new batch gathering elements starting at address \c src and + * offset by each element in \c index. + * If \c T is not of the same size as \c U, a \c static_cast is performed + * at element gather time. + */ + template <class T, class A> + template <typename U, typename V> + inline batch<T, A> batch<T, A>::gather(U const* src, batch<V, A> const& index) noexcept + { + detail::static_check_supported_config<T, A>(); + static_assert(std::is_convertible<T, U>::value, "Can't convert from src to this batch's type!"); + return kernel::gather(batch {}, src, index, A {}); + } + + /** + * Scatter elements from this batch into addresses starting at \c dst + * and offset by each element in \c index. + * If \c T is not of the same size as \c U, a \c static_cast is performed + * at element scatter time. 
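+     *
+     * A sketch of the intent (hypothetical 4-wide int batch on SSE2; all
+     * indices are assumed to be in range):
+     * @code
+     * int dst[8] = {};
+     * xsimd::batch<int, xsimd::sse2> v(10, 11, 12, 13);
+     * xsimd::batch<int, xsimd::sse2> idx(0, 2, 4, 6);
+     * v.scatter(dst, idx); // dst == {10, 0, 11, 0, 12, 0, 13, 0}
+     * @endcode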
+     */
+    template <class T, class A>
+    template <class U, class V>
+    inline void batch<T, A>::scatter(U* dst, batch<V, A> const& index) const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        static_assert(std::is_convertible<T, U>::value, "Can't convert from this batch's type to dst!");
+        kernel::scatter<A>(*this, dst, index, A {});
+    }
+
+    /**
+     * Retrieve the \c i th scalar element in this batch.
+     *
+     * @warning This is very inefficient and should only be used for debugging purposes.
+     */
+    template <class T, class A>
+    inline T batch<T, A>::get(std::size_t i) const noexcept
+    {
+        return kernel::get(*this, i, A {});
+    }
+
+    /******************************
+     * batch comparison operators *
+     ******************************/
+    namespace details
+    {
+        /**
+         * Shorthand for xsimd::eq()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::eq<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::neq()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::neq<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::ge()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> ge(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::ge<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::le()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> le(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::le<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::gt()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> gt(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::gt<A>(self, other, A {});
+        }
+
+        /**
+         * Shorthand for xsimd::lt()
+         */
+        template <class T, class A>
+        inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other) noexcept
+        {
+            detail::static_check_supported_config<T, A>();
+            return kernel::lt<A>(self, other, A {});
+        }
+    }
+
+    /**************************
+     * batch update operators *
+     **************************/
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator+=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::add<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator-=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::sub<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator*=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::mul<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& batch<T, A>::operator/=(batch<T, A> const& other) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this = kernel::div<A>(*this, other, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A>& types::integral_only_operators<T, A>::operator%=(batch<T, A> const& other) noexcept
+    {
+        ::xsimd::detail::static_check_supported_config<T, A>();
+        return
*static_cast<batch<T, A>*>(this) = kernel::mod<A>(*static_cast<batch<T, A>*>(this), other, A {}); + } + + template <class T, class A> + inline batch<T, A>& batch<T, A>::operator&=(batch<T, A> const& other) noexcept + { + detail::static_check_supported_config<T, A>(); + return *this = kernel::bitwise_and<A>(*this, other, A {}); + } + + template <class T, class A> + inline batch<T, A>& batch<T, A>::operator|=(batch<T, A> const& other) noexcept + { + detail::static_check_supported_config<T, A>(); + return *this = kernel::bitwise_or<A>(*this, other, A {}); + } + + template <class T, class A> + inline batch<T, A>& batch<T, A>::operator^=(batch<T, A> const& other) noexcept + { + detail::static_check_supported_config<T, A>(); + return *this = kernel::bitwise_xor<A>(*this, other, A {}); + } + + template <class T, class A> + inline batch<T, A>& kernel::integral_only_operators<T, A>::operator>>=(batch<T, A> const& other) noexcept + { + ::xsimd::detail::static_check_supported_config<T, A>(); + return *static_cast<batch<T, A>*>(this) = kernel::bitwise_rshift<A>(*static_cast<batch<T, A>*>(this), other, A {}); + } + + template <class T, class A> + inline batch<T, A>& kernel::integral_only_operators<T, A>::operator<<=(batch<T, A> const& other) noexcept + { + ::xsimd::detail::static_check_supported_config<T, A>(); + return *static_cast<batch<T, A>*>(this) = kernel::bitwise_lshift<A>(*static_cast<batch<T, A>*>(this), other, A {}); + } + + template <class T, class A> + inline batch<T, A>& kernel::integral_only_operators<T, A>::operator>>=(int32_t other) noexcept + { + ::xsimd::detail::static_check_supported_config<T, A>(); + return *static_cast<batch<T, A>*>(this) = kernel::bitwise_rshift<A>(*static_cast<batch<T, A>*>(this), other, A {}); + } + + template <class T, class A> + inline batch<T, A>& kernel::integral_only_operators<T, A>::operator<<=(int32_t other) noexcept + { + ::xsimd::detail::static_check_supported_config<T, A>(); + return *static_cast<batch<T, A>*>(this) = kernel::bitwise_lshift<A>(*static_cast<batch<T, A>*>(this), other, A {}); + } + + /***************************** + * batch incr/decr operators * + *****************************/ + + template <class T, class A> + inline batch<T, A>& batch<T, A>::operator++() noexcept + { + detail::static_check_supported_config<T, A>(); + return operator+=(1); + } + + template <class T, class A> + inline batch<T, A>& batch<T, A>::operator--() noexcept + { + detail::static_check_supported_config<T, A>(); + return operator-=(1); + } + + template <class T, class A> + inline batch<T, A> batch<T, A>::operator++(int) noexcept + { + detail::static_check_supported_config<T, A>(); + batch<T, A> copy(*this); + operator+=(1); + return copy; + } + + template <class T, class A> + inline batch<T, A> batch<T, A>::operator--(int) noexcept + { + detail::static_check_supported_config<T, A>(); + batch copy(*this); + operator-=(1); + return copy; + } + + /************************* + * batch unary operators * + *************************/ + + template <class T, class A> + inline batch_bool<T, A> batch<T, A>::operator!() const noexcept + { + detail::static_check_supported_config<T, A>(); + return kernel::eq<A>(*this, batch(0), A {}); + } + + template <class T, class A> + inline batch<T, A> batch<T, A>::operator~() const noexcept + { + detail::static_check_supported_config<T, A>(); + return kernel::bitwise_not<A>(*this, A {}); + } + + template <class T, class A> + inline batch<T, A> batch<T, A>::operator-() const noexcept + { + detail::static_check_supported_config<T, A>(); + 
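// Unary minus delegates to the architecture-specific negation kernel.
+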
return kernel::neg<A>(*this, A {});
+    }
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::operator+() const noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return *this;
+    }
+
+    /************************
+     * batch private method *
+     ************************/
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::logical_and(batch<T, A> const& other) const noexcept
+    {
+        return kernel::logical_and<A>(*this, other, A());
+    }
+
+    template <class T, class A>
+    inline batch<T, A> batch<T, A>::logical_or(batch<T, A> const& other) const noexcept
+    {
+        return kernel::logical_or<A>(*this, other, A());
+    }
+
+    /***************************
+     * batch_bool constructors *
+     ***************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A>::batch_bool(register_type reg) noexcept
+        : types::get_bool_simd_register_t<T, A>({ reg })
+    {
+    }
+
+    template <class T, class A>
+    template <class... Ts>
+    inline batch_bool<T, A>::batch_bool(bool val0, bool val1, Ts... vals) noexcept
+        : batch_bool(kernel::set<A>(batch_bool {}, A {}, val0, val1, static_cast<bool>(vals)...))
+    {
+        static_assert(sizeof...(Ts) + 2 == size, "The constructor requires as many arguments as batch elements.");
+    }
+
+    /*******************************
+     * batch_bool memory operators *
+     *******************************/
+
+    template <class T, class A>
+    inline void batch_bool<T, A>::store_aligned(bool* mem) const noexcept
+    {
+        kernel::store(*this, mem, A {});
+    }
+
+    template <class T, class A>
+    inline void batch_bool<T, A>::store_unaligned(bool* mem) const noexcept
+    {
+        store_aligned(mem);
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::load_aligned(bool const* mem) noexcept
+    {
+        batch_type ref(0);
+        alignas(A::alignment()) T buffer[size];
+        for (std::size_t i = 0; i < size; ++i)
+            buffer[i] = mem[i] ? 1 : 0;
+        return ref != batch_type::load_aligned(&buffer[0]);
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::load_unaligned(bool const* mem) noexcept
+    {
+        return load_aligned(mem);
+    }
+
+    /**
+     * Extract a scalar mask representation from this @c batch_bool.
+     *
+     * @return bit mask
+     */
+    template <class T, class A>
+    inline uint64_t batch_bool<T, A>::mask() const noexcept
+    {
+        return kernel::mask(*this, A {});
+    }
+
+    /**
+     * Build a @c batch_bool from the scalar bit mask representation \c mask.
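+     *
+     * A sketch of the intended round trip (assuming a 4-wide batch):
+     * @code
+     * auto b = xsimd::batch_bool<float, xsimd::sse2>::from_mask(0b0101ul);
+     * // b.get(0) == true, b.get(1) == false, b.get(2) == true, b.get(3) == false
+     * // and b.mask() == 0b0101ul again
+     * @endcode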
+     *
+     * @return the reconstructed @c batch_bool
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::from_mask(uint64_t mask) noexcept
+    {
+        return kernel::from_mask(batch_bool<T, A>(), mask, A {});
+    }
+
+    template <class T, class A>
+    inline bool batch_bool<T, A>::get(std::size_t i) const noexcept
+    {
+        return kernel::get(*this, i, A {});
+    }
+
+    /***********************************
+     * batch_bool comparison operators *
+     ***********************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator==(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::eq<A>(*this, other, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator!=(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::neq<A>(*this, other, A {}).data;
+    }
+
+    /********************************
+     * batch_bool logical operators *
+     ********************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator~() const noexcept
+    {
+        return kernel::bitwise_not<A>(*this, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator!() const noexcept
+    {
+        return operator==(batch_bool(false));
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator&(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::bitwise_and<A>(*this, other, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator|(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::bitwise_or<A>(*this, other, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator^(batch_bool<T, A> const& other) const noexcept
+    {
+        return kernel::bitwise_xor<A>(*this, other, A {}).data;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator&&(batch_bool const& other) const noexcept
+    {
+        return operator&(other);
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch_bool<T, A>::operator||(batch_bool const& other) const noexcept
+    {
+        return operator|(other);
+    }
+
+    /******************************
+     * batch_bool private methods *
+     ******************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A>::batch_bool(bool val) noexcept
+        : base_type { make_register(detail::make_index_sequence<size - 1>(), val) }
+    {
+    }
+
+    template <class T, class A>
+    template <class U, class... V, size_t I, size_t... Is>
+    inline auto batch_bool<T, A>::make_register(detail::index_sequence<I, Is...>, U u, V... v) noexcept -> register_type
+    {
+        return make_register(detail::index_sequence<Is...>(), u, u, v...);
+    }
+
+    template <class T, class A>
+    template <class... V>
+    inline auto batch_bool<T, A>::make_register(detail::index_sequence<>, V...
v) noexcept -> register_type + { + return kernel::set<A>(batch_bool<T, A>(), A {}, v...).data; + } + + /******************************* + * batch<complex> constructors * + *******************************/ + + template <class T, class A> + inline batch<std::complex<T>, A>::batch(value_type const& val) noexcept + : m_real(val.real()) + , m_imag(val.imag()) + { + } + + template <class T, class A> + inline batch<std::complex<T>, A>::batch(real_batch const& real, real_batch const& imag) noexcept + : m_real(real) + , m_imag(imag) + { + } + + template <class T, class A> + inline batch<std::complex<T>, A>::batch(real_batch const& real) noexcept + : m_real(real) + , m_imag(0) + { + } + + template <class T, class A> + inline batch<std::complex<T>, A>::batch(T val) noexcept + : m_real(val) + , m_imag(0) + { + } + + template <class T, class A> + template <class... Ts> + inline batch<std::complex<T>, A>::batch(value_type val0, value_type val1, Ts... vals) noexcept + : batch(kernel::set<A>(batch {}, A {}, val0, val1, static_cast<value_type>(vals)...)) + { + static_assert(sizeof...(Ts) + 2 == size, "as many arguments as batch elements"); + } + + template <class T, class A> + inline batch<std::complex<T>, A>::batch(batch_bool_type const& b) noexcept + : m_real(b) + , m_imag(0) + { + } + + template <class T, class A> + template <class U> + XSIMD_NO_DISCARD inline batch<std::complex<T>, A> batch<std::complex<T>, A>::broadcast(U val) noexcept + { + return batch(static_cast<std::complex<T>>(val)); + } + + /*********************************** + * batch<complex> memory operators * + ***********************************/ + + template <class T, class A> + inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const T* real_src, const T* imag_src) noexcept + { + return { batch<T, A>::load_aligned(real_src), imag_src ? batch<T, A>::load_aligned(imag_src) : batch<T, A>(0) }; + } + template <class T, class A> + inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const T* real_src, const T* imag_src) noexcept + { + return { batch<T, A>::load_unaligned(real_src), imag_src ? 
batch<T, A>::load_unaligned(imag_src) : batch<T, A>(0) }; + } + + template <class T, class A> + inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const value_type* src) noexcept + { + assert(((reinterpret_cast<uintptr_t>(src) % A::alignment()) == 0) + && "loaded pointer is not properly aligned"); + return kernel::load_complex_aligned<A>(src, kernel::convert<value_type> {}, A {}); + } + + template <class T, class A> + inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const value_type* src) noexcept + { + return kernel::load_complex_unaligned<A>(src, kernel::convert<value_type> {}, A {}); + } + + template <class T, class A> + inline void batch<std::complex<T>, A>::store_aligned(value_type* dst) const noexcept + { + assert(((reinterpret_cast<uintptr_t>(dst) % A::alignment()) == 0) + && "store location is not properly aligned"); + return kernel::store_complex_aligned(dst, *this, A {}); + } + + template <class T, class A> + inline void batch<std::complex<T>, A>::store_unaligned(value_type* dst) const noexcept + { + return kernel::store_complex_unaligned(dst, *this, A {}); + } + + template <class T, class A> + inline void batch<std::complex<T>, A>::store_aligned(T* real_dst, T* imag_dst) const noexcept + { + m_real.store_aligned(real_dst); + m_imag.store_aligned(imag_dst); + } + + template <class T, class A> + inline void batch<std::complex<T>, A>::store_unaligned(T* real_dst, T* imag_dst) const noexcept + { + m_real.store_unaligned(real_dst); + m_imag.store_unaligned(imag_dst); + } + + template <class T, class A> + template <class U> + inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, aligned_mode) noexcept + { + return load_aligned(mem); + } + + template <class T, class A> + template <class U> + inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load(U const* mem, unaligned_mode) noexcept + { + return load_unaligned(mem); + } + + template <class T, class A> + template <class U> + inline void batch<std::complex<T>, A>::store(U* mem, aligned_mode) const noexcept + { + return store_aligned(mem); + } + + template <class T, class A> + template <class U> + inline void batch<std::complex<T>, A>::store(U* mem, unaligned_mode) const noexcept + { + return store_unaligned(mem); + } + + template <class T, class A> + inline auto batch<std::complex<T>, A>::real() const noexcept -> real_batch + { + return m_real; + } + + template <class T, class A> + inline auto batch<std::complex<T>, A>::imag() const noexcept -> real_batch + { + return m_imag; + } + + template <class T, class A> + inline auto batch<std::complex<T>, A>::get(std::size_t i) const noexcept -> value_type + { + return kernel::get(*this, i, A {}); + } + + /************************************** + * batch<complex> xtl-related methods * + **************************************/ + +#ifdef XSIMD_ENABLE_XTL_COMPLEX + + template <class T, class A> + template <bool i3ec> + inline batch<std::complex<T>, A>::batch(xtl::xcomplex<T, T, i3ec> const& val) noexcept + : m_real(val.real()) + , m_imag(val.imag()) + { + } + + template <class T, class A> + template <bool i3ec, class... Ts> + inline batch<std::complex<T>, A>::batch(xtl::xcomplex<T, T, i3ec> val0, xtl::xcomplex<T, T, i3ec> val1, Ts... 
vals) noexcept
+        : batch(kernel::set<A>(batch {}, A {}, val0, val1, static_cast<xtl::xcomplex<T, T, i3ec>>(vals)...))
+    {
+        static_assert(sizeof...(Ts) + 2 == size, "as many arguments as batch elements");
+    }
+
+    // The memory layout of an xcomplex and a std::complex is the same when
+    // xcomplex stores values and not references. Unfortunately, relying on
+    // this breaks strict aliasing...
+
+    template <class T, class A>
+    template <bool i3ec>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_aligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept
+    {
+        return load_aligned(reinterpret_cast<std::complex<T> const*>(src));
+    }
+
+    template <class T, class A>
+    template <bool i3ec>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::load_unaligned(const xtl::xcomplex<T, T, i3ec>* src) noexcept
+    {
+        return load_unaligned(reinterpret_cast<std::complex<T> const*>(src));
+    }
+
+    template <class T, class A>
+    template <bool i3ec>
+    inline void batch<std::complex<T>, A>::store_aligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept
+    {
+        store_aligned(reinterpret_cast<std::complex<T>*>(dst));
+    }
+
+    template <class T, class A>
+    template <bool i3ec>
+    inline void batch<std::complex<T>, A>::store_unaligned(xtl::xcomplex<T, T, i3ec>* dst) const noexcept
+    {
+        store_unaligned(reinterpret_cast<std::complex<T>*>(dst));
+    }
+
+#endif
+
+    /***************************************
+     * batch<complex> comparison operators *
+     ***************************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch<std::complex<T>, A>::operator==(batch const& other) const noexcept
+    {
+        return m_real == other.m_real && m_imag == other.m_imag;
+    }
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch<std::complex<T>, A>::operator!=(batch const& other) const noexcept
+    {
+        return m_real != other.m_real || m_imag != other.m_imag;
+    }
+
+    /***********************************
+     * batch<complex> update operators *
+     ***********************************/
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator+=(batch const& other) noexcept
+    {
+        m_real += other.m_real;
+        m_imag += other.m_imag;
+        return *this;
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator-=(batch const& other) noexcept
+    {
+        m_real -= other.m_real;
+        m_imag -= other.m_imag;
+        return *this;
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator*=(batch const& other) noexcept
+    {
+        real_batch new_real = real() * other.real() - imag() * other.imag();
+        real_batch new_imag = real() * other.imag() + imag() * other.real();
+        m_real = new_real;
+        m_imag = new_imag;
+        return *this;
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator/=(batch const& other) noexcept
+    {
+        real_batch a = real();
+        real_batch b = imag();
+        real_batch c = other.real();
+        real_batch d = other.imag();
+        real_batch e = c * c + d * d;
+        m_real = (c * a + d * b) / e;
+        m_imag = (c * b - d * a) / e;
+        return *this;
+    }
+
+    /**************************************
+     * batch<complex> incr/decr operators *
+     **************************************/
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator++() noexcept
+    {
+        return operator+=(1);
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A>& batch<std::complex<T>, A>::operator--() noexcept
+    {
+        return operator-=(1);
+    }
+
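+    // Note: in these increment/decrement operators, the scalar 1 converts to
+    // batch(std::complex<T>(1)), i.e. 1 + 0i, so only the real part of each
+    // element changes.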
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator++(int) noexcept
+    {
+        batch copy(*this);
+        operator+=(1);
+        return copy;
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator--(int) noexcept
+    {
+        batch copy(*this);
+        operator-=(1);
+        return copy;
+    }
+
+    /**********************************
+     * batch<complex> unary operators *
+     **********************************/
+
+    template <class T, class A>
+    inline batch_bool<T, A> batch<std::complex<T>, A>::operator!() const noexcept
+    {
+        return operator==(batch(0));
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator~() const noexcept
+    {
+        return { ~m_real, ~m_imag };
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator-() const noexcept
+    {
+        return { -m_real, -m_imag };
+    }
+
+    template <class T, class A>
+    inline batch<std::complex<T>, A> batch<std::complex<T>, A>::operator+() const noexcept
+    {
+        return { +m_real, +m_imag };
+    }
+
+    /**********************************
+     * size type aliases
+     **********************************/
+
+    namespace details
+    {
+        template <typename T, std::size_t N, class ArchList>
+        struct sized_batch;
+
+        template <typename T, std::size_t N>
+        struct sized_batch<T, N, xsimd::arch_list<>>
+        {
+            using type = void;
+        };
+
+        template <typename T, class Arch, bool BatchExists = xsimd::types::has_simd_register<T, Arch>::value>
+        struct batch_trait;
+
+        template <typename T, class Arch>
+        struct batch_trait<T, Arch, true>
+        {
+            using type = xsimd::batch<T, Arch>;
+            static constexpr std::size_t size = xsimd::batch<T, Arch>::size;
+        };
+
+        template <typename T, class Arch>
+        struct batch_trait<T, Arch, false>
+        {
+            using type = void;
+            static constexpr std::size_t size = 0;
+        };
+
+        template <typename T, std::size_t N, class Arch, class... Archs>
+        struct sized_batch<T, N, xsimd::arch_list<Arch, Archs...>>
+        {
+            using type = typename std::conditional<
+                batch_trait<T, Arch>::size == N,
+                typename batch_trait<T, Arch>::type,
+                typename sized_batch<T, N, xsimd::arch_list<Archs...>>::type>::type;
+        };
+    }
+
+    /**
+     * @brief type utility to select a batch of a given type and size
+     *
+     * If one of the available architectures has a native vector type of the
+     * given type and size, sets the @p type member to the appropriate batch
+     * type. Otherwise sets it to @p void.
+     *
+     * @tparam T the type of the underlying values.
+     * @tparam N the number of elements of that type in the batch.
+     **/
+    template <typename T, std::size_t N>
+    struct make_sized_batch
+    {
+        using type = typename details::sized_batch<T, N, supported_architectures>::type;
+    };
+
+    template <typename T, std::size_t N>
+    using make_sized_batch_t = typename make_sized_batch<T, N>::type;
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp
new file mode 100644
index 0000000000..0de9c8ad42
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp
@@ -0,0 +1,288 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_BATCH_CONSTANT_HPP
+#define XSIMD_BATCH_CONSTANT_HPP
+
+#include "./xsimd_batch.hpp"
+#include "./xsimd_utils.hpp"
+
+namespace xsimd
+{
+    /**
+     * @brief batch of boolean constants
+     *
+     * Abstract representation of a batch of boolean constants.
+     *
+     * @tparam batch_type the type of the associated batch values.
+     * @tparam Values boolean constants represented by this batch
+     **/
+    template <class batch_type, bool... Values>
+    struct batch_bool_constant
+    {
+
+    public:
+        static constexpr std::size_t size = sizeof...(Values);
+        using arch_type = typename batch_type::arch_type;
+        using value_type = bool;
+        static_assert(sizeof...(Values) == batch_type::size, "consistent batch size");
+
+        constexpr operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; }
+
+        constexpr bool get(size_t i) const noexcept
+        {
+            return std::array<value_type, size> { { Values... } }[i];
+        }
+
+        static constexpr int mask() noexcept
+        {
+            return mask_helper(0, static_cast<int>(Values)...);
+        }
+
+    private:
+        static constexpr int mask_helper(int acc) noexcept { return acc; }
+
+        template <class... Tys>
+        static constexpr int mask_helper(int acc, int mask, Tys... masks) noexcept
+        {
+            return mask_helper(acc | mask, (masks << 1)...);
+        }
+
+        struct logical_or
+        {
+            constexpr bool operator()(bool x, bool y) const { return x || y; }
+        };
+        struct logical_and
+        {
+            constexpr bool operator()(bool x, bool y) const { return x && y; }
+        };
+        struct logical_xor
+        {
+            constexpr bool operator()(bool x, bool y) const { return x ^ y; }
+        };
+
+        template <class F, class SelfPack, class OtherPack, size_t... Indices>
+        static constexpr batch_bool_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...>
+        apply(detail::index_sequence<Indices...>)
+        {
+            return {};
+        }
+
+        template <class F, bool... OtherValues>
+        static constexpr auto apply(batch_bool_constant<batch_type, Values...>, batch_bool_constant<batch_type, OtherValues...>)
+            -> decltype(apply<F, std::tuple<std::integral_constant<bool, Values>...>, std::tuple<std::integral_constant<bool, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()))
+        {
+            static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches");
+            return apply<F, std::tuple<std::integral_constant<bool, Values>...>, std::tuple<std::integral_constant<bool, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>());
+        }
+
+    public:
+#define MAKE_BINARY_OP(OP, NAME)                                                            \
+    template <bool... OtherValues>                                                          \
+    constexpr auto operator OP(batch_bool_constant<batch_type, OtherValues...> other) const \
+        -> decltype(apply<NAME>(*this, other))                                              \
+    {                                                                                       \
+        return apply<NAME>(*this, other);                                                   \
+    }
+
+        MAKE_BINARY_OP(|, logical_or)
+        MAKE_BINARY_OP(||, logical_or)
+        MAKE_BINARY_OP(&, logical_and)
+        MAKE_BINARY_OP(&&, logical_and)
+        MAKE_BINARY_OP(^, logical_xor)
+
+#undef MAKE_BINARY_OP
+
+        constexpr batch_bool_constant<batch_type, !Values...> operator!() const
+        {
+            return {};
+        }
+
+        constexpr batch_bool_constant<batch_type, !Values...> operator~() const
+        {
+            return {};
+        }
+    };
+
+    /**
+     * @brief batch of integral constants
+     *
+     * Abstract representation of a batch of integral constants.
+     *
+     * @tparam batch_type the type of the associated batch values.
+ * @tparam Values constants represented by this batch + **/ + template <class batch_type, typename batch_type::value_type... Values> + struct batch_constant + { + static constexpr std::size_t size = sizeof...(Values); + using arch_type = typename batch_type::arch_type; + using value_type = typename batch_type::value_type; + static_assert(sizeof...(Values) == batch_type::size, "consistent batch size"); + + /** + * @brief Generate a batch of @p batch_type from this @p batch_constant + */ + inline operator batch_type() const noexcept { return { Values... }; } + + /** + * @brief Get the @p i th element of this @p batch_constant + */ + constexpr value_type get(size_t i) const noexcept + { + return get(i, std::array<value_type, size> { Values... }); + } + + private: + constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept + { + return values[i]; + } + + struct arithmetic_add + { + constexpr value_type operator()(value_type x, value_type y) const { return x + y; } + }; + struct arithmetic_sub + { + constexpr value_type operator()(value_type x, value_type y) const { return x - y; } + }; + struct arithmetic_mul + { + constexpr value_type operator()(value_type x, value_type y) const { return x * y; } + }; + struct arithmetic_div + { + constexpr value_type operator()(value_type x, value_type y) const { return x / y; } + }; + struct arithmetic_mod + { + constexpr value_type operator()(value_type x, value_type y) const { return x % y; } + }; + struct binary_and + { + constexpr value_type operator()(value_type x, value_type y) const { return x & y; } + }; + struct binary_or + { + constexpr value_type operator()(value_type x, value_type y) const { return x | y; } + }; + struct binary_xor + { + constexpr value_type operator()(value_type x, value_type y) const { return x ^ y; } + }; + + template <class F, class SelfPack, class OtherPack, size_t... Indices> + static constexpr batch_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...> + apply(detail::index_sequence<Indices...>) + { + return {}; + } + + template <class F, value_type... OtherValues> + static constexpr auto apply(batch_constant<batch_type, Values...>, batch_constant<batch_type, OtherValues...>) + -> decltype(apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>())) + { + static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches"); + return apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()); + } + + public: +#define MAKE_BINARY_OP(OP, NAME) \ + template <value_type... 
OtherValues> \ + constexpr auto operator OP(batch_constant<batch_type, OtherValues...> other) const \ + -> decltype(apply<NAME>(*this, other)) \ + { \ + return apply<NAME>(*this, other); \ + } + + MAKE_BINARY_OP(+, arithmetic_add) + MAKE_BINARY_OP(-, arithmetic_sub) + MAKE_BINARY_OP(*, arithmetic_mul) + MAKE_BINARY_OP(/, arithmetic_div) + MAKE_BINARY_OP(%, arithmetic_mod) + MAKE_BINARY_OP(&, binary_and) + MAKE_BINARY_OP(|, binary_or) + MAKE_BINARY_OP(^, binary_xor) + +#undef MAKE_BINARY_OP + + constexpr batch_constant<batch_type, (value_type)-Values...> operator-() const + { + return {}; + } + + constexpr batch_constant<batch_type, (value_type) + Values...> operator+() const + { + return {}; + } + + constexpr batch_constant<batch_type, (value_type)~Values...> operator~() const + { + return {}; + } + }; + + namespace detail + { + template <class batch_type, class G, std::size_t... Is> + inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept + -> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...> + { + return {}; + } + template <class batch_type, class G, std::size_t... Is> + inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept + -> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...> + { + return {}; + } + + } // namespace detail + + /** + * @brief Build a @c batch_constant out of a generator function + * + * @tparam batch_type type of the (non-constant) batch to build + * @tparam G type used to generate that batch. That type must have a static + * member @c get that's used to generate the batch constant. Conversely, the + * generated batch_constant has value `{G::get(0, batch_size), ... , G::get(batch_size - 1, batch_size)}` + * + * The following generator produces a batch of `(n - 1, 0, 1, ... n-2)` + * + * @code + * struct Rot + * { + * static constexpr unsigned get(unsigned i, unsigned n) + * { + * return (i + n - 1) % n; + * } + * }; + * @endcode + */ + template <class batch_type, class G> + inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>())) + { + return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()); + } + + template <class batch_type, class G> + inline constexpr auto make_batch_bool_constant() noexcept + -> decltype(detail::make_batch_bool_constant<batch_type, G>( + detail::make_index_sequence<batch_type::size>())) + { + return detail::make_batch_bool_constant<batch_type, G>( + detail::make_index_sequence<batch_type::size>()); + } + +} // namespace xsimd + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp new file mode 100644 index 0000000000..b9a5995414 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx2_register.hpp @@ -0,0 +1,46 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_FMA3_AVX2_REGISTER_HPP +#define XSIMD_FMA3_AVX2_REGISTER_HPP + +#include "./xsimd_avx2_register.hpp" + +namespace xsimd +{ + template <typename arch> + struct fma3; + + /** + * @ingroup architectures + * + * AVX2 + FMA instructions + */ + template <> + struct fma3<avx2> : avx2 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX2; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 2, 1); } + static constexpr char const* name() noexcept { return "fma3+avx2"; } + }; + +#if XSIMD_WITH_FMA3_AVX2 + namespace types + { + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx2>, avx2); + + } +#endif + +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp new file mode 100644 index 0000000000..ae10598f2c --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_fma3_avx_register.hpp @@ -0,0 +1,46 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_FMA3_AVX_REGISTER_HPP +#define XSIMD_FMA3_AVX_REGISTER_HPP + +#include "./xsimd_avx_register.hpp" + +namespace xsimd +{ + template <typename arch> + struct fma3; + + /** + * @ingroup architectures + * + * AVX + FMA instructions + */ + template <> + struct fma3<avx> : avx + { + static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_AVX; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(2, 1, 1); } + static constexpr char const* name() noexcept { return "fma3+avx"; } + }; + +#if XSIMD_WITH_FMA3_AVX + namespace types + { + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<avx>, avx); + + } +#endif + +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp new file mode 100644 index 0000000000..a267490d66 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_fma3_sse_register.hpp @@ -0,0 +1,46 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_FMA3_SSE_REGISTER_HPP +#define XSIMD_FMA3_SSE_REGISTER_HPP + +#include "./xsimd_sse4_2_register.hpp" + +namespace xsimd +{ + template <typename arch> + struct fma3; + + /** + * @ingroup architectures + * + * SSE4.2 + FMA instructions + */ + template <> + struct fma3<sse4_2> : sse4_2 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_FMA3_SSE; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 4, 3); } + static constexpr char const* name() noexcept { return "fma3+sse4.2"; } + }; + +#if XSIMD_WITH_FMA3_SSE + namespace types + { + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma3<sse4_2>, sse4_2); + + } +#endif + +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_fma4_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_fma4_register.hpp new file mode 100644 index 0000000000..3684bbb401 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_fma4_register.hpp @@ -0,0 +1,42 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_FMA4_REGISTER_HPP +#define XSIMD_FMA4_REGISTER_HPP + +#include "./xsimd_sse4_2_register.hpp" + +namespace xsimd +{ + /** + * @ingroup architectures + * + * SSE4.2 + FMA4 instructions + */ + struct fma4 : sse4_2 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_FMA4; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 4, 4); } + static constexpr char const* name() noexcept { return "fma4"; } + }; + +#if XSIMD_WITH_FMA4 + namespace types + { + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(fma4, sse4_2); + + } +#endif + +} +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp b/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp new file mode 100644 index 0000000000..f4a2ca6aad --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_generic_arch.hpp @@ -0,0 +1,52 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_GENERIC_ARCH_HPP +#define XSIMD_GENERIC_ARCH_HPP + +#include "../config/xsimd_config.hpp" + +/** + * @defgroup architectures Architecture description + * */ +namespace xsimd +{ + /** + * @ingroup architectures + * + * Base class for all architectures. + */ + struct generic + { + /// Whether this architecture is supported at compile-time. + static constexpr bool supported() noexcept { return true; } + /// Whether this architecture is available at run-time. 
+        static constexpr bool available() noexcept { return true; }
+        /// If this architecture supports aligned memory accesses, the required
+        /// alignment.
+        static constexpr std::size_t alignment() noexcept { return 0; }
+        /// Whether this architecture requires aligned memory access.
+        static constexpr bool requires_alignment() noexcept { return false; }
+        /// Unique identifier for this architecture.
+        static constexpr unsigned version() noexcept { return generic::version(0, 0, 0); }
+        /// Name of the architecture.
+        static constexpr char const* name() noexcept { return "generic"; }
+
+    protected:
+        static constexpr unsigned version(unsigned major, unsigned minor, unsigned patch, unsigned multiplier = 100u) noexcept { return major * multiplier * multiplier + minor * multiplier + patch; }
+    };
+
+    struct unsupported
+    {
+    };
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_neon64_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_neon64_register.hpp
new file mode 100644
index 0000000000..3aa8973b63
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_neon64_register.hpp
@@ -0,0 +1,52 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NEON64_REGISTER_HPP
+#define XSIMD_NEON64_REGISTER_HPP
+
+#include "xsimd_neon_register.hpp"
+
+namespace xsimd
+{
+    /**
+     * @ingroup architectures
+     *
+     * NEON instructions for arm64
+     */
+    struct neon64 : neon
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_NEON64; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr bool requires_alignment() noexcept { return true; }
+        static constexpr std::size_t alignment() noexcept { return 16; }
+        static constexpr unsigned version() noexcept { return generic::version(8, 1, 0); }
+        static constexpr char const* name() noexcept { return "arm64+neon"; }
+    };
+
+#if XSIMD_WITH_NEON64
+
+    namespace types
+    {
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(neon64, neon);
+        XSIMD_DECLARE_SIMD_REGISTER(double, neon64, float64x2_t);
+
+        template <class T>
+        struct get_bool_simd_register<T, neon64>
+            : detail::neon_bool_simd_register<T, neon64>
+        {
+        };
+    }
+
+#endif
+
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_neon_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_neon_register.hpp
new file mode 100644
index 0000000000..0ef4b381d3
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_neon_register.hpp
@@ -0,0 +1,155 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software.
* + ****************************************************************************/ + +#ifndef XSIMD_NEON_REGISTER_HPP +#define XSIMD_NEON_REGISTER_HPP + +#include "xsimd_generic_arch.hpp" +#include "xsimd_register.hpp" + +#if XSIMD_WITH_NEON +#include <arm_neon.h> +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * NEON instructions for arm32 + */ + struct neon : generic + { + static constexpr bool supported() noexcept { return XSIMD_WITH_NEON; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr unsigned version() noexcept { return generic::version(7, 0, 0); } + static constexpr char const* name() noexcept { return "arm32+neon"; } + }; + +#if XSIMD_WITH_NEON + namespace types + { + namespace detail + { + template <size_t S> + struct neon_vector_type_impl; + + template <> + struct neon_vector_type_impl<8> + { + using signed_type = int8x16_t; + using unsigned_type = uint8x16_t; + }; + + template <> + struct neon_vector_type_impl<16> + { + using signed_type = int16x8_t; + using unsigned_type = uint16x8_t; + }; + + template <> + struct neon_vector_type_impl<32> + { + using signed_type = int32x4_t; + using unsigned_type = uint32x4_t; + }; + + template <> + struct neon_vector_type_impl<64> + { + using signed_type = int64x2_t; + using unsigned_type = uint64x2_t; + }; + + template <class T> + using signed_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::signed_type; + + template <class T> + using unsigned_neon_vector_type = typename neon_vector_type_impl<8 * sizeof(T)>::unsigned_type; + + template <class T> + using neon_vector_type = typename std::conditional<std::is_signed<T>::value, + signed_neon_vector_type<T>, + unsigned_neon_vector_type<T>>::type; + + using char_neon_vector_type = typename std::conditional<std::is_signed<char>::value, + signed_neon_vector_type<char>, + unsigned_neon_vector_type<char>>::type; + } + + XSIMD_DECLARE_SIMD_REGISTER(signed char, neon, detail::neon_vector_type<signed char>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, neon, detail::neon_vector_type<unsigned char>); + XSIMD_DECLARE_SIMD_REGISTER(char, neon, detail::char_neon_vector_type); + XSIMD_DECLARE_SIMD_REGISTER(short, neon, detail::neon_vector_type<short>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, neon, detail::neon_vector_type<unsigned short>); + XSIMD_DECLARE_SIMD_REGISTER(int, neon, detail::neon_vector_type<int>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, neon, detail::neon_vector_type<unsigned int>); + XSIMD_DECLARE_SIMD_REGISTER(long int, neon, detail::neon_vector_type<long int>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, neon, detail::neon_vector_type<unsigned long int>); + XSIMD_DECLARE_SIMD_REGISTER(long long int, neon, detail::neon_vector_type<long long int>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, neon, detail::neon_vector_type<unsigned long long int>); + XSIMD_DECLARE_SIMD_REGISTER(float, neon, float32x4_t); + XSIMD_DECLARE_INVALID_SIMD_REGISTER(double, neon); + + namespace detail + { + template <size_t S> + struct get_unsigned_type; + + template <> + struct get_unsigned_type<1> + { + using type = uint8_t; + }; + + template <> + struct get_unsigned_type<2> + { + using type = uint16_t; + }; + + template <> + struct get_unsigned_type<4> + { + using type = uint32_t; + }; + + template <> + struct get_unsigned_type<8> + { + using type = uint64_t; + }; + + template <size_t S> 
+ using get_unsigned_type_t = typename get_unsigned_type<S>::type; + + template <class T, class A> + struct neon_bool_simd_register + { + using type = simd_register<get_unsigned_type_t<sizeof(T)>, A>; + }; + } + + template <class T> + struct get_bool_simd_register<T, neon> + : detail::neon_bool_simd_register<T, neon> + { + }; + + } +#endif + +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_register.hpp new file mode 100644 index 0000000000..4fe4f3f13f --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_register.hpp @@ -0,0 +1,94 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_REGISTER_HPP +#define XSIMD_REGISTER_HPP + +#include <type_traits> + +namespace xsimd +{ + namespace types + { + template <class T, class A> + struct has_simd_register : std::false_type + { + }; + + template <class T, class Arch> + struct simd_register + { + struct register_type + { + }; + }; + +#define XSIMD_DECLARE_SIMD_REGISTER(SCALAR_TYPE, ISA, VECTOR_TYPE) \ + template <> \ + struct simd_register<SCALAR_TYPE, ISA> \ + { \ + using register_type = VECTOR_TYPE; \ + register_type data; \ + inline operator register_type() const noexcept \ + { \ + return data; \ + } \ + }; \ + template <> \ + struct has_simd_register<SCALAR_TYPE, ISA> : std::true_type \ + { \ + } + +#define XSIMD_DECLARE_INVALID_SIMD_REGISTER(SCALAR_TYPE, ISA) \ + template <> \ + struct has_simd_register<SCALAR_TYPE, ISA> : std::false_type \ + { \ + } + +#define XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ISA, ISA_BASE) \ + template <class T> \ + struct simd_register<T, ISA> : simd_register<T, ISA_BASE> \ + { \ + using register_type = typename simd_register<T, ISA_BASE>::register_type; \ + simd_register(register_type reg) noexcept \ + : simd_register<T, ISA_BASE> { reg } \ + { \ + } \ + simd_register() = default; \ + }; \ + template <class T> \ + struct has_simd_register<T, ISA> : has_simd_register<T, ISA_BASE> \ + { \ + } + + template <class T, class Arch> + struct get_bool_simd_register + { + using type = simd_register<T, Arch>; + }; + + template <class T, class Arch> + using get_bool_simd_register_t = typename get_bool_simd_register<T, Arch>::type; + } + + namespace kernel + { + template <class A> + // makes requires_arch equal to A const&, using type_traits functions + using requires_arch = typename std::add_lvalue_reference<typename std::add_const<A>::type>::type; + template <class T> + struct convert + { + }; + } +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_rvv_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_rvv_register.hpp new file mode 100644 index 0000000000..bdc0ef3b87 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_rvv_register.hpp @@ -0,0 +1,419 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Yibo Cai * + * * + * Distributed under the terms of the BSD 3-Clause License. 
* + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_RVV_REGISTER_HPP +#define XSIMD_RVV_REGISTER_HPP + +#include "xsimd_generic_arch.hpp" +#include "xsimd_register.hpp" + +#if XSIMD_WITH_RVV +#include <riscv_vector.h> +#endif + +namespace xsimd +{ + namespace detail + { + /** + * @ingroup architectures + * + * RVV instructions (fixed vector size) for riscv + */ + template <size_t Width> + struct rvv : xsimd::generic + { + static constexpr size_t width = Width; + static constexpr bool supported() noexcept { return Width == XSIMD_RVV_BITS; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr unsigned version() noexcept { return generic::version(1, 0, 0, /*multiplier=*/1000); } + static constexpr char const* name() noexcept { return "riscv+rvv"; } + }; + } + +#if XSIMD_WITH_RVV + + using rvv = detail::rvv<__riscv_v_fixed_vlen>; + +#define XSIMD_RVV_JOINT_(a, b, c) a##b##c +#define XSIMD_RVV_JOINT(a, b, c) XSIMD_RVV_JOINT_(a, b, c) +#define XSIMD_RVV_JOINT5(a, b, c, d, e) XSIMD_RVV_JOINT(XSIMD_RVV_JOINT(a, b, c), d, e) + +#define XSIMD_RVV_TYPE_i(S, V) XSIMD_RVV_JOINT5(vint, S, m, V, _t) +#define XSIMD_RVV_TYPE_u(S, V) XSIMD_RVV_JOINT5(vuint, S, m, V, _t) +#define XSIMD_RVV_TYPE_f(S, V) XSIMD_RVV_JOINT5(vfloat, S, m, V, _t) +#define XSIMD_RVV_TYPE(T, S, V) XSIMD_RVV_JOINT(XSIMD_RVV_TYPE, _, T)(S, V) + + namespace types + { + namespace detail + { + static constexpr size_t rvv_width_mf8 = XSIMD_RVV_BITS / 8; + static constexpr size_t rvv_width_mf4 = XSIMD_RVV_BITS / 4; + static constexpr size_t rvv_width_mf2 = XSIMD_RVV_BITS / 2; + static constexpr size_t rvv_width_m1 = XSIMD_RVV_BITS; + static constexpr size_t rvv_width_m2 = XSIMD_RVV_BITS * 2; + static constexpr size_t rvv_width_m4 = XSIMD_RVV_BITS * 4; + static constexpr size_t rvv_width_m8 = XSIMD_RVV_BITS * 8; + + // rvv_type_info is a utility class to convert scalar type and + // bitwidth into rvv register types. + // + // * `type` is the unadorned vector type. + // * `fixed_type` is the same type, but with the storage attribute + // applied. + // * `byte_type` is the type which is the same size in unsigned + // bytes, used as an intermediate step for bit-cast operations, + // because only a subset of __riscv_vreinterpret() intrinsics + // exist -- but always enough to get us to bytes and back. 
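+            // As an illustrative (hypothetical) expansion of the macro that
+            // follows, XSIMD_RVV_MAKE_TYPE(float, f, 32, 1) produces roughly:
+            //
+            //     template <>
+            //     struct rvv_type_info<float, rvv_width_m1 * 1>
+            //     {
+            //         using type = vfloat32m1_t;
+            //         using byte_type = vuint8m1_t;
+            //         using fixed_type = vfloat32m1_t __attribute__((riscv_rvv_vector_bits(width)));
+            //         // as_bytes() hops f32 -> u32 -> u8 through __riscv_vreinterpret_*.
+            //     };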
+ // + template <class T, size_t Width> + struct rvv_type_info; +#define XSIMD_RVV_MAKE_TYPE(scalar, t, s, vmul) \ + template <> \ + struct rvv_type_info<scalar, rvv_width_m1 * vmul> \ + { \ + static constexpr size_t width = rvv_width_m1 * vmul; \ + using type = XSIMD_RVV_TYPE(t, s, vmul); \ + using byte_type = XSIMD_RVV_TYPE(u, 8, vmul); \ + using fixed_type = type __attribute__((riscv_rvv_vector_bits(width))); \ + template <class U> \ + static inline type bitcast(U x) noexcept \ + { \ + const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \ + return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, t, s, m, vmul)(words); \ + } \ + template <> \ + inline type bitcast<type>(type x) noexcept { return x; } \ + static inline byte_type as_bytes(type x) noexcept \ + { \ + const auto words = XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, s, m, vmul)(x); \ + return XSIMD_RVV_JOINT5(__riscv_vreinterpret_, u, 8, m, vmul)(words); \ + } \ + }; + +#define XSIMD_RVV_MAKE_TYPES(vmul) \ + XSIMD_RVV_MAKE_TYPE(int8_t, i, 8, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint8_t, u, 8, vmul) \ + XSIMD_RVV_MAKE_TYPE(int16_t, i, 16, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint16_t, u, 16, vmul) \ + XSIMD_RVV_MAKE_TYPE(int32_t, i, 32, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint32_t, u, 32, vmul) \ + XSIMD_RVV_MAKE_TYPE(int64_t, i, 64, vmul) \ + XSIMD_RVV_MAKE_TYPE(uint64_t, u, 64, vmul) \ + XSIMD_RVV_MAKE_TYPE(float, f, 32, vmul) \ + XSIMD_RVV_MAKE_TYPE(double, f, 64, vmul) + + XSIMD_RVV_MAKE_TYPES(8) + XSIMD_RVV_MAKE_TYPES(4) + XSIMD_RVV_MAKE_TYPES(2) + XSIMD_RVV_MAKE_TYPES(1) +#undef XSIMD_RVV_TYPE +#undef XSIMD_RVV_TYPE_f +#undef XSIMD_RVV_TYPE_u +#undef XSIMD_RVV_TYPE_i +#undef XSIMD_RVV_MAKE_TYPES +#undef XSIMD_RVV_MAKE_TYPE + + // rvv_blob is storage-type abstraction for a vector register. + template <class T, size_t Width> + struct rvv_blob : public rvv_type_info<T, Width> + { + using super = rvv_type_info<T, Width>; + using typename super::fixed_type; + using typename super::type; + + fixed_type value; + type get() const { return value; } + void set(type v) { value = v; } + }; + // + // But sometimes we want our storage type to be less than a whole + // register, while presenting as a whole register to the outside + // world. This is because some partial-register types are not + // defined, but they can (mostly) be emulated using shorter vl on a + // full-width register for arithmetic, and cast back to a partial + // byte register for storage. 
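+            //
+            // Concretely (an illustrative case, not an exhaustive list): a
+            // batch of uint64_t occupying half a register (rvv_width_mf2) has
+            // no vuint64mf2_t intrinsic type, so rvv_semiblob below stores a
+            // vuint8mf2_t byte blob, widens it to a full vuint8m1_t with
+            // __riscv_vlmul_ext_v_u8mf2_u8m1 for arithmetic, and truncates it
+            // back with __riscv_vlmul_trunc_v_u8m1_u8mf2 when storing.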
+ // + template <class T, size_t divisor> + struct rvv_semiblob : public rvv_type_info<T, rvv_width_m1> + { + using super = rvv_type_info<T, rvv_width_m1>; + static constexpr size_t width = rvv_width_m1 / divisor; + using typename super::type; + template <size_t div> + struct semitype; + template <> + struct semitype<2> + { + using type = vuint8mf2_t __attribute__((riscv_rvv_vector_bits(rvv_width_mf2))); + }; + template <> + struct semitype<4> + { + using type = vuint8mf4_t __attribute__((riscv_rvv_vector_bits(rvv_width_mf4))); + }; + template <> + struct semitype<8> + { + using type = vuint8mf8_t __attribute__((riscv_rvv_vector_bits(rvv_width_mf8))); + }; + using fixed_type = typename semitype<divisor>::type; + using super::as_bytes; + using super::bitcast; + + fixed_type value; + template <size_t div> + vuint8m1_t get_bytes() const; + template <> + vuint8m1_t get_bytes<2>() const { return __riscv_vlmul_ext_v_u8mf2_u8m1(value); } + template <> + vuint8m1_t get_bytes<4>() const { return __riscv_vlmul_ext_v_u8mf4_u8m1(value); } + template <> + vuint8m1_t get_bytes<8>() const { return __riscv_vlmul_ext_v_u8mf8_u8m1(value); } + type get() const noexcept + { + vuint8m1_t bytes = get_bytes<divisor>(); + return bitcast(bytes); + } + template <size_t div> + void set_bytes(vuint8m1_t); + template <> + void set_bytes<2>(vuint8m1_t v) { value = __riscv_vlmul_trunc_v_u8m1_u8mf2(v); } + template <> + void set_bytes<4>(vuint8m1_t v) { value = __riscv_vlmul_trunc_v_u8m1_u8mf4(v); } + template <> + void set_bytes<8>(vuint8m1_t v) { value = __riscv_vlmul_trunc_v_u8m1_u8mf8(v); } + void set(type v) + { + vuint8m1_t bytes = as_bytes(v); + set_bytes<divisor>(bytes); + } + }; + template <class T> + struct rvv_blob<T, rvv_width_mf2> : rvv_semiblob<T, 2> + { + }; + template <class T> + struct rvv_blob<T, rvv_width_mf4> : rvv_semiblob<T, 4> + { + }; + template <class T> + struct rvv_blob<T, rvv_width_mf8> : rvv_semiblob<T, 8> + { + }; + + // It's difficult dealing with both char and whichever *int8_t type + // is compatible with char, so just avoid it altogether. + // + using rvv_char_t = typename std::conditional<std::is_signed<char>::value, int8_t, uint8_t>::type; + template <class T> + using rvv_fix_char_t = typename std::conditional< + std::is_same<char, typename std::decay<T>::type>::value, + rvv_char_t, T>::type; + + // An explicit constructor isn't really explicit enough to allow + // implicit bit-casting operations between incompatible types, so + // we add this vacuous flag argument when we're serious: + // + enum rvv_bitcast_flag + { + XSIMD_RVV_BITCAST + }; + + // the general-purpose vector register type, usable within + // templates, and supporting arithmetic on partial registers for + // which there is no intrinsic type (by casting via a full register + // type). 
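+            //
+            // For instance (illustrative only, assuming XSIMD_RVV_BITS == 256):
+            // rvv_reg<float, 256> below wraps a vfloat32m1_t-backed blob and
+            // exposes vl == 256 / (sizeof(float) * 8) == 8 lanes, while its
+            // (byte_type, rvv_bitcast_flag) constructor re-types another
+            // register's raw bytes without modifying them.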
+ // + template <class T, size_t Width> + struct rvv_reg + { + static constexpr size_t width = Width; + static constexpr size_t vl = Width / (sizeof(T) * 8); + using blob_type = rvv_blob<T, Width>; + using register_type = typename blob_type::type; + using byte_type = typename blob_type::byte_type; + blob_type value; + rvv_reg() noexcept = default; + rvv_reg(register_type x) noexcept { value.set(x); } + explicit rvv_reg(byte_type v, rvv_bitcast_flag) { value.set(value.bitcast(v)); } + template <class U> + explicit rvv_reg(rvv_reg<U, Width> v, rvv_bitcast_flag) + : rvv_reg(v.get_bytes(), XSIMD_RVV_BITCAST) + { + } + byte_type get_bytes() const noexcept + { + return blob_type::as_bytes(value.get()); + } + operator register_type() const noexcept { return value.get(); } + }; + template <class T, size_t Width = XSIMD_RVV_BITS> + using rvv_reg_t = typename std::conditional<!std::is_void<T>::value, rvv_reg<rvv_fix_char_t<T>, Width>, void>::type; + + // And some more of the same stuff for bool types, which have + // similar problems and similar workarounds. + // + template <size_t> + struct rvv_bool_info; +#define XSIMD_RVV_MAKE_BOOL_TYPE(i) \ + template <> \ + struct rvv_bool_info<i> \ + { \ + using type = XSIMD_RVV_JOINT(vbool, i, _t); \ + template <class T> \ + static inline type bitcast(T value) noexcept \ + { \ + return XSIMD_RVV_JOINT(__riscv_vreinterpret_b, i, )(value); \ + } \ + /*template <> static inline type bitcast(type value) noexcept { return value; }*/ \ + }; + XSIMD_RVV_MAKE_BOOL_TYPE(1); + XSIMD_RVV_MAKE_BOOL_TYPE(2); + XSIMD_RVV_MAKE_BOOL_TYPE(4); + XSIMD_RVV_MAKE_BOOL_TYPE(8); + XSIMD_RVV_MAKE_BOOL_TYPE(16); + XSIMD_RVV_MAKE_BOOL_TYPE(32); + XSIMD_RVV_MAKE_BOOL_TYPE(64); +#undef XSIMD_RVV_MAKE_BOOL_TYPE +#undef XSIMD_RVV_JOINT5 +#undef XSIMD_RVV_JOINT +#undef XSIMD_RVV_JOINT_ + + template <class T, size_t Width> + struct rvv_bool + { + using bool_info = rvv_bool_info<rvv_width_m1 * sizeof(T) * 8 / Width>; + using storage_type = vuint8m1_t __attribute__((riscv_rvv_vector_bits(rvv_width_m1))); + using type = typename bool_info::type; + storage_type value; + rvv_bool() = default; + rvv_bool(type v) noexcept + : value(__riscv_vreinterpret_u8m1(v)) + { + } + template <class U, typename std::enable_if<sizeof(T) == sizeof(U), int>::type = 0> + rvv_bool(rvv_bool<U, Width> v) + : value(v.value) + { + } + explicit rvv_bool(uint8_t mask) noexcept + : value(__riscv_vmv_v_x_u8m1(mask, rvv_width_m1 / 8)) + { + } + explicit rvv_bool(uint64_t mask) noexcept + : value(__riscv_vreinterpret_v_u64m1_u8m1(__riscv_vmv_v_x_u64m1(mask, rvv_width_m1 / 64))) + { + } + operator type() const noexcept { return bool_info::bitcast(value); } + }; + + template <class T, size_t Width = XSIMD_RVV_BITS> + using rvv_bool_t = typename std::enable_if < !std::is_void<T>::value, + rvv_bool<rvv_fix_char_t<T>, Width<rvv_width_m1 ? 
rvv_width_m1 : Width>>::type; + + template <size_t S> + struct rvv_vector_type_impl; + + template <> + struct rvv_vector_type_impl<8> + { + using signed_type = rvv_reg_t<int8_t>; + using unsigned_type = rvv_reg_t<uint8_t>; + using floating_point_type = void; + }; + + template <> + struct rvv_vector_type_impl<16> + { + using signed_type = rvv_reg_t<int16_t>; + using unsigned_type = rvv_reg_t<uint16_t>; + using floating_point_type = rvv_reg_t<_Float16>; + }; + + template <> + struct rvv_vector_type_impl<32> + { + using signed_type = rvv_reg_t<int32_t>; + using unsigned_type = rvv_reg_t<uint32_t>; + using floating_point_type = rvv_reg_t<float>; + }; + + template <> + struct rvv_vector_type_impl<64> + { + using signed_type = rvv_reg_t<int64_t>; + using unsigned_type = rvv_reg_t<uint64_t>; + using floating_point_type = rvv_reg_t<double>; + }; + + template <class T> + using signed_int_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::signed_type; + + template <class T> + using unsigned_int_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::unsigned_type; + + template <class T> + using floating_point_rvv_vector_type = typename rvv_vector_type_impl<8 * sizeof(T)>::floating_point_type; + + template <class T> + using signed_int_or_floating_point_rvv_vector_type = typename std::conditional<std::is_floating_point<T>::value, + floating_point_rvv_vector_type<T>, + signed_int_rvv_vector_type<T>>::type; + + template <class T> + using rvv_vector_type = typename std::conditional<std::is_signed<T>::value, + signed_int_or_floating_point_rvv_vector_type<T>, + unsigned_int_rvv_vector_type<T>>::type; + } // namespace detail + + XSIMD_DECLARE_SIMD_REGISTER(bool, rvv, detail::rvv_vector_type<unsigned char>); + XSIMD_DECLARE_SIMD_REGISTER(signed char, rvv, detail::rvv_vector_type<signed char>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, rvv, detail::rvv_vector_type<unsigned char>); + XSIMD_DECLARE_SIMD_REGISTER(char, rvv, detail::rvv_vector_type<char>); + XSIMD_DECLARE_SIMD_REGISTER(short, rvv, detail::rvv_vector_type<short>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, rvv, detail::rvv_vector_type<unsigned short>); + XSIMD_DECLARE_SIMD_REGISTER(int, rvv, detail::rvv_vector_type<int>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, rvv, detail::rvv_vector_type<unsigned int>); + XSIMD_DECLARE_SIMD_REGISTER(long int, rvv, detail::rvv_vector_type<long int>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, rvv, detail::rvv_vector_type<unsigned long int>); + XSIMD_DECLARE_SIMD_REGISTER(long long int, rvv, detail::rvv_vector_type<long long int>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, rvv, detail::rvv_vector_type<unsigned long long int>); + XSIMD_DECLARE_SIMD_REGISTER(float, rvv, detail::rvv_vector_type<float>); + XSIMD_DECLARE_SIMD_REGISTER(double, rvv, detail::rvv_vector_type<double>); + + namespace detail + { + template <class T> + struct rvv_bool_simd_register + { + using register_type = rvv_bool_t<T>; + register_type data; + operator register_type() const noexcept { return data; } + }; + } // namespace detail + + template <class T> + struct get_bool_simd_register<T, rvv> + { + using type = detail::rvv_bool_simd_register<T>; + }; + } // namespace types +#else + using rvv = detail::rvv<0xFFFFFFFF>; +#endif +} // namespace xsimd + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp new file mode 100644 index 0000000000..a9dc8960b6 --- /dev/null +++ 
b/third_party/xsimd/include/xsimd/types/xsimd_sse2_register.hpp @@ -0,0 +1,60 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_SSE2_REGISTER_HPP +#define XSIMD_SSE2_REGISTER_HPP + +#include "./xsimd_generic_arch.hpp" +#include "./xsimd_register.hpp" + +#if XSIMD_WITH_SSE2 +#include <emmintrin.h> +#include <xmmintrin.h> +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * SSE2 instructions + */ + struct sse2 : generic + { + static constexpr bool supported() noexcept { return XSIMD_WITH_SSE2; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 2, 0); } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr char const* name() noexcept { return "sse2"; } + }; + +#if XSIMD_WITH_SSE2 + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER(signed char, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(char, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(short, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(int, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(long int, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(long long int, sse2, __m128i); + XSIMD_DECLARE_SIMD_REGISTER(float, sse2, __m128); + XSIMD_DECLARE_SIMD_REGISTER(double, sse2, __m128d); + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse3_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse3_register.hpp new file mode 100644 index 0000000000..1a7708a896 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_sse3_register.hpp @@ -0,0 +1,45 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_SSE3_REGISTER_HPP +#define XSIMD_SSE3_REGISTER_HPP + +#include "./xsimd_sse2_register.hpp" + +#if XSIMD_WITH_SSE3 +#include <pmmintrin.h> +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * SSE3 instructions + */ + struct sse3 : sse2 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_SSE3; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 3, 0); } + static constexpr char const* name() noexcept { return "sse3"; } + }; + +#if XSIMD_WITH_SSE3 + namespace types + { + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse3, sse2); + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp new file mode 100644 index 0000000000..d906712d56 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_sse4_1_register.hpp @@ -0,0 +1,44 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_SSE4_1_REGISTER_HPP +#define XSIMD_SSE4_1_REGISTER_HPP + +#include "./xsimd_ssse3_register.hpp" + +#if XSIMD_WITH_SSE4_1 +#include <smmintrin.h> +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * SSE4.1 instructions + */ + struct sse4_1 : ssse3 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_1; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 4, 1); } + static constexpr char const* name() noexcept { return "sse4.1"; } + }; + +#if XSIMD_WITH_SSE4_1 + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_1, ssse3); + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp new file mode 100644 index 0000000000..b3446c9091 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_sse4_2_register.hpp @@ -0,0 +1,44 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_SSE4_2_REGISTER_HPP +#define XSIMD_SSE4_2_REGISTER_HPP + +#include "./xsimd_sse4_1_register.hpp" + +#if XSIMD_WITH_SSE4_2 +#include <nmmintrin.h> +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * SSE4.2 instructions + */ + struct sse4_2 : sse4_1 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_SSE4_2; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 4, 2); } + static constexpr char const* name() noexcept { return "sse4.2"; } + }; + +#if XSIMD_WITH_SSE4_2 + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(sse4_2, sse4_1); + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp new file mode 100644 index 0000000000..50ffac1e06 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_ssse3_register.hpp @@ -0,0 +1,44 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_SSSE3_REGISTER_HPP +#define XSIMD_SSSE3_REGISTER_HPP + +#include "./xsimd_sse3_register.hpp" + +#if XSIMD_WITH_SSSE3 +#include <tmmintrin.h> +#endif + +namespace xsimd +{ + /** + * @ingroup architectures + * + * SSSE3 instructions + */ + struct ssse3 : sse3 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_SSSE3; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(1, 3, 1); } + static constexpr char const* name() noexcept { return "ssse3"; } + }; + +#if XSIMD_WITH_SSSE3 + namespace types + { + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(ssse3, sse3); + } +#endif +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp new file mode 100644 index 0000000000..4f75c607e8 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_sve_register.hpp @@ -0,0 +1,157 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Yibo Cai * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_SVE_REGISTER_HPP +#define XSIMD_SVE_REGISTER_HPP + +#include "xsimd_generic_arch.hpp" +#include "xsimd_register.hpp" + +#if XSIMD_WITH_SVE +#include <arm_sve.h> +#endif + +namespace xsimd +{ + namespace detail + { + /** + * @ingroup architectures + * + * SVE instructions (fixed vector size) for arm64 + */ + template <size_t Width> + struct sve : xsimd::generic + { + static constexpr bool supported() noexcept { return Width == XSIMD_SVE_BITS; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return true; } + static constexpr std::size_t alignment() noexcept { return 16; } + static constexpr unsigned version() noexcept { return generic::version(9, Width / 32, 0); } + static constexpr char const* name() noexcept { return "arm64+sve"; } + }; + } + +#if XSIMD_WITH_SVE + + using sve = detail::sve<__ARM_FEATURE_SVE_BITS>; + + namespace types + { + namespace detail + { +// define fixed size alias per SVE sizeless type +#define SVE_TO_FIXED_SIZE(ty) ty __attribute__((arm_sve_vector_bits(__ARM_FEATURE_SVE_BITS))) + using sve_int8_t = SVE_TO_FIXED_SIZE(svint8_t); + using sve_uint8_t = SVE_TO_FIXED_SIZE(svuint8_t); + using sve_int16_t = SVE_TO_FIXED_SIZE(svint16_t); + using sve_uint16_t = SVE_TO_FIXED_SIZE(svuint16_t); + using sve_int32_t = SVE_TO_FIXED_SIZE(svint32_t); + using sve_uint32_t = SVE_TO_FIXED_SIZE(svuint32_t); + using sve_int64_t = SVE_TO_FIXED_SIZE(svint64_t); + using sve_uint64_t = SVE_TO_FIXED_SIZE(svuint64_t); + using sve_float32_t = SVE_TO_FIXED_SIZE(svfloat32_t); + using sve_float64_t = SVE_TO_FIXED_SIZE(svfloat64_t); + using sve_bool_t = SVE_TO_FIXED_SIZE(svbool_t); +#undef SVE_TO_FIXED_SIZE + + template <size_t S> + struct sve_vector_type_impl; + + template <> + struct sve_vector_type_impl<8> + { + using signed_type = sve_int8_t; + using unsigned_type = sve_uint8_t; + using floating_point_type = void; + }; + + template <> + struct sve_vector_type_impl<16> + { + using signed_type = sve_int16_t; + using unsigned_type = sve_uint16_t; + using floating_point_type = void; + }; + + template <> + struct sve_vector_type_impl<32> + { + using signed_type = sve_int32_t; + using unsigned_type = sve_uint32_t; + using floating_point_type = sve_float32_t; + }; + + template <> + struct sve_vector_type_impl<64> + { + using signed_type = sve_int64_t; + using unsigned_type = sve_uint64_t; + using floating_point_type = sve_float64_t; + }; + + template <class T> + using signed_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::signed_type; + + template <class T> + using unsigned_int_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::unsigned_type; + + template <class T> + using floating_point_sve_vector_type = typename sve_vector_type_impl<8 * sizeof(T)>::floating_point_type; + + template <class T> + using signed_int_or_floating_point_sve_vector_type = typename std::conditional<std::is_floating_point<T>::value, + floating_point_sve_vector_type<T>, + signed_int_sve_vector_type<T>>::type; + + template <class T> + using sve_vector_type = typename std::conditional<std::is_signed<T>::value, + signed_int_or_floating_point_sve_vector_type<T>, + unsigned_int_sve_vector_type<T>>::type; + } // namespace detail + + XSIMD_DECLARE_SIMD_REGISTER(signed char, sve, detail::sve_vector_type<signed char>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned char, sve, detail::sve_vector_type<unsigned char>); + 
XSIMD_DECLARE_SIMD_REGISTER(char, sve, detail::sve_vector_type<char>); + XSIMD_DECLARE_SIMD_REGISTER(short, sve, detail::sve_vector_type<short>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned short, sve, detail::sve_vector_type<unsigned short>); + XSIMD_DECLARE_SIMD_REGISTER(int, sve, detail::sve_vector_type<int>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned int, sve, detail::sve_vector_type<unsigned int>); + XSIMD_DECLARE_SIMD_REGISTER(long int, sve, detail::sve_vector_type<long int>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, sve, detail::sve_vector_type<unsigned long int>); + XSIMD_DECLARE_SIMD_REGISTER(long long int, sve, detail::sve_vector_type<long long int>); + XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, sve, detail::sve_vector_type<unsigned long long int>); + XSIMD_DECLARE_SIMD_REGISTER(float, sve, detail::sve_vector_type<float>); + XSIMD_DECLARE_SIMD_REGISTER(double, sve, detail::sve_vector_type<double>); + + namespace detail + { + struct sve_bool_simd_register + { + using register_type = sve_bool_t; + register_type data; + operator register_type() const noexcept { return data; } + }; + } // namespace detail + + template <class T> + struct get_bool_simd_register<T, sve> + { + using type = detail::sve_bool_simd_register; + }; + } // namespace types +#else + using sve = detail::sve<0xFFFFFFFF>; +#endif +} // namespace xsimd + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_traits.hpp b/third_party/xsimd/include/xsimd/types/xsimd_traits.hpp new file mode 100644 index 0000000000..f848aab1f7 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_traits.hpp @@ -0,0 +1,319 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_TRAITS_HPP +#define XSIMD_TRAITS_HPP + +#include <type_traits> + +#include "xsimd_batch.hpp" + +/** + * high level type traits + * + * @defgroup batch_traits Type traits + * + **/ + +namespace xsimd +{ + + /************************************** + * simd_traits and revert_simd_traits * + **************************************/ + + template <class T, class A = default_arch> + struct has_simd_register : types::has_simd_register<T, A> + { + }; + + namespace detail + { + template <class T, bool> + struct simd_traits_impl; + + template <class T> + struct simd_traits_impl<T, false> + { + using type = T; + using bool_type = bool; + static constexpr size_t size = 1; + }; + + template <class T> + constexpr size_t simd_traits_impl<T, false>::size; + + template <class T> + struct simd_traits_impl<T, true> + { + using type = batch<T>; + using bool_type = typename type::batch_bool_type; + static constexpr size_t size = type::size; + }; + + template <class T> + constexpr size_t simd_traits_impl<T, true>::size; + + template <class T, class A> + struct static_check_supported_config_emitter + { + + static_assert(A::supported(), + "usage of batch type with unsupported architecture"); + static_assert(!A::supported() || xsimd::has_simd_register<T, A>::value, + "usage of batch type with unsupported type"); + }; + + template <class T, class A> + struct static_check_supported_config_emitter<std::complex<T>, A> : static_check_supported_config_emitter<T, A> + { + }; + +#ifdef XSIMD_ENABLE_XTL_COMPLEX + template <class T, class A, bool i3ec> + struct static_check_supported_config_emitter<xtl::xcomplex<T, T, i3ec>, A> : static_check_supported_config_emitter<T, A> + { + }; +#endif + + // consistency checker + template <class T, class A> + inline void static_check_supported_config() + { + (void)static_check_supported_config_emitter<T, A>(); + } + } + + template <class T> + struct simd_traits : detail::simd_traits_impl<T, xsimd::has_simd_register<T>::value> + { + }; + + template <class T> + struct simd_traits<std::complex<T>> + : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value> + { + }; + +#ifdef XSIMD_ENABLE_XTL_COMPLEX + template <class T, bool i3ec> + struct simd_traits<xtl::xcomplex<T, T, i3ec>> + : detail::simd_traits_impl<std::complex<T>, xsimd::has_simd_register<T>::value> + { + }; +#endif + + template <class T> + struct revert_simd_traits + { + using type = T; + static constexpr size_t size = simd_traits<type>::size; + }; + + template <class T> + constexpr size_t revert_simd_traits<T>::size; + + template <class T> + struct revert_simd_traits<batch<T>> + { + using type = T; + static constexpr size_t size = batch<T>::size; + }; + + template <class T> + constexpr size_t revert_simd_traits<batch<T>>::size; + + template <class T> + using simd_type = typename simd_traits<T>::type; + + template <class T> + using simd_bool_type = typename simd_traits<T>::bool_type; + + template <class T> + using revert_simd_type = typename revert_simd_traits<T>::type; + + /******************** + * simd_return_type * + ********************/ + + namespace detail + { + template <class T1, class T2> + struct simd_condition + { + static constexpr bool value = (std::is_same<T1, T2>::value && !std::is_same<T1, bool>::value) || (std::is_same<T1, bool>::value && !std::is_same<T2, bool>::value) || std::is_same<T1, float>::value || std::is_same<T1, double>::value || std::is_same<T1, int8_t>::value || std::is_same<T1, 
uint8_t>::value || std::is_same<T1, int16_t>::value || std::is_same<T1, uint16_t>::value || std::is_same<T1, int32_t>::value || std::is_same<T1, uint32_t>::value || std::is_same<T1, int64_t>::value || std::is_same<T1, uint64_t>::value || std::is_same<T1, char>::value || detail::is_complex<T1>::value;
+        };
+
+        template <class T1, class T2, class A>
+        struct simd_return_type_impl
+            : std::enable_if<simd_condition<T1, T2>::value, batch<T2, A>>
+        {
+        };
+
+        template <class T2, class A>
+        struct simd_return_type_impl<bool, T2, A>
+            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
+        {
+        };
+
+        template <class T2, class A>
+        struct simd_return_type_impl<bool, std::complex<T2>, A>
+            : std::enable_if<simd_condition<bool, T2>::value, batch_bool<T2, A>>
+        {
+        };
+
+        template <class T1, class T2, class A>
+        struct simd_return_type_impl<std::complex<T1>, T2, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+        template <class T1, class T2, class A>
+        struct simd_return_type_impl<std::complex<T1>, std::complex<T2>, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+        template <class T1, class T2, bool I3EC, class A>
+        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, T2, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+        template <class T1, class T2, bool I3EC, class A>
+        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, std::complex<T2>, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+        template <class T1, class T2, bool I3EC, class A>
+        struct simd_return_type_impl<xtl::xcomplex<T1, T1, I3EC>, xtl::xcomplex<T2, T2, I3EC>, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+
+        template <class T1, class T2, bool I3EC, class A>
+        struct simd_return_type_impl<std::complex<T1>, xtl::xcomplex<T2, T2, I3EC>, A>
+            : std::enable_if<simd_condition<T1, T2>::value, batch<std::complex<T2>, A>>
+        {
+        };
+#endif
+    }
+
+    template <class T1, class T2, class A = default_arch>
+    using simd_return_type = typename detail::simd_return_type_impl<T1, T2, A>::type;
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type trait that inherits from @c std::true_type for @c batch<...> types and from
+     * @c std::false_type otherwise.
+     *
+     * @tparam T type to analyze.
+     */
+    template <class T>
+    struct is_batch;
+
+    template <class T>
+    struct is_batch : std::false_type
+    {
+    };
+
+    template <class T, class A>
+    struct is_batch<batch<T, A>> : std::true_type
+    {
+    };
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type trait that inherits from @c std::true_type for @c batch_bool<...> types and from
+     * @c std::false_type otherwise.
+     *
+     * @tparam T type to analyze.
+     */
+
+    template <class T>
+    struct is_batch_bool : std::false_type
+    {
+    };
+
+    template <class T, class A>
+    struct is_batch_bool<batch_bool<T, A>> : std::true_type
+    {
+    };
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type trait that inherits from @c std::true_type for @c batch<std::complex<...>>
+     * types and from @c std::false_type otherwise.
+     *
+     * @tparam T type to analyze.
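+     *
+     * A minimal illustration (hypothetical usage, assuming a supported
+     * default architecture):
+     * @code
+     * static_assert(xsimd::is_batch_complex<xsimd::batch<std::complex<float>>>::value, "");
+     * static_assert(!xsimd::is_batch_complex<xsimd::batch<float>>::value, "");
+     * @endcode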
+ */
+
+    template <class T>
+    struct is_batch_complex : std::false_type
+    {
+    };
+
+    template <class T, class A>
+    struct is_batch_complex<batch<std::complex<T>, A>> : std::true_type
+    {
+    };
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type trait whose @c type field is set to @c T::value_type if @c
+     * is_batch<T>::value and to @c T otherwise.
+     *
+     * @tparam T type to analyze.
+     */
+    template <class T>
+    struct scalar_type
+    {
+        using type = T;
+    };
+    template <class T, class A>
+    struct scalar_type<batch<T, A>>
+    {
+        using type = T;
+    };
+
+    template <class T>
+    using scalar_type_t = typename scalar_type<T>::type;
+
+    /**
+     * @ingroup batch_traits
+     *
+     * type trait whose @c type field is set to the @c batch_bool_type of @c T
+     * if @c is_batch<T>::value and to @c bool otherwise.
+     *
+     * @tparam T type to analyze.
+     */
+    template <class T>
+    struct mask_type
+    {
+        using type = bool;
+    };
+    template <class T, class A>
+    struct mask_type<batch<T, A>>
+    {
+        using type = typename batch<T, A>::batch_bool_type;
+    };
+
+    template <class T>
+    using mask_type_t = typename mask_type<T>::type;
+}
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_utils.hpp b/third_party/xsimd/include/xsimd/types/xsimd_utils.hpp
new file mode 100644
index 0000000000..aa890f2410
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_utils.hpp
@@ -0,0 +1,530 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ *                                                                         *
+ * Distributed under the terms of the BSD 3-Clause License.                *
+ *                                                                         *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_UTILS_HPP
+#define XSIMD_UTILS_HPP
+
+#include <complex>
+#include <cstdint>
+#include <cstring>
+#include <tuple>
+#include <type_traits>
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+#include "xtl/xcomplex.hpp"
+#endif
+
+namespace xsimd
+{
+
+    template <class T, class A>
+    class batch;
+
+    template <class T, class A>
+    class batch_bool;
+
+    /**************
+     * index      *
+     **************/
+
+    template <size_t I>
+    using index = std::integral_constant<size_t, I>;
+
+    /**************
+     * as_integer *
+     **************/
+
+    template <class T>
+    struct as_integer : std::make_signed<T>
+    {
+    };
+
+    template <>
+    struct as_integer<float>
+    {
+        using type = int32_t;
+    };
+
+    template <>
+    struct as_integer<double>
+    {
+        using type = int64_t;
+    };
+
+    template <class T, class A>
+    struct as_integer<batch<T, A>>
+    {
+        using type = batch<typename as_integer<T>::type, A>;
+    };
+
+    template <class B>
+    using as_integer_t = typename as_integer<B>::type;
+
+    /***********************
+     * as_unsigned_integer *
+     ***********************/
+
+    template <class T>
+    struct as_unsigned_integer : std::make_unsigned<T>
+    {
+    };
+
+    template <>
+    struct as_unsigned_integer<float>
+    {
+        using type = uint32_t;
+    };
+
+    template <>
+    struct as_unsigned_integer<double>
+    {
+        using type = uint64_t;
+    };
+
+    template <class T, class A>
+    struct as_unsigned_integer<batch<T, A>>
+    {
+        using type = batch<typename as_unsigned_integer<T>::type, A>;
+    };
+
+    template <class T>
+    using as_unsigned_integer_t = typename as_unsigned_integer<T>::type;
+
+    /*********************
+     * as_signed_integer *
+     *********************/
+
+    template <class T>
+    struct as_signed_integer : std::make_signed<T>
+    {
+    };
+
+    template
<class T> + using as_signed_integer_t = typename as_signed_integer<T>::type; + + /****************** + * flip_sign_type * + ******************/ + + namespace detail + { + template <class T, bool is_signed> + struct flipped_sign_type_impl : std::make_signed<T> + { + }; + + template <class T> + struct flipped_sign_type_impl<T, true> : std::make_unsigned<T> + { + }; + } + + template <class T> + struct flipped_sign_type + : detail::flipped_sign_type_impl<T, std::is_signed<T>::value> + { + }; + + template <class T> + using flipped_sign_type_t = typename flipped_sign_type<T>::type; + + /*********** + * as_float * + ************/ + + template <class T> + struct as_float; + + template <> + struct as_float<int32_t> + { + using type = float; + }; + + template <> + struct as_float<int64_t> + { + using type = double; + }; + + template <class T, class A> + struct as_float<batch<T, A>> + { + using type = batch<typename as_float<T>::type, A>; + }; + + template <class T> + using as_float_t = typename as_float<T>::type; + + /************** + * as_logical * + **************/ + + template <class T> + struct as_logical; + + template <class T, class A> + struct as_logical<batch<T, A>> + { + using type = batch_bool<T, A>; + }; + + template <class T> + using as_logical_t = typename as_logical<T>::type; + + /******************** + * bit_cast * + ********************/ + + template <class To, class From> + inline To bit_cast(From val) noexcept + { + static_assert(sizeof(From) == sizeof(To), "casting between compatible layout"); + // FIXME: Some old version of GCC don't support that trait + // static_assert(std::is_trivially_copyable<From>::value, "input type is trivially copyable"); + // static_assert(std::is_trivially_copyable<To>::value, "output type is trivially copyable"); + To res; + std::memcpy(&res, &val, sizeof(val)); + return res; + } + + namespace kernel + { + namespace detail + { + /************************************** + * enabling / disabling metafunctions * + **************************************/ + + template <class T> + using enable_integral_t = typename std::enable_if<std::is_integral<T>::value, int>::type; + + template <class T, size_t S> + using enable_sized_signed_t = typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value && sizeof(T) == S, int>::type; + + template <class T, size_t S> + using enable_sized_unsigned_t = typename std::enable_if<std::is_integral<T>::value && !std::is_signed<T>::value && sizeof(T) == S, int>::type; + + template <class T, size_t S> + using enable_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) == S, int>::type; + + template <class T, size_t S> + using enable_sized_t = typename std::enable_if<sizeof(T) == S, int>::type; + + template <class T, size_t S> + using enable_max_sized_integral_t = typename std::enable_if<std::is_integral<T>::value && sizeof(T) <= S, int>::type; + + /******************************** + * Matching & mismatching sizes * + ********************************/ + + template <class T, class U, class B = int> + using sizes_match_t = typename std::enable_if<sizeof(T) == sizeof(U), B>::type; + + template <class T, class U, class B = int> + using sizes_mismatch_t = typename std::enable_if<sizeof(T) != sizeof(U), B>::type; + + template <class T, class U, class B = int> + using stride_match_t = typename std::enable_if<!std::is_same<T, U>::value && sizeof(T) == sizeof(U), B>::type; + } // namespace detail + } // namespace kernel + + /***************************************** + * Backport of 
index_sequence from c++14 * + *****************************************/ + + // TODO: Remove this once we drop C++11 support + namespace detail + { + template <typename T> + struct identity + { + using type = T; + }; + +#ifdef __cpp_lib_integer_sequence + using std::index_sequence; + using std::integer_sequence; + using std::make_index_sequence; + using std::make_integer_sequence; + + using std::index_sequence_for; +#else + template <typename T, T... Is> + struct integer_sequence + { + using value_type = T; + static constexpr std::size_t size() noexcept { return sizeof...(Is); } + }; + + template <typename Lhs, typename Rhs> + struct make_integer_sequence_concat; + + template <typename T, T... Lhs, T... Rhs> + struct make_integer_sequence_concat<integer_sequence<T, Lhs...>, + integer_sequence<T, Rhs...>> + : identity<integer_sequence<T, Lhs..., (sizeof...(Lhs) + Rhs)...>> + { + }; + + template <typename T> + struct make_integer_sequence_impl; + + template <typename T> + struct make_integer_sequence_impl<std::integral_constant<T, (T)0>> : identity<integer_sequence<T>> + { + }; + + template <typename T> + struct make_integer_sequence_impl<std::integral_constant<T, (T)1>> : identity<integer_sequence<T, 0>> + { + }; + + template <typename T, T N> + struct make_integer_sequence_impl<std::integral_constant<T, N>> + : make_integer_sequence_concat<typename make_integer_sequence_impl<std::integral_constant<T, N / 2>>::type, + typename make_integer_sequence_impl<std::integral_constant<T, N - (N / 2)>>::type> + { + }; + + template <typename T, T N> + using make_integer_sequence = typename make_integer_sequence_impl<std::integral_constant<T, N>>::type; + + template <std::size_t... Is> + using index_sequence = integer_sequence<std::size_t, Is...>; + + template <std::size_t N> + using make_index_sequence = make_integer_sequence<std::size_t, N>; + + template <typename... Ts> + using index_sequence_for = make_index_sequence<sizeof...(Ts)>; + +#endif + + template <int... Is> + using int_sequence = integer_sequence<int, Is...>; + + template <int N> + using make_int_sequence = make_integer_sequence<int, N>; + + template <typename... Ts> + using int_sequence_for = make_int_sequence<(int)sizeof...(Ts)>; + + // Type-casted index sequence. + template <class P, size_t... Is> + inline P indexes_from(index_sequence<Is...>) noexcept + { + return { static_cast<typename P::value_type>(Is)... }; + } + + template <class P> + inline P make_sequence_as_batch() noexcept + { + return indexes_from<P>(make_index_sequence<P::size>()); + } + } + + /*********************************** + * Backport of std::get from C++14 * + ***********************************/ + + namespace detail + { + template <class T, class... Types, size_t I, size_t... Is> + inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, T>, index_sequence<I, Is...>) noexcept + { + return std::get<I>(t); + } + + template <class T, class U, class... Types, size_t I, size_t... Is> + inline const T& get_impl(const std::tuple<Types...>& t, std::is_same<T, U>, index_sequence<I, Is...>) noexcept + { + using tuple_elem = typename std::tuple_element<I + 1, std::tuple<Types...>>::type; + return get_impl<T>(t, std::is_same<T, tuple_elem>(), index_sequence<Is...>()); + } + + template <class T, class... 
Types> + inline const T& get(const std::tuple<Types...>& t) noexcept + { + using tuple_elem = typename std::tuple_element<0, std::tuple<Types...>>::type; + return get_impl<T>(t, std::is_same<T, tuple_elem>(), make_index_sequence<sizeof...(Types)>()); + } + } + + /********************************* + * Backport of void_t from C++17 * + *********************************/ + + namespace detail + { + template <class... T> + struct make_void + { + using type = void; + }; + + template <class... T> + using void_t = typename make_void<T...>::type; + } + + /************************************************** + * Equivalent of void_t but with size_t parameter * + **************************************************/ + + namespace detail + { + template <std::size_t> + struct check_size + { + using type = void; + }; + + template <std::size_t S> + using check_size_t = typename check_size<S>::type; + } + + /***************************************** + * Supplementary std::array constructors * + *****************************************/ + + namespace detail + { + // std::array constructor from scalar value ("broadcast") + template <typename T, std::size_t... Is> + inline constexpr std::array<T, sizeof...(Is)> + array_from_scalar_impl(const T& scalar, index_sequence<Is...>) noexcept + { + // You can safely ignore this silly ternary, the "scalar" is all + // that matters. The rest is just a dirty workaround... + return std::array<T, sizeof...(Is)> { (Is + 1) ? scalar : T()... }; + } + + template <typename T, std::size_t N> + inline constexpr std::array<T, N> + array_from_scalar(const T& scalar) noexcept + { + return array_from_scalar_impl(scalar, make_index_sequence<N>()); + } + + // std::array constructor from C-style pointer (handled as an array) + template <typename T, std::size_t... Is> + inline constexpr std::array<T, sizeof...(Is)> + array_from_pointer_impl(const T* c_array, index_sequence<Is...>) noexcept + { + return std::array<T, sizeof...(Is)> { c_array[Is]... }; + } + + template <typename T, std::size_t N> + inline constexpr std::array<T, N> + array_from_pointer(const T* c_array) noexcept + { + return array_from_pointer_impl(c_array, make_index_sequence<N>()); + } + } + + /************************ + * is_array_initializer * + ************************/ + + namespace detail + { + template <bool...> + struct bool_pack; + + template <bool... bs> + using all_true = std::is_same< + bool_pack<bs..., true>, bool_pack<true, bs...>>; + + template <typename T, typename... Args> + using is_all_convertible = all_true<std::is_convertible<Args, T>::value...>; + + template <typename T, std::size_t N, typename... Args> + using is_array_initializer = std::enable_if< + (sizeof...(Args) == N) && is_all_convertible<T, Args...>::value>; + + // Check that a variadic argument pack is a list of N values of type T, + // as usable for instantiating a value of type std::array<T, N>. + template <typename T, std::size_t N, typename... Args> + using is_array_initializer_t = typename is_array_initializer<T, N, Args...>::type; + } + + /************** + * is_complex * + **************/ + + // This is used in both xsimd_complex_base.hpp and xsimd_traits.hpp + // However xsimd_traits.hpp indirectly includes xsimd_complex_base.hpp + // so we cannot define is_complex in xsimd_traits.hpp. Besides, if + // no file defining batches is included, we still need this definition + // in xsimd_traits.hpp, so let's define it here. 
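+    //
+    // For example (purely illustrative): detail::is_complex<std::complex<double>>::value
+    // is true, while detail::is_complex<double>::value is false; xsimd_traits.hpp
+    // relies on exactly this to route complex scalars to batch<std::complex<T>>.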
+ + namespace detail + { + template <class T> + struct is_complex : std::false_type + { + }; + + template <class T> + struct is_complex<std::complex<T>> : std::true_type + { + }; + +#ifdef XSIMD_ENABLE_XTL_COMPLEX + template <class T, bool i3ec> + struct is_complex<xtl::xcomplex<T, T, i3ec>> : std::true_type + { + }; +#endif + } + + /******************* + * real_batch_type * + *******************/ + + template <class B> + struct real_batch_type + { + using type = B; + }; + + template <class T, class A> + struct real_batch_type<batch<std::complex<T>, A>> + { + using type = batch<T, A>; + }; + + template <class B> + using real_batch_type_t = typename real_batch_type<B>::type; + + /********************** + * complex_batch_type * + **********************/ + + template <class B> + struct complex_batch_type + { + using real_value_type = typename B::value_type; + using arch_type = typename B::arch_type; + using type = batch<std::complex<real_value_type>, arch_type>; + }; + + template <class T, class A> + struct complex_batch_type<batch<std::complex<T>, A>> + { + using type = batch<std::complex<T>, A>; + }; + + template <class B> + using complex_batch_type_t = typename complex_batch_type<B>::type; +} + +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp new file mode 100644 index 0000000000..237db95c6e --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_wasm_register.hpp @@ -0,0 +1,60 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * Copyright (c) Anutosh Bhat * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
 *
+ ****************************************************************************/
+
+#ifndef XSIMD_WASM_REGISTER_HPP
+#define XSIMD_WASM_REGISTER_HPP
+
+#include "xsimd_generic_arch.hpp"
+#include "xsimd_register.hpp"
+
+#if XSIMD_WITH_WASM
+#include <wasm_simd128.h>
+#endif
+
+namespace xsimd
+{
+    /**
+     * @ingroup architectures
+     *
+     * WASM instructions
+     */
+    struct wasm : generic
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_WASM; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr bool requires_alignment() noexcept { return true; }
+        static constexpr unsigned version() noexcept { return generic::version(10, 0, 0); }
+        static constexpr std::size_t alignment() noexcept { return 16; }
+        static constexpr char const* name() noexcept { return "wasm"; }
+    };
+
+#if XSIMD_WITH_WASM
+    namespace types
+    {
+        XSIMD_DECLARE_SIMD_REGISTER(signed char, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned char, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(char, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned short, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(short, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned int, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(int, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long int, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(long int, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(unsigned long long int, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(long long int, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(float, wasm, v128_t);
+        XSIMD_DECLARE_SIMD_REGISTER(double, wasm, v128_t);
+    }
+#endif
+}
+
+#endif
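
Note on the version() scheme used by every architecture in this diff: each
version() funnels into generic::version(major, minor, patch, multiplier = 100u),
which packs the three numbers into a single comparable identifier. A minimal
standalone sketch of that encoding (arch_version is a name invented here for
illustration; the expected values are taken from the headers above):

    #include <cassert>

    // Mirrors generic::version(): major * multiplier^2 + minor * multiplier + patch.
    constexpr unsigned arch_version(unsigned major, unsigned minor, unsigned patch,
                                    unsigned multiplier = 100u) noexcept
    {
        return major * multiplier * multiplier + minor * multiplier + patch;
    }

    int main()
    {
        assert(arch_version(1, 2, 0) == 10200);          // sse2
        assert(arch_version(1, 4, 2) == 10402);          // sse4_2
        assert(arch_version(8, 1, 0) == 80100);          // neon64
        assert(arch_version(1, 0, 0, 1000u) == 1000000); // rvv, multiplier = 1000
        return 0;
    }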