1 files changed, 2599 insertions, 0 deletions
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp
new file mode 100644
index 0000000000..0420f0a09d
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp
@@ -0,0 +1,2599 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_API_HPP
+#define XSIMD_API_HPP
+
+#include <complex>
+#include <cstddef>
+#include <limits>
+#include <ostream>
+
+#include "../arch/xsimd_isa.hpp"
+#include "../types/xsimd_batch.hpp"
+#include "../types/xsimd_traits.hpp"
+
+namespace xsimd
+{
+    /**
+     * high level free functions
+     *
+     * @defgroup batch_arithmetic Arithmetic operators
+     * @defgroup batch_constant Constant batches
+     * @defgroup batch_data_transfer Memory operators
+     * @defgroup batch_math Basic math operators
+     * @defgroup batch_math_extra Extra math operators
+     * @defgroup batch_fp Floating point manipulation
+     * @defgroup batch_rounding Rounding operators
+     * @defgroup batch_conversion Conversion operators
+     * @defgroup batch_complex_op Complex operators
+     * @defgroup batch_logical Logical operators
+     * @defgroup batch_bitwise Bitwise operators
+     * @defgroup batch_reducers Reducers
+     * @defgroup batch_miscellaneous Miscellaneous
+     * @defgroup batch_trigo Trigonometry
+     *
+     * @defgroup batch_bool_logical Boolean logical operators
+     * @defgroup batch_bool_reducers Boolean reducers
+     */
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the absolute value of each scalar in the batch \c x.
+     * @param x batch of integer or floating point values.
+     * @return a batch holding the absolute values of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> abs(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::abs<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_complex_op
+     *
+     * Computes the absolute value of each complex number in the batch \c z.
+     * @param z batch of complex values.
+     * @return a batch of \c T values holding the absolute values of \c z.
+     */
+    template <class T, class A>
+    inline batch<T, A> abs(batch<std::complex<T>, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::abs<A>(z, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the sum of the batches \c x and \c y.
+     * @param x batch or scalar involved in the addition.
+     * @param y batch or scalar involved in the addition.
+     * @return the sum of \c x and \c y, i.e. <tt>x + y</tt>.
+     */
+    template <class T, class A>
+    inline auto add(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x + y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x + y;
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc cosine of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the arc cosine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> acos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::acos<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the inverse hyperbolic cosine of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the inverse hyperbolic cosine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> acosh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::acosh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_complex_op
+     *
+     * Computes the argument of each value in the batch \c z.
+     * @param z batch of complex or real values.
+     * @return the argument of \c z.
+     */
+    template <class T, class A>
+    inline real_batch_type_t<batch<T, A>> arg(batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::arg<A>(z, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc sine of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the arc sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> asin(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::asin<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the inverse hyperbolic sine of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the inverse hyperbolic sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> asinh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::asinh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc tangent of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the arc tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> atan(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::atan<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the arc tangent of \c x/y for the batches \c x and \c y, using
+     * the signs of the arguments to determine the correct quadrant.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return a batch holding the arc tangent of \c x/y.
+     */
+    template <class T, class A>
+    inline batch<T, A> atan2(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::atan2<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the inverse hyperbolic tangent of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the inverse hyperbolic tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> atanh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::atanh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Performs a static_cast from \c T_in to \c T_out on \c x; both batch_bool types must have the same size.
+     * @param x batch_bool of \c T_in
+     * @return \c x cast to a batch_bool of \c T_out
+     */
+    template <class T_out, class T_in, class A>
+    inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T_out, A>();
+        detail::static_check_supported_config<T_in, A>();
+        static_assert(batch_bool<T_out, A>::size == batch_bool<T_in, A>::size, "Casting between incompatibles batch_bool types.");
+        return kernel::batch_bool_cast<A>(x, batch_bool<T_out, A> {}, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Performs a static_cast from \c T_in to \c T_out on \c x.
+     * @param x batch of \c T_in
+     * @return \c x cast to a batch of \c T_out
+     */
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> batch_cast(batch<T_in, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T_out, A>();
+        detail::static_check_supported_config<T_in, A>();
+        return kernel::batch_cast<A>(x, batch<T_out, A> {}, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes the bit of sign of \c x.
+     * @param x batch of scalars
+     * @return a batch holding the bit of sign of \c x
+     */
+    template <class T, class A>
+    inline batch<T, A> bitofsign(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitofsign<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise and of the batches \c x and \c y.
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return the result of the bitwise and, i.e. <tt>x & y</tt>.
+     */
+    template <class T, class A>
+    inline auto bitwise_and(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x & y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x & y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise and of the boolean batches \c x and \c y.
+     * @param x batch of booleans involved in the operation.
+     * @param y batch of booleans involved in the operation.
+     * @return the result of the bitwise and, i.e. <tt>x & y</tt>.
+     */
+    template <class T, class A>
+    inline auto bitwise_and(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x & y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x & y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise and not of the batches \c x and \c y.
+     * @param x batch involved in the operation.
+     * @param y batch involved in the operation.
+     * @return a batch holding the result of the bitwise and not.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_andnot(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_andnot<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_bool_logical
+     *
+     * Computes the bitwise and not of the boolean batches \c x and \c y.
+     * @param x batch of booleans involved in the operation.
+     * @param y batch of booleans involved in the operation.
+     * @return a boolean batch holding the result of the bitwise and not.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_andnot<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Performs a reinterpret_cast from \c T_in to \c T_out on \c x.
+     * @param x batch of \c T_in
+     * @return \c x reinterpreted as a batch of \c T_out
+     */
+    template <class T_out, class T_in, class A>
+    inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T_in, A>();
+        detail::static_check_supported_config<T_out, A>();
+        return kernel::bitwise_cast<A>(x, batch<T_out, A> {}, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Performs a bitwise shift to the left.
+     * @param x batch of \c T
+     * @param shift scalar amount to shift, or batch of shift amounts
+     * @return shifted \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_lshift(batch<T, A> const& x, int shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_lshift<A>(x, shift, A {});
+    }
+    template <class T, class A>
+    inline batch<T, A> bitwise_lshift(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_lshift<A>(x, shift, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise not of the batch \c x.
+     * @param x batch involved in the operation.
+     * @return a batch holding the result of the bitwise not.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_not(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_not<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise not of the boolean batch \c x.
+     * @param x batch of booleans involved in the operation.
+     * @return a boolean batch holding the result of the bitwise not.
+     */
+    template <class T, class A>
+    inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_not<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise or of the batches \c x and \c y.
+     * @param x scalar or batch of scalars
+     * @param y scalar or batch of scalars
+     * @return the result of the bitwise or, i.e. <tt>x | y</tt>.
+     */
+    template <class T, class A>
+    inline auto bitwise_or(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x | y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x | y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise or of the boolean batches \c x and \c y.
+     * @param x batch of booleans involved in the operation.
+     * @param y batch of booleans involved in the operation.
+     * @return the result of the bitwise or, i.e. <tt>x | y</tt>.
+     */
+    template <class T, class A>
+    inline auto bitwise_or(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x | y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x | y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Performs a bitwise shift to the right.
+     * @param x batch of \c T
+     * @param shift scalar amount to shift, or batch of shift amounts
+     * @return shifted \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> bitwise_rshift(batch<T, A> const& x, int shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_rshift<A>(x, shift, A {});
+    }
+    template <class T, class A>
+    inline batch<T, A> bitwise_rshift(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::bitwise_rshift<A>(x, shift, A {});
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise xor of the batches \c x and \c y.
+     * @param x scalar or batch of scalars
+     * @param y scalar or batch of scalars
+     * @return the result of the bitwise xor, i.e. <tt>x ^ y</tt>.
+     */
+    template <class T, class A>
+    inline auto bitwise_xor(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x ^ y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x ^ y;
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Computes the bitwise xor of the boolean batches \c x and \c y.
+     * @param x batch of booleans involved in the operation.
+     * @param y batch of booleans involved in the operation.
+     * @return the result of the bitwise xor, i.e. <tt>x ^ y</tt>.
+     */
+    template <class T, class A>
+    inline auto bitwise_xor(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x ^ y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x ^ y;
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the single value \c v, for the architecture \c A (\c default_arch by default).
+     * @param v the value used to initialize the batch
+     * @return a new batch instance, built by <tt>batch<T, A>::broadcast(v)</tt>
+     */
+    template <class T, class A = default_arch>
+    inline batch<T, A> broadcast(T v) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return batch<T, A>::broadcast(v);
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the single value \c v and
+     * the specified batch value type \c To.
+     * @param v the value used to initialize the batch; it is converted to \c bool when \c From is \c bool, otherwise to the value type of the returned batch
+     * @return a new batch instance of type <tt>simd_return_type<From, To, A></tt>
+     */
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<From, To, A> broadcast_as(From v) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        using value_type = typename std::conditional<std::is_same<From, bool>::value,
+                                                     bool,
+                                                     batch_value_type>::type;
+        return simd_return_type<From, To, A>(value_type(v));
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the cube root of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the cube root of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> cbrt(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::cbrt<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes the batch of the smallest integer values not less than
+     * the scalars in \c x.
+     * @param x batch of floating point values.
+     * @return the batch of the smallest integer values not less than \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> ceil(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::ceil<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Clips the values of the batch \c x between those of the batches \c lo and \c hi.
+     * @param x batch of scalar values to clip.
+     * @param lo batch of scalar values used as lower bounds.
+     * @param hi batch of scalar values used as upper bounds.
+     * @return a batch holding the result of the clipping.
+     */
+    template <class T, class A>
+    inline batch<T, A> clip(batch<T, A> const& x, batch<T, A> const& lo, batch<T, A> const& hi) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::clip(x, lo, hi, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Picks the elements of \c x selected by \c mask and appends them to the
+     * resulting vector, zeroing the remaining slots.
+     */
+    template <class T, class A>
+    inline batch<T, A> compress(batch<T, A> const& x, batch_bool<T, A> const& mask) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::compress<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_complex_op
+     *
+     * Computes the conjugate of the batch \c z.
+     * @param z batch of complex values.
+     * @return the conjugate of \c z.
+     */
+    template <class A, class T>
+    inline complex_batch_type_t<batch<T, A>> conj(batch<T, A> const& z) noexcept
+    {
+        return kernel::conj(z, A {});
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes a value whose absolute value matches
+     *        that of \c x, but whose sign bit matches that of \c y.
+     * @param x batch of scalars
+     * @param y batch of scalars
+     * @return batch whose absolute value matches that of \c x, but whose sign bit
+     * matches that of \c y.
+     */
+    template <class T, class A>
+    inline batch<T, A> copysign(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::copysign<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the cosine of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the cosine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> cos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::cos<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes the hyperbolic cosine of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the hyperbolic cosine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> cosh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::cosh<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Subtracts 1 from the batch \c x.
+     * @param x batch involved in the decrement.
+     * @return the result of subtracting 1 from \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> decr(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::decr<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Subtracts 1 from the batch \c x for each element where \c mask is true.
+     * @param x batch involved in the decrement.
+     * @param mask whether to perform the decrement or not. Can be a \c
+     *             batch_bool or a \c batch_bool_constant.
+     * @return the result of subtracting 1 from \c x where \c mask is true.
+     */
+    template <class T, class A, class Mask>
+    inline batch<T, A> decr_if(batch<T, A> const& x, Mask const& mask) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::decr_if<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the division of the batch \c x by the batch \c y.
+     * @param x scalar or batch of scalars
+     * @param y scalar or batch of scalars
+     * @return the result of the division, i.e. <tt>x / y</tt>.
+     */
+    template <class T, class A>
+    inline auto div(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x / y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x / y;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise equality comparison of the batches \c x and \c y.
+     * @param x batch of scalars
+     * @param y batch of scalars
+     * @return a boolean batch, i.e. <tt>x == y</tt>.
+     */
+    template <class T, class A>
+    inline auto eq(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x == y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x == y;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise equality comparison of the batches of boolean values \c x and \c y.
+     * @param x batch of booleans involved in the comparison.
+     * @param y batch of booleans involved in the comparison.
+     * @return a boolean batch, i.e. <tt>x == y</tt>.
+     */
+    template <class T, class A>
+    inline auto eq(batch_bool<T, A> const& x, batch_bool<T, A> const& y) noexcept -> decltype(x == y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x == y;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the natural exponential of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the natural exponential of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> exp(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::exp<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the base 10 exponential of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the base 10 exponential of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> exp10(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::exp10<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the base 2 exponential of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the base 2 exponential of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> exp2(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::exp2<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Loads contiguous elements from \c x and places them in the slots selected
+     * by \c mask, zeroing the other slots.
+     */
+    template <class T, class A>
+    inline batch<T, A> expand(batch<T, A> const& x, batch_bool<T, A> const& mask) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::expand<A>(x, mask, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the natural exponential of each value in the batch \c x, minus one.
+     * @param x batch of floating point values.
+     * @return a batch holding the natural exponential of \c x, minus one.
+     */
+    template <class T, class A>
+    inline batch<T, A> expm1(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::expm1<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the error function of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the error function of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> erf(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::erf<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the complementary error function of each value in the batch \c x.
+     * @param x batch of floating point values.
+     * @return a batch holding the complementary error function of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> erfc(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::erfc<A>(x, A {});
+    }
+
+    /**
+     * Extracts a vector from a pair of vectors:
+     * takes the lowest vector elements from the second source \c x
+     * and the highest vector elements from the first source \c y,
+     * then concatenates the results into the return value.
+     * @param x batch of integer or floating point values.
+     * @param y batch of integer or floating point values.
+     * @param i integer specifying the lowest vector element to extract from the first source register
+     * @return the batch resulting from the concatenation.
+     */
+    template <class T, class A>
+    inline batch<T, A> extract_pair(batch<T, A> const& x, batch<T, A> const& y, std::size_t i) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::extract_pair<A>(x, y, i, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the absolute value of each scalar in the batch \c x.
+     * @param x batch of floating point values.
+     * @return the absolute values of \c x, as computed by \c kernel::abs.
+     */
+    template <class T, class A>
+    inline batch<T, A> fabs(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::abs<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the positive difference between \c x and \c y, that is,
+     * <tt>max(0, x-y)</tt>.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return a batch holding the positive difference.
+     */
+    template <class T, class A>
+    inline batch<T, A> fdim(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fdim<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes the batch of the largest integer values not greater than
+     * the scalars in \c x.
+     * @param x batch of floating point values.
+     * @return the batch of the largest integer values not greater than \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> floor(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::floor<A>(x, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes <tt>(x*y) + z</tt> in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return a batch holding the result of the fused multiply-add operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fma<A>(x, y, z, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the larger values of the batches \c x and \c y.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the larger values, as computed by \c kernel::max.
+     */
+    template <class T, class A>
+    inline batch<T, A> fmax(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::max<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the smaller values of the batches \c x and \c y.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the smaller values, as computed by \c kernel::min.
+     */
+    template <class T, class A>
+    inline batch<T, A> fmin(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::min<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the modulo of the batch \c x by the batch \c y.
+     * @param x batch involved in the modulo.
+     * @param y batch involved in the modulo.
+     * @return a batch holding the result of the modulo.
+     */
+    template <class T, class A>
+    inline batch<T, A> fmod(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fmod<A>(x, y, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes <tt>(x*y) - z</tt> in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return a batch holding the result of the fused multiply-sub operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fms<A>(x, y, z, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes <tt>-(x*y) + z</tt> in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return a batch holding the result of the fused negated multiply-add operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fnma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fnma<A>(x, y, z, A {});
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes <tt>-(x*y) - z</tt> in a single instruction when possible.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @param z a batch of integer or floating point values.
+     * @return a batch holding the result of the fused negated multiply-sub operation.
+     */
+    template <class T, class A>
+    inline batch<T, A> fnms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::fnms<A>(x, y, z, A {});
+    }
+
+    /**
+     * @ingroup batch_fp
+     *
+     * Splits the number \c x into a normalized fraction and an exponent which is stored in \c y.
+     * @param x a batch of floating point values.
+     * @param y a batch of integer values, set to the exponent of each element of \c x.
+     * @return the normalized fraction of x
+     */
+    template <typename T, typename A>
+    inline batch<T, A> frexp(const batch<T, A>& x, batch<as_integer_t<T>, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::frexp<A>(x, y, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise greater or equal comparison of batches \c x and \c y.
+     * @tparam T the value type of the batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <typename T, typename A>
+    inline batch_bool<T, A> ge(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return x >= y;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise greater than comparison of batches \c x and \c y.
+     * @tparam T the value type of the batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <typename T, typename A>
+    inline batch_bool<T, A> gt(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return x > y;
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Parallel horizontal addition: adds the scalars of each batch
+     * in the array pointed by \c row and store them in a returned
+     * batch.
+     * @param row an array of \c N batches
+     * @return the result of the reduction.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> haddp(const batch<T, A>* row) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::haddp<A>(row, A());
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the square root of the sum of the squares of the batches
+     * \c x, and \c y.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return the square root of the sum of the squares of \c x and \c y.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> hypot(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::hypot<A>(x, y, A());
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the imaginary part of the batch \c x.
+     * @param x batch of complex or real values.
+     * @return the imaginary part of \c x.
+     */
+    template <typename T, typename A>
+    inline real_batch_type_t<batch<T, A>> imag(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::imag<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Add 1 to batch \c x.
+     * @param x batch involved in the increment.
+     * @return the sum of \c x and 1.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> incr(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::incr<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Add 1 to batch \c x for each element where \c mask is true.
+     * @param x batch involved in the increment.
+     * @param mask whether to perform the increment or not. Can be a \c
+     *             batch_bool or a \c batch_bool_constant.
+     * @return the sum of \c x and 1 when \c mask is true.
+     */
+    template <typename T, typename A, typename Mask>
+    inline batch<T, A> incr_if(const batch<T, A>& x, const Mask& mask) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::incr_if<A>(x, mask, A());
+    }
+
+    /**
+     * @ingroup batch_constant
+     *
+     * Return a batch of scalars representing positive infinity
+     * @tparam B the batch type to build; its \c value_type and \c arch_type
+     *           are passed to \c detail::static_check_supported_config.
+     * @return a batch of positive infinity
+     */
+    template <class B>
+    inline B infinity() noexcept
+    {
+        using T = typename B::value_type;
+        using A = typename B::arch_type;
+        detail::static_check_supported_config<T, A>();
+        // Broadcast the scalar infinity of the value type into the batch.
+        return B(std::numeric_limits<T>::infinity());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Create a new batch equivalent to \c x but with element \c val set at position \c pos
+     * @param x batch
+     * @param val value to set
+     * @param pos index of the updated slot
+     * @return copy of \c x with position \c pos set to \c val
+     */
+    template <typename T, typename A, size_t I>
+    inline batch<T, A> insert(const batch<T, A>& x, T val, index<I> pos) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::insert<A>(x, val, pos, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x represent an even integer value
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <typename T, typename A>
+    inline batch_bool<T, A> is_even(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::is_even<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the floating-point scalars in the given batch \c x represent integer value
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <typename T, typename A>
+    inline batch_bool<T, A> is_flint(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::is_flint<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x represent an odd integer value
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <typename T, typename A>
+    inline batch_bool<T, A> is_odd(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::is_odd<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x are inf values.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <typename T, typename A>
+    inline typename batch<T, A>::batch_bool_type isinf(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::isinf<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x are finite values.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <typename T, typename A>
+    inline typename batch<T, A>::batch_bool_type isfinite(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::isfinite<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Determines if the scalars in the given batch \c x are NaN values.
+     * @param x batch of floating point values.
+     * @return a batch of booleans.
+     */
+    template <typename T, typename A>
+    inline typename batch<T, A>::batch_bool_type isnan(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::isnan<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the multiplication of the floating point number \c x by 2 raised to the power \c y.
+     * @param x batch of floating point values.
+     * @param y batch of integer values.
+     * @return a batch of floating point values.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::ldexp<A>(x, y, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise lesser or equal to comparison of batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <typename T, typename A>
+    inline batch_bool<T, A> le(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return x <= y;
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes the natural logarithm of the gamma function of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the natural logarithm of the gamma function of \c x.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> lgamma(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::lgamma<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr and the specified
+     * batch value type \c To. The memory needs to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <typename To, typename A = default_arch, typename From>
+    inline simd_return_type<From, To, A> load_as(const From* ptr, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        detail::static_check_supported_config<To, A>();
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        return kernel::load_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A());
+    }
+
+    template <typename To, typename A = default_arch>
+    inline simd_return_type<bool, To, A> load_as(const bool* ptr, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        return simd_return_type<bool, To, A>::load_aligned(ptr);
+    }
+
+    // std::complex overload of the aligned converting load: forwards \c ptr to
+    // kernel::load_complex_aligned together with a conversion tag for the
+    // value type of the returned batch.
+    template <class To, class A = default_arch, class From>
+    inline simd_return_type<std::complex<From>, To, A> load_as(std::complex<From> const* ptr, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        // Also validate the source scalar type, as the unaligned std::complex
+        // overload and the generic load_as overloads do.
+        detail::static_check_supported_config<From, A>();
+        using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
+        return kernel::load_complex_aligned<A>(ptr, kernel::convert<batch_value_type> {}, A {});
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    // xtl::xcomplex overload of the aligned converting load. The pointer is
+    // reinterpreted as a pointer to std::complex<From> and the std::complex
+    // overload does the actual work.
+    // NOTE(review): this presumes xtl::xcomplex<From, From, i3ec> has the same
+    // layout as std::complex<From> - confirm against xtl.
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, aligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        // Forward A explicitly: load_as<To> alone would pick default_arch and
+        // ignore the architecture requested by the caller.
+        return load_as<To, A>(reinterpret_cast<std::complex<From> const*>(ptr), aligned_mode());
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr and the specified
+     * batch value type \c To. The memory does not need to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <typename To, typename A = default_arch, typename From>
+    inline simd_return_type<From, To, A> load_as(const From* ptr, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        using batch_value_type = typename simd_return_type<From, To, A>::value_type;
+        return kernel::load_unaligned<A>(ptr, kernel::convert<batch_value_type> {}, A());
+    }
+
+    // bool overload of the unaligned converting load: delegates to the
+    // load_unaligned static member of the returned batch type.
+    template <class To, class A = default_arch>
+    inline simd_return_type<bool, To, A> load_as(bool const* ptr, unaligned_mode) noexcept
+    {
+        // Same configuration check as the aligned bool overload.
+        detail::static_check_supported_config<To, A>();
+        return simd_return_type<bool, To, A>::load_unaligned(ptr);
+    }
+
+    template <typename To, typename A = default_arch, typename From>
+    inline simd_return_type<std::complex<From>, To, A> load_as(const std::complex<From>* ptr, unaligned_mode) noexcept
+    {
+        using batch_value_type = typename simd_return_type<std::complex<From>, To, A>::value_type;
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        return kernel::load_complex_unaligned<A>(ptr, kernel::convert<batch_value_type> {}, A());
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    // xtl::xcomplex overload of the unaligned converting load. The pointer is
+    // reinterpreted as a pointer to std::complex<From> and the std::complex
+    // overload does the actual work.
+    // NOTE(review): this presumes xtl::xcomplex<From, From, i3ec> has the same
+    // layout as std::complex<From> - confirm against xtl.
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline simd_return_type<xtl::xcomplex<From, From, i3ec>, To, A> load_as(xtl::xcomplex<From, From, i3ec> const* ptr, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<To, A>();
+        detail::static_check_supported_config<From, A>();
+        // Forward A explicitly: load_as<To> alone would pick default_arch and
+        // ignore the architecture requested by the caller.
+        return load_as<To, A>(reinterpret_cast<std::complex<From> const*>(ptr), unaligned_mode());
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory needs to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <typename A = default_arch, typename From>
+    inline batch<From, A> load(const From* ptr, aligned_mode = {}) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, aligned_mode());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory does not need to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <typename A = default_arch, typename From>
+    inline batch<From, A> load(const From* ptr, unaligned_mode) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, unaligned_mode());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory needs to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <typename A = default_arch, typename From>
+    inline batch<From, A> load_aligned(const From* ptr) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, aligned_mode());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Creates a batch from the buffer \c ptr. The
+     * memory does not need to be aligned.
+     * @param ptr the memory buffer to read
+     * @return a new batch instance
+     */
+    template <typename A = default_arch, typename From>
+    inline batch<From, A> load_unaligned(const From* ptr) noexcept
+    {
+        detail::static_check_supported_config<From, A>();
+        return load_as<From, A>(ptr, unaligned_mode());
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the natural logarithm of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the natural logarithm of \c x.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> log(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::log<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_math
+     * Computes the base 2 logarithm of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the base 2 logarithm of \c x.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> log2(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::log2<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_math
+     * Computes the base 10 logarithm of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the base 10 logarithm of \c x.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> log10(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::log10<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_math
+     * Computes the natural logarithm of one plus the batch \c x.
+     * @param x batch of floating point values.
+     * @return the natural logarithm of one plus \c x.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> log1p(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::log1p<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise lesser than comparison of batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <typename T, typename A>
+    inline batch_bool<T, A> lt(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return x < y;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the larger values of the batches \c x and \c y.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the larger values.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> max(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::max<A>(x, y, A());
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the smaller values of the batches \c x and \c y.
+     * @param x a batch of integer or floating point values.
+     * @param y a batch of integer or floating point values.
+     * @return a batch of the smaller values.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> min(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::min<A>(x, y, A());
+    }
+
+    /**
+     * @ingroup batch_constant
+     *
+     * Return a batch of scalars representing negative infinity
+     * @return a batch of negative infinity
+     */
+    template <typename B>
+    inline B minusinfinity() noexcept
+    {
+        using A = typename B::arch_type;
+        using T = typename B::value_type;
+        detail::static_check_supported_config<T, A>();
+        return B(-std::numeric_limits<T>::infinity());
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the integer modulo of the batch \c x by the batch \c y.
+     * @param x batch involved in the modulo.
+     * @param y batch involved in the modulo.
+     * @return the result of the modulo.
+     */
+    template <typename T, typename A>
+    inline auto mod(const batch<T, A>& x, const batch<T, A>& y) noexcept -> decltype(x % y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x % y;
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the product of the batches \c x and \c y.
+     * @tparam T the value type of the batches \c x and \c y.
+     * @param x batch involved in the product.
+     * @param y batch involved in the product.
+     * @return the result of the product.
+     */
+    template <typename T, typename A>
+    inline auto mul(const batch<T, A>& x, const batch<T, A>& y) noexcept -> decltype(x * y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x * y;
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars in \c x to integer values (in floating point format), using
+     * the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> nearbyint(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::nearbyint<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars in \c x to integer values (in integer format) using
+     * the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     *
+     * @warning For very large values the conversion to int silently overflows.
+     */
+    template <typename T, typename A>
+    inline batch<as_integer_t<T>, A>
+    nearbyint_as_int(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::nearbyint_as_int(x, A());
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise inequality comparison of batches \c x and \c y.
+     * @param x batch involved in the comparison.
+     * @param y batch involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <typename T, typename A>
+    inline auto neq(const batch<T, A>& x, const batch<T, A>& y) noexcept -> decltype(x != y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x != y;
+    }
+
+    /**
+     * @ingroup batch_logical
+     *
+     * Element-wise inequality comparison of batches of boolean values \c x and \c y.
+     * @param x batch of booleans involved in the comparison.
+     * @param y batch of booleans involved in the comparison.
+     * @return a boolean batch.
+     */
+    template <typename T, typename A>
+    inline auto neq(const batch_bool<T, A>& x, const batch_bool<T, A>& y) noexcept -> decltype(x != y)
+    {
+        detail::static_check_supported_config<T, A>();
+        return x != y;
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the opposite of the batch \c x.
+     * @param x batch involved in the operation.
+     * @return the opposite of \c x.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> neg(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return -x;
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Computes  the next representable  floating-point
+     *        value  following  x  in the direction of y
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return the next representable value after \c x in the direction of \c y.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> nextafter(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::nextafter<A>(x, y, A());
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the norm of the batch \c x.
+     * @param x batch of complex or real values.
+     * @return the norm of \c x.
+     */
+    template <typename T, typename A>
+    inline real_batch_type_t<batch<T, A>> norm(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::norm(x, A());
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Returns a complex batch with magnitude \c r and phase angle \c theta.
+     * @param r The magnitude of the desired complex result.
+     * @param theta The phase angle of the desired complex result.
+     * @return \c r exp(i * \c theta).
+     */
+    template <typename T, typename A>
+    inline complex_batch_type_t<batch<T, A>> polar(const batch<T, A>& r, const batch<T, A>& theta = batch<T, A> {}) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::polar<A>(r, theta, A());
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * No-op on \c x.
+     * @param x batch involved in the operation.
+     * @return \c x.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> pos(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return +x;
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the value of the batch \c x raised to the power
+     * \c y.
+     * @param x batch of floating point values.
+     * @param y batch of floating point values.
+     * @return \c x raised to the power \c y.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> pow(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::pow<A>(x, y, A());
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the value of the batch \c x raised to the power
+     * \c y.
+     * @param x batch of values.
+     * @param y integral exponent (a scalar, not a batch).
+     * @return \c x raised to the power \c y.
+     */
+    template <typename T, typename ITy, typename A, typename = typename std::enable_if<std::is_integral<ITy>::value, void>::type>
+    inline batch<T, A> pow(const batch<T, A>& x, ITy y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::ipow<A>(x, y, A());
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the projection of the batch \c z.
+     * @param z batch of complex or real values.
+     * @return the projection of \c z.
+     */
+    template <typename T, typename A>
+    inline complex_batch_type_t<batch<T, A>> proj(const batch<T, A>& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::proj(z, A());
+    }
+
+    /**
+     * @ingroup batch_complex
+     *
+     * Computes the real part of the batch \c z.
+     * @param z batch of complex or real values.
+     * @return the real part of \c z.
+     */
+    template <typename T, typename A>
+    inline real_batch_type_t<batch<T, A>> real(const batch<T, A>& z) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::real<A>(z, A());
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the approximate reciprocal of the batch \c x.
+     * The maximum relative error for this approximation is
+     * less than 1.5*2^-12.
+     * @param x batch of floating point numbers.
+     * @return the reciprocal.
+     */
+    template <typename T, typename A, typename = typename std::enable_if<std::is_floating_point<T>::value, void>::type>
+    inline batch<T, A> reciprocal(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reciprocal(x, A());
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Generic reducer using only batch operations
+     * @param f reducing function, accepting `batch ()(batch, batch)`
+     * @param x batch involved in the reduction
+     * @return the result of the reduction, as a scalar.
+     */
+    template <typename T, typename A, typename F>
+    inline T reduce(F&& f, const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::detail::reduce(std::forward<F>(f), x, std::integral_constant<unsigned, batch<T, A>::size>());
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Adds all the scalars of the batch \c x.
+     * @param x batch involved in the reduction
+     * @return the result of the reduction.
+     */
+    template <typename T, typename A>
+    inline T reduce_add(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reduce_add<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Max of all the scalars of the batch \c x.
+     * @param x batch involved in the reduction
+     * @return the result of the reduction.
+     */
+    template <typename T, typename A>
+    inline T reduce_max(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reduce_max<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_reducers
+     *
+     * Min of all the scalars of the batch \c x.
+     * @param x batch involved in the reduction
+     * @return the result of the reduction.
+     */
+    template <typename T, typename A>
+    inline T reduce_min(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::reduce_min<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes the remainder of dividing \c x by \c y
+     * @param x batch of scalar values
+     * @param y batch of scalar values
+     * @return the remainder of dividing \c x by \c y.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> remainder(const batch<T, A>& x, const batch<T, A>& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::remainder<A>(x, y, A());
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds the scalars in \c x to integer values (in floating point format), using
+     * the current rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of rounded values.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> rint(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return nearbyint(x);
+    }
+
+    /**
+     * @ingroup rotate_left
+     *
+     * Slide the whole batch to the left by \c N bytes, and reintroduce the
+     * slid out elements from the right. This is different from
+     * \c rotl that rotates the bits of each batch element to the left.
+     *
+     * @tparam N Amount of bytes to rotate to the left.
+     * @param x batch of integer values.
+     * @return rotated batch.
+     */
+    template <size_t N, typename T, typename A>
+    inline batch<T, A> rotate_left(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotate_left<N, A>(x, A());
+    }
+
+    /**
+     * @ingroup rotate_right
+     *
+     * Slide the whole batch to the right by \c N bytes, and reintroduce the
+     * slid out elements from the left. This is different from
+     * \c rotr that rotates the bits of each batch element to the right.
+     *
+     * @tparam N Amount of bytes to rotate to the right.
+     * @param x batch of integer values.
+     * @return rotated batch.
+     */
+    template <size_t N, typename T, typename A>
+    inline batch<T, A> rotate_right(const batch<T, A>& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotate_right<N, A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Perform a bitwise shift to the left, reintroducing the shifted out bits
+     * to the right
+     * @param x batch to rotate
+     * @param shift scalar amount to shift
+     * @return rotated \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> rotl(const batch<T, A>& x, int shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotl<A>(x, shift, A());
+    }
+    template <typename T, typename A>
+    inline batch<T, A> rotl(const batch<T, A>& x, const batch<T, A>& shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotl<A>(x, shift, A());
+    }
+
+    /**
+     * @ingroup batch_bitwise
+     *
+     * Perform a bitwise shift to the right, reintroducing the shifted out bits
+     * to the left.
+     * @param x batch to rotate
+     * @param shift scalar amount to shift
+     * @return rotated \c x.
+     */
+    template <typename T, typename A>
+    inline batch<T, A> rotr(const batch<T, A>& x, int shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::rotr<A>(x, shift, A());
+    }
+    template <class T, class A>
+    inline batch<T, A> rotr(batch<T, A> const& x, batch<T, A> const& shift) noexcept
+    {
+        detail::static_check_supported_config<T, A>(); // static check of the (T, A) pair
+        return kernel::rotr<A>(x, shift, A()); // overload taking a batch of rotation amounts
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Rounds each scalar of \c x to the nearest integer value, kept in floating
+     * point format. Halfway cases go away from zero, whatever the rounding mode.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values.
+     */
+    template <class T, class A>
+    inline batch<T, A> round(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::round<A>(x, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Computes an estimate of the inverse square root of the batch \c x.
+     *
+     * @warning Accuracy is traded for speed: unlike most xsimd functions, the
+     * result may differ from the one of the equivalent scalar operation.
+     * @param x batch of floating point values.
+     * @return an estimate of the inverse square root of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> rsqrt(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::rsqrt<A>(x, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the saturated sum of the batch \c x and the batch \c y.
+     *
+     * @param x first batch involved in the saturated addition.
+     * @param y second batch involved in the saturated addition.
+     * @return the result of the saturated addition.
+     */
+    template <class T, class A>
+    inline batch<T, A> sadd(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::sadd<A>(x, y, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Element-wise ternary operator: each result element comes from \c true_br or
+     * \c false_br depending on the matching boolean of the batch \c cond. Equivalent to
+     * \code{.cpp}
+     * for(std::size_t i = 0; i < N; ++i)
+     *     res[i] = cond[i] ? true_br[i] : false_br[i];
+     * \endcode
+     * @param cond boolean batch driving the selection.
+     * @param true_br batch of values used where \c cond is true.
+     * @param false_br batch of values used where \c cond is false.
+     * @return the result of the selection.
+     */
+    template <class T, class A>
+    inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::select<A>(cond, true_br, false_br, A());
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Element-wise ternary operator for complex batches: each result element comes from
+     * \c true_br or \c false_br depending on the matching boolean of \c cond. Equivalent to
+     * \code{.cpp}
+     * for(std::size_t i = 0; i < N; ++i)
+     *     res[i] = cond[i] ? true_br[i] : false_br[i];
+     * \endcode
+     * @param cond boolean batch (over the real type \c T) driving the selection.
+     * @param true_br batch of complex values used where \c cond is true.
+     * @param false_br batch of complex values used where \c cond is false.
+     * @return the result of the selection.
+     */
+    template <class T, class A>
+    inline batch<std::complex<T>, A> select(batch_bool<T, A> const& cond, batch<std::complex<T>, A> const& true_br, batch<std::complex<T>, A> const& false_br) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::select<A>(cond, true_br, false_br, A());
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Element-wise ternary operator with a compile-time condition: each result element comes
+     * from \c true_br or \c false_br depending on the matching value of \c cond. Equivalent to
+     * \code{.cpp}
+     * for(std::size_t i = 0; i < N; ++i)
+     *     res[i] = cond[i] ? true_br[i] : false_br[i];
+     * \endcode
+     * @param cond constant boolean batch driving the selection.
+     * @param true_br batch of values used where \c cond is true.
+     * @param false_br batch of values used where \c cond is false.
+     * @return the result of the selection.
+     */
+    template <class T, class A, bool... Values>
+    inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::select<A>(cond, true_br, false_br, A());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Combine elements from \c x and \c y according to selector \c mask
+     * @param x first batch
+     * @param y second batch
+     * @param mask constant batch mask of integer elements of the same size as
+     * the elements of \c x and \c y. Each element of the mask indexes the vector
+     * that would be formed by the concatenation of \c x and \c y. For instance
+     * \code{.cpp}
+     * batch_constant<batch<uint32_t, sse2>, 0, 4, 3, 7>
+     * \endcode
+     * Picks \c x[0], \c y[0], \c x[3], \c y[3]
+     * @return combined batch
+     */
+    template <class T, class A, class Vt, Vt... Values>
+    inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+    shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::shuffle<A>(x, y, mask, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes the sign of each element of \c x
+     * @param x batch
+     * @return -1 for each negative element and +1 for each positive element (NOTE(review): zero elements presumably map to 0; confirm against the kernel)
+     */
+    template <class T, class A>
+    inline batch<T, A> sign(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sign<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Computes the sign of each element of \c x, assuming \c x doesn't hold any zero
+     * @param x batch
+     * @return -1 for each negative element, -1 or +1 for each null element and +1 for each positive element
+     */
+    template <class T, class A>
+    inline batch<T, A> signnz(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::signnz<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Sine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> sin(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sin<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Computes both the sine and the cosine of the batch \c x, faster than two independent calls.
+     * @param x batch of floating point values.
+     * @return a pair containing the sine then the cosine of batch \c x
+     */
+    template <class T, class A>
+    inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::sincos<A>(x, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Hyperbolic sine of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the hyperbolic sine of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> sinh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sinh<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Slide the whole batch to the left by \c N bytes. This is different from
+     * \c bitwise_lshift that shifts each batch element to the left.
+     * @tparam N Amount of bytes to slide to the left.
+     * @param x batch of integer values (enforced by a static assertion).
+     * @return slided batch.
+     */
+    template <size_t N, class T, class A>
+    inline batch<T, A> slide_left(batch<T, A> const& x) noexcept
+    {
+        static_assert(std::is_integral<T>::value, "can only slide batch of integers");
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::slide_left<N, A>(x, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Slide the whole batch to the right by \c N bytes. This is different from
+     * \c bitwise_rshift that shifts each batch element to the right.
+     * @tparam N Amount of bytes to slide to the right.
+     * @param x batch of integer values (enforced by a static assertion).
+     * @return slided batch.
+     */
+    template <size_t N, class T, class A>
+    inline batch<T, A> slide_right(batch<T, A> const& x) noexcept
+    {
+        static_assert(std::is_integral<T>::value, "can only slide batch of integers");
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::slide_right<N, A>(x, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_math
+     *
+     * Square root of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the square root of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> sqrt(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::sqrt<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the saturated difference of the batch \c x and the batch \c y.
+     * @param x first batch involved in the saturated difference.
+     * @param y second batch involved in the saturated difference.
+     * @return the result of the saturated difference.
+     */
+    template <class T, class A>
+    inline batch<T, A> ssub(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::ssub<A>(x, y, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c src to the buffer \c dst, which needs to be aligned.
+     * @param dst the memory buffer to write to
+     * @param src the batch to copy
+     */
+    template <class To, class A = default_arch, class From>
+    inline void store_as(To* dst, batch<From, A> const& src, aligned_mode) noexcept
+    {
+        A const arch_tag {};
+        kernel::store_aligned(dst, src, arch_tag);
+    }
+
+    template <class A = default_arch, class From>
+    inline void store_as(bool* dst, batch_bool<From, A> const& src, aligned_mode) noexcept
+    {
+        kernel::store(src, dst, A()); // same kernel call as the unaligned_mode overload
+    }
+
+    template <class To, class A = default_arch, class From>
+    inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
+    {
+        kernel::store_complex_aligned(dst, src, A()); // dedicated kernel for complex batches
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept
+    {
+        store_as(reinterpret_cast<std::complex<To>*>(dst), src, aligned_mode {}); // view dst as std::complex<To>
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c src to the buffer \c dst, which does not need to be aligned.
+     * @param dst the memory buffer to write to
+     * @param src the batch to copy
+     */
+    template <class To, class A = default_arch, class From>
+    inline void store_as(To* dst, batch<From, A> const& src, unaligned_mode) noexcept
+    {
+        A const arch_tag {};
+        kernel::store_unaligned(dst, src, arch_tag);
+    }
+
+    template <class A = default_arch, class From>
+    inline void store_as(bool* dst, batch_bool<From, A> const& src, unaligned_mode) noexcept
+    {
+        kernel::store(src, dst, A()); // same kernel call as the aligned_mode overload
+    }
+
+    template <class To, class A = default_arch, class From>
+    inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
+    {
+        kernel::store_complex_unaligned(dst, src, A()); // dedicated kernel for complex batches
+    }
+
+#ifdef XSIMD_ENABLE_XTL_COMPLEX
+    template <class To, class A = default_arch, class From, bool i3ec>
+    inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept
+    {
+        store_as(reinterpret_cast<std::complex<To>*>(dst), src, unaligned_mode {}); // view dst as std::complex<To>
+    }
+#endif
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c val to the buffer \c mem. The
+     * memory needs to be aligned (aligned mode is the default of \c store).
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store(T* mem, batch<T, A> const& val, aligned_mode = {}) noexcept
+    {
+        store_as<T, A>(mem, val, aligned_mode {});
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c val to the buffer \c mem, which does not need to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store(T* mem, batch<T, A> const& val, unaligned_mode) noexcept
+    {
+        unaligned_mode const mode {};
+        store_as<T, A>(mem, val, mode);
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c val to the buffer \c mem, which needs to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy from
+     */
+    template <class A, class T>
+    inline void store_aligned(T* mem, batch<T, A> const& val) noexcept
+    {
+        aligned_mode const mode {};
+        store_as<T, A>(mem, val, mode);
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Copy content of batch \c val to the buffer \c mem, which does not need to be aligned.
+     * @param mem the memory buffer to write to
+     * @param val the batch to copy
+     */
+    template <class A, class T>
+    inline void store_unaligned(T* mem, batch<T, A> const& val) noexcept
+    {
+        unaligned_mode const mode {};
+        store_as<T, A>(mem, val, mode);
+    }
+
+    /**
+     * @ingroup batch_arithmetic
+     *
+     * Computes the difference between \c x and \c y, as \c x - \c y does.
+     * @param x batch of scalars
+     * @param y batch of scalars
+     * @return the difference between \c x and \c y
+     */
+    template <class T, class A>
+    inline auto sub(batch<T, A> const& x, batch<T, A> const& y) noexcept -> decltype(x - y)
+    {
+        detail::static_check_supported_config<T, A>();
+        auto difference = x - y;
+        return difference;
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Rearrange elements from \c x according to constant mask \c mask
+     * @param x batch
+     * @param mask constant batch mask of integers, each of the same size as an element of \c x
+     * @return swizzled batch
+     */
+    template <class T, class A, class Vt, Vt... Values>
+    inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+    swizzle(batch<T, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::swizzle<A>(x, mask, arch_tag);
+    }
+    template <class T, class A, class Vt, Vt... Values>
+    inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); // compared against the real type T
+        detail::static_check_supported_config<T, A>();
+        return kernel::swizzle<A>(x, mask, A());
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Rearrange elements from \c x according to mask \c mask
+     * @param x batch
+     * @param mask batch mask of integers, each of the same size as an element of \c x
+     * @return swizzled batch
+     */
+    template <class T, class A, class Vt>
+    inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type
+    swizzle(batch<T, A> const& x, batch<Vt, A> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask");
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::swizzle<A>(x, mask, arch_tag);
+    }
+
+    template <class T, class A, class Vt>
+    inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch<Vt, A> mask) noexcept
+    {
+        static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); // compared against the real type T
+        detail::static_check_supported_config<T, A>();
+        return kernel::swizzle<A>(x, mask, A());
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Tangent of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> tan(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::tan<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_trigo
+     *
+     * Hyperbolic tangent of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the hyperbolic tangent of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> tanh(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::tanh<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_math_extra
+     *
+     * Gamma function of the batch \c x.
+     * @param x batch of floating point values.
+     * @return the gamma function of \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> tgamma(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        return kernel::tgamma<A>(x, A());
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Converts \c i to the floating point type of the same size as \c T, as \c batch_cast<as_float_t<T>>(i) does.
+     * @param i batch of integers.
+     * @return \c i converted to a value of a floating point type of the same size as \c T
+     */
+    template <class T, class A>
+    inline batch<as_float_t<T>, A> to_float(batch<T, A> const& i) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        using float_type = as_float_t<T>;
+        return batch_cast<float_type>(i);
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Converts \c x to the integer type of the same size as \c T, as \c batch_cast<as_integer_t<T>>(x) does.
+     * @param x batch.
+     * @return \c x converted to a value of an integer type of the same size as \c T
+     */
+    template <class T, class A>
+    inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        using integer_type = as_integer_t<T>;
+        return batch_cast<integer_type>(x);
+    }
+
+    /**
+     * @ingroup batch_rounding
+     *
+     * Computes, for each scalar in \c x, the nearest integer value not greater in magnitude.
+     * @param x batch of floating point values.
+     * @return the batch of nearest integer values not greater in magnitude than \c x.
+     */
+    template <class T, class A>
+    inline batch<T, A> trunc(batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::trunc<A>(x, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Unpack and interleave data from the HIGH half of batches \c x and \c y into the returned batch.
+     * @param x a batch of integer or floating point or double precision values.
+     * @param y a batch of integer or floating point or double precision values.
+     * @return a batch of the high part of shuffled values.
+     */
+    template <class T, class A>
+    inline batch<T, A> zip_hi(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::zip_hi<A>(x, y, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_data_transfer
+     *
+     * Unpack and interleave data from the LOW half of batches \c x and \c y into the returned batch.
+     * @param x a batch of integer or floating point or double precision values.
+     * @param y a batch of integer or floating point or double precision values.
+     * @return a batch of the low part of shuffled values.
+     */
+    template <class T, class A>
+    inline batch<T, A> zip_lo(batch<T, A> const& x, batch<T, A> const& y) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        A const arch_tag {};
+        return kernel::zip_lo<A>(x, y, arch_tag);
+    }
+
+    /**
+     * @ingroup batch_conversion
+     *
+     * Turns a \c batch_bool of integral \c T into a \c batch of \c T: every
+     * true element of \c self becomes \c T(~T(0)) (all bits set, i.e. -1 for
+     * signed types) and every false element becomes 0.
+     *
+     * @param self batch_bool of \c T
+     * @return \c self cast to a \c batch of \c T
+     */
+    template <class T, class A, typename std::enable_if<std::is_integral<T>::value, int>::type = 3>
+    inline batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        T const zero(0);
+        return select(self, batch<T, A>(T(~zero)), batch<T, A>(zero));
+    }
+
+    template <class T, class A, typename std::enable_if<std::is_floating_point<T>::value, int>::type = 3>
+    inline batch<T, A> bitwise_cast(batch_bool<T, A> const& self) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        using bits_type = as_unsigned_integer_t<T>;
+        bits_type const all_ones(~bits_type(0));
+        T on(0); // overwritten just below with the all-ones bit pattern
+        std::memcpy(&on, &all_ones, sizeof(bits_type));
+        return select(self, batch<T, A>(on), batch<T, A>(T(0)));
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Returns true if all the boolean values in the batch are true, false otherwise.
+     * @param x the batch to reduce.
+     * @return a boolean scalar.
+     */
+    template <class T, class A>
+    inline bool all(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        bool const every_value_true = kernel::all<A>(x, A {});
+        return every_value_true;
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Returns true if any of the boolean values in the batch is true, false otherwise.
+     * @param x the batch to reduce.
+     * @return a boolean scalar.
+     */
+    template <class T, class A>
+    inline bool any(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        bool const some_value_true = kernel::any<A>(x, A {});
+        return some_value_true;
+    }
+
+    /**
+     * @ingroup batch_bool_reducers
+     *
+     * Returns true if none of the boolean values in the batch is true, i.e. the negation of \c any.
+     * @param x the batch to reduce.
+     * @return a boolean scalar.
+     */
+    template <class T, class A>
+    inline bool none(batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        bool const some_value_true = xsimd::any(x);
+        return !some_value_true;
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Writes the elements of batch \c x to stream \c o, separated by ", " and wrapped in parentheses.
+     * @param o the stream where the batch is dumped
+     * @param x batch to dump.
+     * @return a reference to \c o
+     */
+    template <class T, class A>
+    inline std::ostream& operator<<(std::ostream& o, batch<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        constexpr auto count = batch<T, A>::size;
+        alignas(A::alignment()) T lanes[count]; // scratch copy of x, aligned as store_aligned expects
+        x.store_aligned(lanes);
+        o << '(';
+        for (std::size_t idx = 0; idx < count - 1; ++idx)
+            o << lanes[idx] << ", "; // every element but the last gets a separator
+        return o << lanes[count - 1] << ')';
+    }
+
+    /**
+     * @ingroup batch_miscellaneous
+     *
+     * Writes the boolean elements of \c x to stream \c o, separated by ", " and wrapped in parentheses.
+     * @param o the stream where the batch is dumped
+     * @param x batch to dump.
+     * @return a reference to \c o
+     */
+    template <class T, class A>
+    inline std::ostream& operator<<(std::ostream& o, batch_bool<T, A> const& x) noexcept
+    {
+        detail::static_check_supported_config<T, A>();
+        constexpr auto count = batch_bool<T, A>::size;
+        alignas(A::alignment()) bool flags[count]; // scratch copy of x, aligned as store_aligned expects
+        x.store_aligned(flags);
+        o << '(';
+        for (std::size_t idx = 0; idx < count - 1; ++idx)
+            o << flags[idx] << ", "; // every element but the last gets a separator
+        return o << flags[count - 1] << ')';
+    }
+}
+
+#endif