Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp')
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp | 1424
 1 file changed, 1424 insertions, 0 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
new file mode 100644
index 0000000000..bc982c7ce6
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
@@ -0,0 +1,1424 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_NEON64_HPP
+#define XSIMD_NEON64_HPP
+
+#include <algorithm>
+#include <complex>
+#include <cstddef>
+#include <tuple>
+
+#include "../types/xsimd_neon64_register.hpp"
+#include "../types/xsimd_utils.hpp"
+
+namespace xsimd
+{
+ template <class batch_type, bool... Values>
+ struct batch_bool_constant;
+
+ namespace kernel
+ {
+ using namespace types;
+
+ /*******
+ * all *
+ *******/
+
+ template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return vminvq_u32(arg) == ~0U;
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+ inline bool all(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return all(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
+ }
+
+ /*******
+ * any *
+ *******/
+
+ template <class A, class T, detail::enable_sized_t<T, 4> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return vmaxvq_u32(arg) != 0;
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 1> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u8(arg)), neon64 {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 2> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u16(arg)), neon64 {});
+ }
+
+ template <class A, class T, detail::enable_sized_t<T, 8> = 0>
+ inline bool any(batch_bool<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return any(batch_bool<uint32_t, A>(vreinterpretq_u32_u64(arg)), neon64 {});
+ }
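+
+ // Both all() and any() funnel every element width through the uint32
+ // reduction above: batch_bool lanes hold only all-zero or all-one bit
+ // patterns, so after reinterpretation vminvq_u32(mask) == ~0U still means
+ // "every source lane set" and vmaxvq_u32(mask) != 0 means "at least one
+ // lane set". A rough public-API sketch (names from the xsimd front end):
+ //   xsimd::batch<float, xsimd::neon64> x(1.f), y(2.f);
+ //   bool b = xsimd::any(x < y); // true: every lane satisfies 1.f < 2.f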
+
+ /*************
+ * broadcast *
+ *************/
+
+ // Required to avoid ambiguous call
+ template <class A, class T>
+ inline batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept
+ {
+ return broadcast<neon64>(val, neon {});
+ }
+
+ template <class A>
+ inline batch<double, A> broadcast(double val, requires_arch<neon64>) noexcept
+ {
+ return vdupq_n_f64(val);
+ }
+
+ /*******
+ * set *
+ *******/
+
+ template <class A>
+ inline batch<double, A> set(batch<double, A> const&, requires_arch<neon64>, double d0, double d1) noexcept
+ {
+ return float64x2_t { d0, d1 };
+ }
+
+ template <class A>
+ inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<neon64>, bool b0, bool b1) noexcept
+ {
+ using register_type = typename batch_bool<double, A>::register_type;
+ using unsigned_type = as_unsigned_integer_t<double>;
+ return register_type { static_cast<unsigned_type>(b0 ? -1LL : 0LL),
+ static_cast<unsigned_type>(b1 ? -1LL : 0LL) };
+ }
+
+ /*************
+ * from_bool *
+ *************/
+
+ template <class A>
+ inline batch<double, A> from_bool(batch_bool<double, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(vandq_u64(arg, vreinterpretq_u64_f64(vdupq_n_f64(1.))));
+ }
+
+ /********
+ * load *
+ ********/
+#if defined(__clang__) || defined(__GNUC__)
+#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16))
+#elif defined(_MSC_VER)
+#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128)
+#else
+#define xsimd_aligned_load(inst, type, expr) inst((type)expr)
+#endif
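+
+ // On clang/gcc, xsimd_aligned_load(vld1q_f64, double*, src) below expands to
+ // roughly vld1q_f64((double*)__builtin_assume_aligned(src, 16)), i.e. the
+ // plain intrinsic plus an alignment hint the optimizer may exploit; the MSVC
+ // branch uses the *_ex intrinsic form with an explicit 128-bit alignment.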
+
+ template <class A>
+ inline batch<double, A> load_aligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
+ {
+ return xsimd_aligned_load(vld1q_f64, double*, src);
+ }
+
+ template <class A>
+ inline batch<double, A> load_unaligned(double const* src, convert<double>, requires_arch<neon64>) noexcept
+ {
+ return vld1q_f64(src);
+ }
+#undef xsimd_aligned_load
+
+ /*********
+ * store *
+ *********/
+
+ template <class A>
+ inline void store_aligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
+ {
+ vst1q_f64(dst, src);
+ }
+
+ template <class A>
+ inline void store_unaligned(double* dst, batch<double, A> const& src, requires_arch<neon64>) noexcept
+ {
+ return store_aligned<A>(dst, src, A {});
+ }
+
+ /****************
+ * load_complex *
+ ****************/
+
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex_aligned(std::complex<double> const* mem, convert<std::complex<double>>, requires_arch<neon64>) noexcept
+ {
+ using real_batch = batch<double, A>;
+ const double* buf = reinterpret_cast<const double*>(mem);
+ float64x2x2_t tmp = vld2q_f64(buf);
+ real_batch real = tmp.val[0],
+ imag = tmp.val[1];
+ return batch<std::complex<double>, A> { real, imag };
+ }
+
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex_unaligned(std::complex<double> const* mem, convert<std::complex<double>> cvt, requires_arch<neon64>) noexcept
+ {
+ return load_complex_aligned<A>(mem, cvt, A {});
+ }
+
+ /*****************
+ * store_complex *
+ *****************/
+
+ template <class A>
+ inline void store_complex_aligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
+ {
+ float64x2x2_t tmp;
+ tmp.val[0] = src.real();
+ tmp.val[1] = src.imag();
+ double* buf = reinterpret_cast<double*>(dst);
+ vst2q_f64(buf, tmp);
+ }
+
+ template <class A>
+ inline void store_complex_unaligned(std::complex<double>* dst, batch<std::complex<double>, A> const& src, requires_arch<neon64>) noexcept
+ {
+ store_complex_aligned(dst, src, A {});
+ }
+
+ /*******
+ * neg *
+ *******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_u64_s64(vnegq_s64(vreinterpretq_s64_u64(rhs)));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vnegq_s64(rhs);
+ }
+
+ template <class A>
+ inline batch<double, A> neg(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vnegq_f64(rhs);
+ }
+
+ /*******
+ * add *
+ *******/
+
+ template <class A>
+ inline batch<double, A> add(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vaddq_f64(lhs, rhs);
+ }
+
+ /********
+ * sadd *
+ ********/
+
+ template <class A>
+ inline batch<double, A> sadd(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return add(lhs, rhs, neon64 {});
+ }
+
+ /*******
+ * sub *
+ *******/
+
+ template <class A>
+ inline batch<double, A> sub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vsubq_f64(lhs, rhs);
+ }
+
+ /********
+ * ssub *
+ ********/
+
+ template <class A>
+ inline batch<double, A> ssub(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return sub(lhs, rhs, neon64 {});
+ }
+
+ /*******
+ * mul *
+ *******/
+
+ template <class A>
+ inline batch<double, A> mul(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vmulq_f64(lhs, rhs);
+ }
+
+ /*******
+ * div *
+ *******/
+
+#if defined(XSIMD_FAST_INTEGER_DIVISION)
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_u64_f64(vcvtq_f64_u64(lhs) / vcvtq_f64_u64(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_s64_f64(vcvtq_f64_s64(lhs) / vcvtq_f64_s64(rhs));
+ }
+#endif
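+
+ // AArch64 NEON has no vector integer division, so the guarded overloads
+ // above emulate 64-bit division through a float64 round trip, which is only
+ // exact for values representable in a double's 53-bit mantissa; hence they
+ // stay behind the XSIMD_FAST_INTEGER_DIVISION opt-in.
+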
+ template <class A>
+ inline batch<double, A> div(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vdivq_f64(lhs, rhs);
+ }
+
+ /******
+ * eq *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> eq(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_f64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_u64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> eq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vceqq_u64(lhs, rhs);
+ }
+
+ /*************
+ * fast_cast *
+ *************/
+ namespace detail
+ {
+ template <class A>
+ inline batch<double, A> fast_cast(batch<int64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_f64_s64(x);
+ }
+
+ template <class A>
+ inline batch<double, A> fast_cast(batch<uint64_t, A> const& x, batch<double, A> const&, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_f64_u64(x);
+ }
+
+ template <class A>
+ inline batch<int64_t, A> fast_cast(batch<double, A> const& x, batch<int64_t, A> const&, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_s64_f64(x);
+ }
+
+ template <class A>
+ inline batch<uint64_t, A> fast_cast(batch<double, A> const& x, batch<uint64_t, A> const&, requires_arch<neon64>) noexcept
+ {
+ return vcvtq_u64_f64(x);
+ }
+
+ }
+
+ /******
+ * lt *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcltq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcltq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> lt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcltq_f64(lhs, rhs);
+ }
+
+ /******
+ * le *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcleq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcleq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> le(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcleq_f64(lhs, rhs);
+ }
+
+ /******
+ * gt *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgtq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgtq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> gt(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgtq_f64(lhs, rhs);
+ }
+
+ /******
+ * ge *
+ ******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgeq_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgeq_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch_bool<double, A> ge(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vcgeq_f64(lhs, rhs);
+ }
+
+ /*******************
+ * batch_bool_cast *
+ *******************/
+
+ template <class A, class T_out, class T_in>
+ inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon64>) noexcept
+ {
+ using register_type = typename batch_bool<T_out, A>::register_type;
+ return register_type(self);
+ }
+
+ /***************
+ * bitwise_and *
+ ***************/
+
+ template <class A>
+ inline batch<double, A> bitwise_and(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(vandq_u64(vreinterpretq_u64_f64(lhs),
+ vreinterpretq_u64_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vandq_u64(lhs, rhs);
+ }
+
+ /**************
+ * bitwise_or *
+ **************/
+
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(vorrq_u64(vreinterpretq_u64_f64(lhs),
+ vreinterpretq_u64_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vorrq_u64(lhs, rhs);
+ }
+
+ /***************
+ * bitwise_xor *
+ ***************/
+
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(veorq_u64(vreinterpretq_u64_f64(lhs),
+ vreinterpretq_u64_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return veorq_u64(lhs, rhs);
+ }
+
+ /*******
+ * neq *
+ *******/
+
+ template <class A>
+ inline batch_bool<double, A> neq(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return bitwise_xor(lhs, rhs, A {});
+ }
+
+ /***************
+ * bitwise_not *
+ ***************/
+
+ template <class A>
+ inline batch<double, A> bitwise_not(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u32(vmvnq_u32(vreinterpretq_u32_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return detail::bitwise_not_u64(rhs);
+ }
+
+ /******************
+ * bitwise_andnot *
+ ******************/
+
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vreinterpretq_f64_u64(vbicq_u64(vreinterpretq_u64_f64(lhs),
+ vreinterpretq_u64_f64(rhs)));
+ }
+
+ template <class A>
+ inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& lhs, batch_bool<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vbicq_u64(lhs, rhs);
+ }
+
+ /*******
+ * min *
+ *******/
+
+ template <class A>
+ inline batch<double, A> min(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vminq_f64(lhs, rhs);
+ }
+
+ /*******
+ * max *
+ *******/
+
+ template <class A>
+ inline batch<double, A> max(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vmaxq_f64(lhs, rhs);
+ }
+
+ /*******
+ * abs *
+ *******/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return rhs;
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> abs(batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vabsq_s64(rhs);
+ }
+
+ template <class A>
+ inline batch<double, A> abs(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vabsq_f64(rhs);
+ }
+
+ template <class A>
+ inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<neon64>) noexcept
+ {
+ return vcvtnq_s32_f32(self);
+ }
+
+#if !defined(__GNUC__)
+ template <class A>
+ inline batch<int64_t, A> nearbyint_as_int(batch<double, A> const& self,
+ requires_arch<neon64>) noexcept
+ {
+ return vcvtnq_s64_f64(self);
+ }
+#endif
+
+ /**************
+ * reciprocal *
+ **************/
+
+ template <class A>
+ inline batch<double, A>
+ reciprocal(const batch<double, A>& x,
+ kernel::requires_arch<neon64>) noexcept
+ {
+ return vrecpeq_f64(x);
+ }
+
+ /********
+ * rsqrt *
+ ********/
+
+ template <class A>
+ inline batch<double, A> rsqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vrsqrteq_f64(rhs);
+ }
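+
+ // vrecpeq_f64 and vrsqrteq_f64 return hardware estimates (roughly 8 bits of
+ // precision); callers needing full precision typically refine the result
+ // with vrecpsq_f64 / vrsqrtsq_f64 Newton-Raphson steps.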
+
+ /********
+ * sqrt *
+ ********/
+
+ template <class A>
+ inline batch<double, A> sqrt(batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vsqrtq_f64(rhs);
+ }
+
+ /********************
+ * Fused operations *
+ ********************/
+
+#ifdef __ARM_FEATURE_FMA
+ template <class A>
+ inline batch<double, A> fma(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
+ {
+ return vfmaq_f64(z, x, y);
+ }
+
+ template <class A>
+ inline batch<double, A> fms(batch<double, A> const& x, batch<double, A> const& y, batch<double, A> const& z, requires_arch<neon64>) noexcept
+ {
+ return vfmaq_f64(-z, x, y);
+ }
+#endif
+
+ /*********
+ * haddp *
+ *********/
+
+ template <class A>
+ inline batch<double, A> haddp(const batch<double, A>* row, requires_arch<neon64>) noexcept
+ {
+ return vpaddq_f64(row[0], row[1]);
+ }
+
+ /**********
+ * insert *
+ **********/
+
+ template <class A, size_t I>
+ inline batch<double, A> insert(batch<double, A> const& self, double val, index<I>, requires_arch<neon64>) noexcept
+ {
+ return vsetq_lane_f64(val, self, I);
+ }
+
+ /******************
+ * reducer macros *
+ ******************/
+
+ // Wrap reducer intrinsics so we can pass them as function pointers
+ // - OP: intrinsics name prefix, e.g., vaddvq
+
+#define WRAP_REDUCER_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ inline uint8_t OP##_u8(uint8x16_t a) noexcept \
+ { \
+ return ::OP##_u8(a); \
+ } \
+ inline int8_t OP##_s8(int8x16_t a) noexcept \
+ { \
+ return ::OP##_s8(a); \
+ } \
+ inline uint16_t OP##_u16(uint16x8_t a) noexcept \
+ { \
+ return ::OP##_u16(a); \
+ } \
+ inline int16_t OP##_s16(int16x8_t a) noexcept \
+ { \
+ return ::OP##_s16(a); \
+ } \
+ inline uint32_t OP##_u32(uint32x4_t a) noexcept \
+ { \
+ return ::OP##_u32(a); \
+ } \
+ inline int32_t OP##_s32(int32x4_t a) noexcept \
+ { \
+ return ::OP##_s32(a); \
+ } \
+ }
+
+#define WRAP_REDUCER_INT(OP) \
+ WRAP_REDUCER_INT_EXCLUDING_64(OP) \
+ namespace wrap \
+ { \
+ inline uint64_t OP##_u64(uint64x2_t a) noexcept \
+ { \
+ return ::OP##_u64(a); \
+ } \
+ inline int64_t OP##_s64(int64x2_t a) noexcept \
+ { \
+ return ::OP##_s64(a); \
+ } \
+ }
+
+#define WRAP_REDUCER_FLOAT(OP) \
+ namespace wrap \
+ { \
+ inline float OP##_f32(float32x4_t a) noexcept \
+ { \
+ return ::OP##_f32(a); \
+ } \
+ inline double OP##_f64(float64x2_t a) noexcept \
+ { \
+ return ::OP##_f64(a); \
+ } \
+ }
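+
+ // As an illustration, WRAP_REDUCER_FLOAT(vaddvq) expands to
+ //   namespace wrap
+ //   {
+ //       inline float vaddvq_f32(float32x4_t a) noexcept { return ::vaddvq_f32(a); }
+ //       inline double vaddvq_f64(float64x2_t a) noexcept { return ::vaddvq_f64(a); }
+ //   }
+ // The named wrappers, unlike raw intrinsics (which may be macros or
+ // overloaded builtins), can be stored in the tuple-based dispatchers below.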
+
+ namespace detail
+ {
+ template <class R>
+ struct reducer_return_type_impl;
+
+ template <>
+ struct reducer_return_type_impl<uint8x16_t>
+ {
+ using type = uint8_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<int8x16_t>
+ {
+ using type = int8_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<uint16x8_t>
+ {
+ using type = uint16_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<int16x8_t>
+ {
+ using type = int16_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<uint32x4_t>
+ {
+ using type = uint32_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<int32x4_t>
+ {
+ using type = int32_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<uint64x2_t>
+ {
+ using type = uint64_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<int64x2_t>
+ {
+ using type = int64_t;
+ };
+
+ template <>
+ struct reducer_return_type_impl<float32x4_t>
+ {
+ using type = float;
+ };
+
+ template <>
+ struct reducer_return_type_impl<float64x2_t>
+ {
+ using type = double;
+ };
+
+ template <class R>
+ using reducer_return_type = typename reducer_return_type_impl<R>::type;
+
+ template <class... T>
+ struct neon_reducer_dispatcher_impl : neon_dispatcher_base<reducer_return_type, T...>
+ {
+ };
+
+ using neon_reducer_dispatcher = neon_reducer_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t, float64x2_t>;
+ template <class T>
+ using enable_neon64_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value || std::is_same<T, double>::value,
+ int>::type;
+ }
+
+ /**************
+ * reduce_add *
+ **************/
+
+ WRAP_REDUCER_INT(vaddvq)
+ WRAP_REDUCER_FLOAT(vaddvq)
+
+ template <class A, class T, detail::enable_neon64_type_t<T> = 0>
+ inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_reducer_dispatcher::unary dispatcher = {
+ std::make_tuple(wrap::vaddvq_u8, wrap::vaddvq_s8, wrap::vaddvq_u16, wrap::vaddvq_s16,
+ wrap::vaddvq_u32, wrap::vaddvq_s32, wrap::vaddvq_u64, wrap::vaddvq_s64,
+ wrap::vaddvq_f32, wrap::vaddvq_f64)
+ };
+ return dispatcher.apply(register_type(arg));
+ }
+
+ /**************
+ * reduce_max *
+ **************/
+
+ WRAP_REDUCER_INT_EXCLUDING_64(vmaxvq)
+ WRAP_REDUCER_FLOAT(vmaxvq)
+
+ namespace wrap
+ {
+ inline uint64_t vmaxvq_u64(uint64x2_t a) noexcept
+ {
+ return std::max(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
+ }
+
+ inline int64_t vmaxvq_s64(int64x2_t a) noexcept
+ {
+ return std::max(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
+ }
+ }
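+
+ // AArch64 provides vmaxvq/vminvq only for 8/16/32-bit integers and for
+ // floating point, so the 64-bit reductions are emulated by comparing the two
+ // lanes directly (vdupd_laneq_u64/s64 extract a lane as a scalar); the same
+ // workaround is used for vminvq_u64 / vminvq_s64 below.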
+
+ template <class A, class T, detail::enable_neon64_type_t<T> = 0>
+ inline typename batch<T, A>::value_type reduce_max(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_reducer_dispatcher::unary dispatcher = {
+ std::make_tuple(wrap::vmaxvq_u8, wrap::vmaxvq_s8, wrap::vmaxvq_u16, wrap::vmaxvq_s16,
+ wrap::vmaxvq_u32, wrap::vmaxvq_s32, wrap::vmaxvq_u64, wrap::vmaxvq_s64,
+ wrap::vmaxvq_f32, wrap::vmaxvq_f64)
+ };
+ return dispatcher.apply(register_type(arg));
+ }
+
+ /**************
+ * reduce_min *
+ **************/
+
+ WRAP_REDUCER_INT_EXCLUDING_64(vminvq)
+ WRAP_REDUCER_FLOAT(vminvq)
+
+ namespace wrap
+ {
+ inline uint64_t vminvq_u64(uint64x2_t a) noexcept
+ {
+ return std::min(vdupd_laneq_u64(a, 0), vdupd_laneq_u64(a, 1));
+ }
+
+ inline int64_t vminvq_s64(int64x2_t a) noexcept
+ {
+ return std::min(vdupd_laneq_s64(a, 0), vdupd_laneq_s64(a, 1));
+ }
+ }
+
+ template <class A, class T, detail::enable_neon64_type_t<T> = 0>
+ inline typename batch<T, A>::value_type reduce_min(batch<T, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_reducer_dispatcher::unary dispatcher = {
+ std::make_tuple(wrap::vminvq_u8, wrap::vminvq_s8, wrap::vminvq_u16, wrap::vminvq_s16,
+ wrap::vminvq_u32, wrap::vminvq_s32, wrap::vminvq_u64, wrap::vminvq_s64,
+ wrap::vminvq_f32, wrap::vminvq_f64)
+ };
+ return dispatcher.apply(register_type(arg));
+ }
+
+#undef WRAP_REDUCER_INT_EXCLUDING_64
+#undef WRAP_REDUCER_INT
+#undef WRAP_REDUCER_FLOAT
+
+ /**********
+ * select *
+ **********/
+
+ template <class A>
+ inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& a, batch<double, A> const& b, requires_arch<neon64>) noexcept
+ {
+ return vbslq_f64(cond, a, b);
+ }
+
+ template <class A, bool... b>
+ inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&,
+ batch<double, A> const& true_br,
+ batch<double, A> const& false_br,
+ requires_arch<neon64>) noexcept
+ {
+ return select(batch_bool<double, A> { b... }, true_br, false_br, neon64 {});
+ }
+ /**********
+ * zip_lo *
+ **********/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_f64(lhs, rhs);
+ }
+
+ /**********
+ * zip_hi *
+ **********/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_u64(lhs, rhs);
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_s64(lhs, rhs);
+ }
+
+ template <class A>
+ inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_f64(lhs, rhs);
+ }
+
+ /****************
+ * extract_pair *
+ ****************/
+
+ namespace detail
+ {
+ template <class A, size_t I, size_t... Is>
+ inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n,
+ ::xsimd::detail::index_sequence<I, Is...>) noexcept
+ {
+ if (n == I)
+ {
+ return vextq_f64(rhs, lhs, I);
+ }
+ else
+ {
+ return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>());
+ }
+ }
+ }
+
+ template <class A>
+ inline batch<double, A> extract_pair(batch<double, A> const& lhs, batch<double, A> const& rhs, std::size_t n, requires_arch<neon64>) noexcept
+ {
+ constexpr std::size_t size = batch<double, A>::size;
+ assert(n < size && "index in bounds");
+ return detail::extract_pair(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>());
+ }
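+
+ // vextq_f64 requires its rotation count as a compile-time immediate, so the
+ // runtime index n is resolved by walking an index_sequence: detail::extract_pair
+ // compares n against each candidate I in turn and issues vextq_f64 with that
+ // constant.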
+
+ /******************
+ * bitwise_rshift *
+ ******************/
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
+ {
+ return bitwise_rshift<A>(lhs, n, neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vshlq_u64(lhs, vnegq_s64(rhs));
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon64>) noexcept
+ {
+ return bitwise_rshift<A>(lhs, n, neon {});
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vshlq_s64(lhs, vnegq_s64(rhs));
+ }
+
+ /****************
+ * bitwise_cast *
+ ****************/
+
+#define WRAP_CAST(SUFFIX, TYPE) \
+ namespace wrap \
+ { \
+ inline float64x2_t vreinterpretq_f64_##SUFFIX(TYPE a) noexcept \
+ { \
+ return ::vreinterpretq_f64_##SUFFIX(a); \
+ } \
+ inline TYPE vreinterpretq_##SUFFIX##_f64(float64x2_t a) noexcept \
+ { \
+ return ::vreinterpretq_##SUFFIX##_f64(a); \
+ } \
+ }
+
+ WRAP_CAST(u8, uint8x16_t)
+ WRAP_CAST(s8, int8x16_t)
+ WRAP_CAST(u16, uint16x8_t)
+ WRAP_CAST(s16, int16x8_t)
+ WRAP_CAST(u32, uint32x4_t)
+ WRAP_CAST(s32, int32x4_t)
+ WRAP_CAST(u64, uint64x2_t)
+ WRAP_CAST(s64, int64x2_t)
+ WRAP_CAST(f32, float32x4_t)
+
+#undef WRAP_CAST
+
+ template <class A, class T>
+ inline batch<double, A> bitwise_cast(batch<T, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
+ {
+ using caster_type = detail::bitwise_caster_impl<float64x2_t,
+ uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t>;
+ const caster_type caster = {
+ std::make_tuple(wrap::vreinterpretq_f64_u8, wrap::vreinterpretq_f64_s8, wrap::vreinterpretq_f64_u16, wrap::vreinterpretq_f64_s16,
+ wrap::vreinterpretq_f64_u32, wrap::vreinterpretq_f64_s32, wrap::vreinterpretq_f64_u64, wrap::vreinterpretq_f64_s64,
+ wrap::vreinterpretq_f64_f32)
+ };
+ using register_type = typename batch<T, A>::register_type;
+ return caster.apply(register_type(arg));
+ }
+
+ namespace detail
+ {
+ template <class S, class... R>
+ struct bitwise_caster_neon64
+ {
+ using container_type = std::tuple<R (*)(S)...>;
+ container_type m_func;
+
+ template <class V>
+ V apply(float64x2_t rhs) const
+ {
+ using func_type = V (*)(float64x2_t);
+ auto func = xsimd::detail::get<func_type>(m_func);
+ return func(rhs);
+ }
+ };
+ }
+
+ template <class A, class R>
+ inline batch<R, A> bitwise_cast(batch<double, A> const& arg, batch<R, A> const&, requires_arch<neon64>) noexcept
+ {
+ using caster_type = detail::bitwise_caster_neon64<float64x2_t,
+ uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t,
+ uint64x2_t, int64x2_t,
+ float32x4_t>;
+ const caster_type caster = {
+ std::make_tuple(wrap::vreinterpretq_u8_f64, wrap::vreinterpretq_s8_f64, wrap::vreinterpretq_u16_f64, wrap::vreinterpretq_s16_f64,
+ wrap::vreinterpretq_u32_f64, wrap::vreinterpretq_s32_f64, wrap::vreinterpretq_u64_f64, wrap::vreinterpretq_s64_f64,
+ wrap::vreinterpretq_f32_f64)
+ };
+ using src_register_type = typename batch<double, A>::register_type;
+ using dst_register_type = typename batch<R, A>::register_type;
+ return caster.apply<dst_register_type>(src_register_type(arg));
+ }
+
+ template <class A>
+ inline batch<double, A> bitwise_cast(batch<double, A> const& arg, batch<double, A> const&, requires_arch<neon64>) noexcept
+ {
+ return arg;
+ }
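+
+ // Like the reducer wrappers above, WRAP_CAST turns the vreinterpretq_*
+ // intrinsics into addressable functions so the tuple-based casters can pick
+ // the right conversion: the first overload dispatches on the source register
+ // type, the second on the destination register type, and double -> double is
+ // the trivial identity.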
+
+ /*********
+ * isnan *
+ *********/
+
+ template <class A>
+ inline batch_bool<double, A> isnan(batch<double, A> const& arg, requires_arch<neon64>) noexcept
+ {
+ return !(arg == arg);
+ }
+
+ /****************
+ * rotate_right *
+ ****************/
+ template <size_t N, class A>
+ inline batch<double, A> rotate_right(batch<double, A> const& a, requires_arch<neon64>) noexcept
+ {
+ return vextq_f64(a, a, N);
+ }
+ }
+
+ template <class batch_type, typename batch_type::value_type... Values>
+ struct batch_constant;
+
+ namespace kernel
+ {
+ /*********************
+ * swizzle (dynamic) *
+ *********************/
+ template <class A>
+ inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch<uint8_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return vqtbl1q_u8(self, idx);
+ }
+
+ template <class A>
+ inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch<uint8_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return vqtbl1q_s8(self, idx);
+ }
+
+ template <class A>
+ inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
+ batch<uint16_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ using index_type = batch<uint8_t, A>;
+ return vreinterpretq_u16_u8(swizzle(batch_type(vreinterpretq_u8_u16(self)),
+ index_type(vreinterpretq_u8_u16(idx * 0x0202 + 0x0100)),
+ neon64 {}));
+ }
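+
+ // vqtbl1q_u8 only performs byte-granularity lookups, so the 16-bit indices
+ // are first expanded to byte indices: for a lane index i, i * 0x0202 + 0x0100
+ // yields the little-endian byte pair (2*i, 2*i + 1), i.e. the two bytes of
+ // halfword i. The 32- and 64-bit overloads below apply the same trick with
+ // wider multipliers.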
+
+ template <class A>
+ inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
+ batch<uint16_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), idx, neon64 {}));
+ }
+
+ template <class A>
+ inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
+ batch<uint32_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ using index_type = batch<uint8_t, A>;
+ return vreinterpretq_u32_u8(swizzle(batch_type(vreinterpretq_u8_u32(self)),
+ index_type(vreinterpretq_u8_u32(idx * 0x04040404 + 0x03020100)),
+ neon64 {}));
+ }
+
+ template <class A>
+ inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
+ batch<uint32_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), idx, neon64 {}));
+ }
+
+ template <class A>
+ inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
+ batch<uint64_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ using index_type = batch<uint8_t, A>;
+ return vreinterpretq_u64_u8(swizzle(batch_type(vreinterpretq_u8_u64(self)),
+ index_type(vreinterpretq_u8_u64(idx * 0x0808080808080808ull + 0x0706050403020100ull)),
+ neon64 {}));
+ }
+
+ template <class A>
+ inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
+ batch<uint64_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), idx, neon64 {}));
+ }
+
+ template <class A>
+ inline batch<float, A> swizzle(batch<float, A> const& self,
+ batch<uint32_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return bitwise_cast<float>(swizzle(bitwise_cast<uint32_t>(self), idx, neon64 {}));
+ }
+
+ template <class A>
+ inline batch<double, A> swizzle(batch<double, A> const& self,
+ batch<uint64_t, A> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return bitwise_cast<double>(swizzle(bitwise_cast<uint64_t>(self), idx, neon64 {}));
+ }
+
+ /********************
+ * swizzle (static) *
+ ********************/
+
+ namespace detail
+ {
+ using ::xsimd::batch_constant;
+ using ::xsimd::detail::integer_sequence;
+ using ::xsimd::detail::make_integer_sequence;
+
+ template <class CB1, class CB2, class IS>
+ struct index_burst_impl;
+
+ template <class B1, class B2, typename B2::value_type... V,
+ typename B2::value_type... incr>
+ struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>,
+ integer_sequence<typename B2::value_type, incr...>>
+ {
+ using type = batch_constant<B2, V...>;
+ };
+
+ template <class B1, typename B1::value_type V0, typename B1::value_type... V1,
+ class B2, typename B2::value_type... V2,
+ typename B2::value_type... incr>
+ struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>,
+ integer_sequence<typename B2::value_type, incr...>>
+ {
+ using value_type = typename B2::value_type;
+ using next_input = batch_constant<B1, V1...>;
+ using next_output = batch_constant<B2, V2..., (V0 + incr)...>;
+ using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type;
+ };
+
+ template <class B, class T>
+ struct index_burst;
+
+ template <class B, typename B::value_type... V, class T>
+ struct index_burst<batch_constant<B, V...>, T>
+ {
+ static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T);
+ using input = batch_constant<B, (mul * V)...>;
+ using output = batch_constant<batch<T, typename B::arch_type>>;
+ using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type;
+ };
+
+ template <class B, class T>
+ using index_burst_t = typename index_burst<B, T>::type;
+
+ template <class T, class B>
+ inline index_burst_t<B, T> burst_index(B)
+ {
+ return index_burst_t<B, T>();
+ }
+ }
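+
+ // burst_index<uint8_t>(idx) rewrites a batch_constant of wide lane indices
+ // into the equivalent batch_constant of byte indices at compile time, e.g.
+ //   batch_constant<batch<uint32_t, A>, 1, 0, 3, 2>
+ // becomes
+ //   batch_constant<batch<uint8_t, A>, 4, 5, 6, 7, 0, 1, 2, 3, 12, 13, 14, 15, 8, 9, 10, 11>
+ // which the byte-wise vqtbl1q_u8 swizzles below can consume directly.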
+
+ template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+ uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+ inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self,
+ batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return vqtbl1q_u8(self, batch<uint8_t, A>(idx));
+ }
+
+ template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7,
+ uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15>
+ inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self,
+ batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return vqtbl1q_s8(self, batch<uint8_t, A>(idx));
+ }
+
+ template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+ inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self,
+ batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_u16_u8(swizzle<A>(batch_type(vreinterpretq_u8_u16(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7>
+ inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self,
+ batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<int8_t, A>;
+ return vreinterpretq_s16_s8(swizzle<A>(batch_type(vreinterpretq_s8_s16(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self,
+ batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_u32_u8(swizzle<A>(batch_type(vreinterpretq_u8_u32(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self,
+ batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<int8_t, A>;
+ return vreinterpretq_s32_s8(swizzle<A>(batch_type(vreinterpretq_s8_s32(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_u64_u8(swizzle<A>(batch_type(vreinterpretq_u8_u64(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<int8_t, A>;
+ return vreinterpretq_s64_s8(swizzle<A>(batch_type(vreinterpretq_s8_s64(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<float, A> swizzle(batch<float, A> const& self,
+ batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_f32_u8(swizzle<A>(batch_type(vreinterpretq_u8_f32(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<double, A> swizzle(batch<double, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1> idx,
+ requires_arch<neon64>) noexcept
+ {
+ using batch_type = batch<uint8_t, A>;
+ return vreinterpretq_f64_u8(swizzle<A>(batch_type(vreinterpretq_u8_f64(self)), detail::burst_index<uint8_t>(idx), A()));
+ }
+
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3>
+ inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self,
+ batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return batch<std::complex<float>, A>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
+ }
+
+ template <class A, uint64_t V0, uint64_t V1>
+ inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1> idx,
+ requires_arch<neon64>) noexcept
+ {
+ return batch<std::complex<double>, A>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A()));
+ }
+ }
+}
+
+#endif