Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp')
-rw-r--r-- | third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp | 2670 |
1 file changed, 2670 insertions, 0 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp new file mode 100644 index 0000000000..57c662cd63 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp @@ -0,0 +1,2670 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_NEON_HPP +#define XSIMD_NEON_HPP + +#include <algorithm> +#include <complex> +#include <tuple> +#include <type_traits> + +#include "../types/xsimd_neon_register.hpp" +#include "../types/xsimd_utils.hpp" + +// Wrap intrinsics so we can pass them as function pointers +// - OP: intrinsics name prefix, e.g., vorrq +// - RT: type traits to deduce intrinsics return types +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ + { \ + return ::OP##_u8(a, b); \ + } \ + inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \ + { \ + return ::OP##_s8(a, b); \ + } \ + inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ + { \ + return ::OP##_u16(a, b); \ + } \ + inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \ + { \ + return ::OP##_s16(a, b); \ + } \ + inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ + { \ + return ::OP##_u32(a, b); \ + } \ + inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \ + { \ + return ::OP##_s32(a, b); \ + } \ + } + +#define WRAP_BINARY_INT(OP, RT) \ + WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + inline RT<uint64x2_t> OP##_u64(uint64x2_t a, uint64x2_t b) noexcept \ + { \ + return ::OP##_u64(a, b); \ + } \ + inline RT<int64x2_t> OP##_s64(int64x2_t a, int64x2_t b) noexcept \ + { \ + return ::OP##_s64(a, b); \ + } \ + } + +#define WRAP_BINARY_FLOAT(OP, RT) \ + namespace wrap \ + { \ + inline RT<float32x4_t> OP##_f32(float32x4_t a, float32x4_t b) noexcept \ + { \ + return ::OP##_f32(a, b); \ + } \ + } + +#define WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap \ + { \ + inline uint8x16_t OP##_u8(uint8x16_t a) noexcept \ + { \ + return ::OP##_u8(a); \ + } \ + inline int8x16_t OP##_s8(int8x16_t a) noexcept \ + { \ + return ::OP##_s8(a); \ + } \ + inline uint16x8_t OP##_u16(uint16x8_t a) noexcept \ + { \ + return ::OP##_u16(a); \ + } \ + inline int16x8_t OP##_s16(int16x8_t a) noexcept \ + { \ + return ::OP##_s16(a); \ + } \ + inline uint32x4_t OP##_u32(uint32x4_t a) noexcept \ + { \ + return ::OP##_u32(a); \ + } \ + inline int32x4_t OP##_s32(int32x4_t a) noexcept \ + { \ + return ::OP##_s32(a); \ + } \ + } + +#define WRAP_UNARY_INT(OP) \ + WRAP_UNARY_INT_EXCLUDING_64(OP) \ + namespace wrap \ + { \ + inline uint64x2_t OP##_u64(uint64x2_t a) noexcept \ + { \ + return ::OP##_u64(a); \ + } \ + inline int64x2_t OP##_s64(int64x2_t a) noexcept \ + { \ + return ::OP##_s64(a); \ + } \ + } + +#define WRAP_UNARY_FLOAT(OP) \ + namespace wrap \ + { \ + inline float32x4_t OP##_f32(float32x4_t a) noexcept \ + { \ + return ::OP##_f32(a); \ + } \ + } + +// Dummy identity caster to ease coding +inline uint8x16_t vreinterpretq_u8_u8(uint8x16_t arg) noexcept { return arg; } +inline 
int8x16_t vreinterpretq_s8_s8(int8x16_t arg) noexcept { return arg; } +inline uint16x8_t vreinterpretq_u16_u16(uint16x8_t arg) noexcept { return arg; } +inline int16x8_t vreinterpretq_s16_s16(int16x8_t arg) noexcept { return arg; } +inline uint32x4_t vreinterpretq_u32_u32(uint32x4_t arg) noexcept { return arg; } +inline int32x4_t vreinterpretq_s32_s32(int32x4_t arg) noexcept { return arg; } +inline uint64x2_t vreinterpretq_u64_u64(uint64x2_t arg) noexcept { return arg; } +inline int64x2_t vreinterpretq_s64_s64(int64x2_t arg) noexcept { return arg; } +inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; } + +namespace xsimd +{ + template <class batch_type, bool... Values> + struct batch_bool_constant; + + namespace kernel + { + using namespace types; + + namespace detail + { + template <template <class> class return_type, class... T> + struct neon_dispatcher_base + { + struct unary + { + using container_type = std::tuple<return_type<T> (*)(T)...>; + const container_type m_func; + + template <class U> + return_type<U> apply(U rhs) const noexcept + { + using func_type = return_type<U> (*)(U); + auto func = xsimd::detail::get<func_type>(m_func); + return func(rhs); + } + }; + + struct binary + { + using container_type = std::tuple<return_type<T> (*)(T, T)...>; + const container_type m_func; + + template <class U> + return_type<U> apply(U lhs, U rhs) const noexcept + { + using func_type = return_type<U> (*)(U, U); + auto func = xsimd::detail::get<func_type>(m_func); + return func(lhs, rhs); + } + }; + }; + + /*************************** + * arithmetic dispatchers * + ***************************/ + + template <class T> + using identity_return_type = T; + + template <class... T> + struct neon_dispatcher_impl : neon_dispatcher_base<identity_return_type, T...> + { + }; + + using neon_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + uint64x2_t, int64x2_t, + float32x4_t>; + + using excluding_int64_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + float32x4_t>; + + /************************** + * comparison dispatchers * + **************************/ + + template <class T> + struct comp_return_type_impl; + + template <> + struct comp_return_type_impl<uint8x16_t> + { + using type = uint8x16_t; + }; + + template <> + struct comp_return_type_impl<int8x16_t> + { + using type = uint8x16_t; + }; + + template <> + struct comp_return_type_impl<uint16x8_t> + { + using type = uint16x8_t; + }; + + template <> + struct comp_return_type_impl<int16x8_t> + { + using type = uint16x8_t; + }; + + template <> + struct comp_return_type_impl<uint32x4_t> + { + using type = uint32x4_t; + }; + + template <> + struct comp_return_type_impl<int32x4_t> + { + using type = uint32x4_t; + }; + + template <> + struct comp_return_type_impl<uint64x2_t> + { + using type = uint64x2_t; + }; + + template <> + struct comp_return_type_impl<int64x2_t> + { + using type = uint64x2_t; + }; + + template <> + struct comp_return_type_impl<float32x4_t> + { + using type = uint32x4_t; + }; + + template <class T> + using comp_return_type = typename comp_return_type_impl<T>::type; + + template <class... 
T> + struct neon_comp_dispatcher_impl : neon_dispatcher_base<comp_return_type, T...> + { + }; + + using excluding_int64_comp_dispatcher = neon_comp_dispatcher_impl<uint8x16_t, int8x16_t, + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + float32x4_t>; + + /************************************** + * enabling / disabling metafunctions * + **************************************/ + + template <class T> + using enable_neon_type_t = typename std::enable_if<std::is_integral<T>::value || std::is_same<T, float>::value, + int>::type; + + template <class T> + using exclude_int64_neon_t + = typename std::enable_if<(std::is_integral<T>::value && sizeof(T) != 8) || std::is_same<T, float>::value, int>::type; + } + + /************* + * broadcast * + *************/ + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept + { + return vdupq_n_u8(uint8_t(val)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept + { + return vdupq_n_s8(int8_t(val)); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept + { + return vdupq_n_u16(uint16_t(val)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept + { + return vdupq_n_s16(int16_t(val)); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept + { + return vdupq_n_u32(uint32_t(val)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept + { + return vdupq_n_s32(int32_t(val)); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept + { + return vdupq_n_u64(uint64_t(val)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> broadcast(T val, requires_arch<neon>) noexcept + { + return vdupq_n_s64(int64_t(val)); + } + + template <class A> + inline batch<float, A> broadcast(float val, requires_arch<neon>) noexcept + { + return vdupq_n_f32(val); + } + + /******* + * set * + *******/ + + template <class A, class T, class... Args, detail::enable_integral_t<T> = 0> + inline batch<T, A> set(batch<T, A> const&, requires_arch<neon>, Args... args) noexcept + { + return xsimd::types::detail::neon_vector_type<T> { args... }; + } + + template <class A, class T, class... Args, detail::enable_integral_t<T> = 0> + inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<neon>, Args... args) noexcept + { + using register_type = typename batch_bool<T, A>::register_type; + using unsigned_type = as_unsigned_integer_t<T>; + return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... 
}; + } + + template <class A> + inline batch<float, A> set(batch<float, A> const&, requires_arch<neon>, float f0, float f1, float f2, float f3) noexcept + { + return float32x4_t { f0, f1, f2, f3 }; + } + + template <class A> + inline batch<std::complex<float>, A> set(batch<std::complex<float>, A> const&, requires_arch<neon>, + std::complex<float> c0, std::complex<float> c1, + std::complex<float> c2, std::complex<float> c3) noexcept + { + return batch<std::complex<float>>(float32x4_t { c0.real(), c1.real(), c2.real(), c3.real() }, + float32x4_t { c0.imag(), c1.imag(), c2.imag(), c3.imag() }); + } + + template <class A, class... Args> + inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<neon>, Args... args) noexcept + { + using register_type = typename batch_bool<float, A>::register_type; + using unsigned_type = as_unsigned_integer_t<float>; + return register_type { static_cast<unsigned_type>(args ? -1LL : 0LL)... }; + } + + /************* + * from_bool * + *************/ + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return vandq_u8(arg, vdupq_n_u8(1)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return vandq_s8(reinterpret_cast<int8x16_t>(arg.data), vdupq_n_s8(1)); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return vandq_u16(arg, vdupq_n_u16(1)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return vandq_s16(reinterpret_cast<int16x8_t>(arg.data), vdupq_n_s16(1)); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return vandq_u32(arg, vdupq_n_u32(1)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return vandq_s32(reinterpret_cast<int32x4_t>(arg.data), vdupq_n_s32(1)); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return vandq_u64(arg, vdupq_n_u64(1)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> from_bool(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return vandq_s64(reinterpret_cast<int64x2_t>(arg.data), vdupq_n_s64(1)); + } + + template <class A> + inline batch<float, A> from_bool(batch_bool<float, A> const& arg, requires_arch<neon>) noexcept + { + return vreinterpretq_f32_u32(vandq_u32(arg, vreinterpretq_u32_f32(vdupq_n_f32(1.f)))); + } + + /******** + * load * + ********/ + + // It is not possible to use a call to A::alignment() here, so use an + // immediate instead. 
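+ // For reference, with the GCC/Clang branch below, a call such as + // xsimd_aligned_load(vld1q_u8, uint8_t*, src) expands to + // vld1q_u8((uint8_t*)__builtin_assume_aligned(src, 16)), letting the + // compiler assume 16-byte alignment; the MSVC branch emits + // vld1q_u8_ex((uint8_t*)src, 128), with the alignment hint given in bits.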
+#if defined(__clang__) || defined(__GNUC__) +#define xsimd_aligned_load(inst, type, expr) inst((type)__builtin_assume_aligned(expr, 16)) +#elif defined(_MSC_VER) +#define xsimd_aligned_load(inst, type, expr) inst##_ex((type)expr, 128) +#else +#define xsimd_aligned_load(inst, type, expr) inst((type)expr) +#endif + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return xsimd_aligned_load(vld1q_u8, uint8_t*, src); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return xsimd_aligned_load(vld1q_s8, int8_t*, src); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return xsimd_aligned_load(vld1q_u16, uint16_t*, src); + } + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return xsimd_aligned_load(vld1q_s16, int16_t*, src); + } + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return xsimd_aligned_load(vld1q_u32, uint32_t*, src); + } + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return xsimd_aligned_load(vld1q_s32, int32_t*, src); + } + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return xsimd_aligned_load(vld1q_u64, uint64_t*, src); + } + template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> load_aligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return xsimd_aligned_load(vld1q_s64, int64_t*, src); + } + + template <class A> + inline batch<float, A> load_aligned(float const* src, convert<float>, requires_arch<neon>) noexcept + { + return xsimd_aligned_load(vld1q_f32, float*, src); + } + +#undef xsimd_aligned_load + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return vld1q_u8((uint8_t*)src); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return vld1q_s8((int8_t*)src); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return vld1q_u16((uint16_t*)src); + } + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return vld1q_s16((int16_t*)src); + } + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return vld1q_u32((uint32_t*)src); + } + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return 
vld1q_s32((int32_t*)src); + } + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return vld1q_u64((uint64_t*)src); + } + template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> load_unaligned(T const* src, convert<T>, requires_arch<neon>) noexcept + { + return vld1q_s64((int64_t*)src); + } + + template <class A> + inline batch<float, A> load_unaligned(float const* src, convert<float>, requires_arch<neon>) noexcept + { + return vld1q_f32(src); + } + + /********* + * store * + *********/ + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept + { + vst1q_u8((uint8_t*)dst, src); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept + { + vst1q_s8((int8_t*)dst, src); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept + { + vst1q_u16((uint16_t*)dst, src); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept + { + vst1q_s16((int16_t*)dst, src); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept + { + vst1q_u32((uint32_t*)dst, src); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept + { + vst1q_s32((int32_t*)dst, src); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> + inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept + { + vst1q_u64((uint64_t*)dst, src); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> + inline void store_aligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept + { + vst1q_s64((int64_t*)dst, src); + } + + template <class A> + inline void store_aligned(float* dst, batch<float, A> const& src, requires_arch<neon>) noexcept + { + vst1q_f32(dst, src); + } + + template <class A, class T> + inline void store_unaligned(T* dst, batch<T, A> const& src, requires_arch<neon>) noexcept + { + store_aligned<A>(dst, src, A {}); + } + + /**************** + * load_complex * + ****************/ + + template <class A> + inline batch<std::complex<float>, A> load_complex_aligned(std::complex<float> const* mem, convert<std::complex<float>>, requires_arch<neon>) noexcept + { + using real_batch = batch<float, A>; + const float* buf = reinterpret_cast<const float*>(mem); + float32x4x2_t tmp = vld2q_f32(buf); + real_batch real = tmp.val[0], + imag = tmp.val[1]; + return batch<std::complex<float>, A> { real, imag }; + } + + template <class A> + inline batch<std::complex<float>, A> load_complex_unaligned(std::complex<float> const* mem, convert<std::complex<float>> cvt, requires_arch<neon>) noexcept + { + return load_complex_aligned<A>(mem, cvt, A {}); + } + + /***************** + * store_complex * + *****************/ + + template <class A> + inline void store_complex_aligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept + { 
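+ // Note: vst2q_f32 below stores the two registers interleaved in memory + // (real0, imag0, real1, imag1, ...), the inverse of the de-interleaving + // vld2q_f32 load used in load_complex_aligned above.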
+ float32x4x2_t tmp; + tmp.val[0] = src.real(); + tmp.val[1] = src.imag(); + float* buf = reinterpret_cast<float*>(dst); + vst2q_f32(buf, tmp); + } + + template <class A> + inline void store_complex_unaligned(std::complex<float>* dst, batch<std::complex<float>, A> const& src, requires_arch<neon>) noexcept + { + store_complex_aligned(dst, src, A {}); + } + + /******* + * neg * + *******/ + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vreinterpretq_u8_s8(vnegq_s8(vreinterpretq_s8_u8(rhs))); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vnegq_s8(rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vreinterpretq_u16_s16(vnegq_s16(vreinterpretq_s16_u16(rhs))); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vnegq_s16(rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vreinterpretq_u32_s32(vnegq_s32(vreinterpretq_s32_u32(rhs))); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vnegq_s32(rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return batch<T, A> { -rhs.get(0), -rhs.get(1) }; + } + + template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> neg(batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return batch<T, A> { -rhs.get(0), -rhs.get(1) }; + } + + template <class A> + inline batch<float, A> neg(batch<float, A> const& rhs, requires_arch<neon>) noexcept + { + return vnegq_f32(rhs); + } + + /******* + * add * + *******/ + + WRAP_BINARY_INT(vaddq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vaddq, detail::identity_return_type) + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> add(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::neon_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vaddq_u8, wrap::vaddq_s8, wrap::vaddq_u16, wrap::vaddq_s16, + wrap::vaddq_u32, wrap::vaddq_s32, wrap::vaddq_u64, wrap::vaddq_s64, + wrap::vaddq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + /******** + * sadd * + ********/ + + WRAP_BINARY_INT(vqaddq, detail::identity_return_type) + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> sadd(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::neon_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vqaddq_u8, wrap::vqaddq_s8, wrap::vqaddq_u16, wrap::vqaddq_s16, + wrap::vqaddq_u32, wrap::vqaddq_s32, wrap::vqaddq_u64, wrap::vqaddq_s64, + wrap::vaddq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + /******* + * 
sub * + *******/ + + WRAP_BINARY_INT(vsubq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vsubq, detail::identity_return_type) + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> sub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::neon_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vsubq_u8, wrap::vsubq_s8, wrap::vsubq_u16, wrap::vsubq_s16, + wrap::vsubq_u32, wrap::vsubq_s32, wrap::vsubq_u64, wrap::vsubq_s64, + wrap::vsubq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + /******** + * ssub * + ********/ + + WRAP_BINARY_INT(vqsubq, detail::identity_return_type) + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> ssub(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::neon_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vqsubq_u8, wrap::vqsubq_s8, wrap::vqsubq_u16, wrap::vqsubq_s16, + wrap::vqsubq_u32, wrap::vqsubq_s32, wrap::vqsubq_u64, wrap::vqsubq_s64, + wrap::vsubq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + /******* + * mul * + *******/ + + WRAP_BINARY_INT_EXCLUDING_64(vmulq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vmulq, detail::identity_return_type) + + template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch<T, A> mul(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::excluding_int64_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vmulq_u8, wrap::vmulq_s8, wrap::vmulq_u16, wrap::vmulq_s16, + wrap::vmulq_u32, wrap::vmulq_s32, wrap::vmulq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + /******* + * div * + *******/ + +#if defined(XSIMD_FAST_INTEGER_DIVISION) + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vcvtq_s32_f32(vcvtq_f32_s32(lhs) / vcvtq_f32_s32(rhs)); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> div(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vcvtq_u32_f32(vcvtq_f32_u32(lhs) / vcvtq_f32_u32(rhs)); + } +#endif + + template <class A> + inline batch<float, A> div(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept + { + // from stackoverflow & https://projectne10.github.io/Ne10/doc/NE10__divc_8neon_8c_source.html + // get an initial estimate of 1/b. + float32x4_t rcp = reciprocal(rhs); + + // use a couple Newton-Raphson steps to refine the estimate. Depending on your + // application's accuracy requirements, you may be able to get away with only + // one refinement (instead of the two used here). Be sure to test! 
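+ // (vrecpsq_f32(rhs, rcp) computes 2 - rhs*rcp, so each step below is one + // Newton-Raphson iteration for 1/rhs: rcp <- rcp * (2 - rhs*rcp).)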
+ rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp); + rcp = vmulq_f32(vrecpsq_f32(rhs, rcp), rcp); + + // and finally, compute a / b = a * (1 / b) + return vmulq_f32(lhs, rcp); + } + + /****** + * eq * + ******/ + + WRAP_BINARY_INT_EXCLUDING_64(vceqq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vceqq, detail::comp_return_type) + + template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vceqq_u8, wrap::vceqq_s8, wrap::vceqq_u16, wrap::vceqq_s16, + wrap::vceqq_u32, wrap::vceqq_s32, wrap::vceqq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch_bool<T, A>::register_type; + using dispatcher_type = detail::neon_comp_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary; + const dispatcher_type dispatcher = { + std::make_tuple(wrap::vceqq_u8, wrap::vceqq_u16, wrap::vceqq_u32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> + inline batch_bool<T, A> eq(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) }); + } + + template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> + inline batch_bool<T, A> eq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept + { + return batch_bool<T, A>({ lhs.get(0) == rhs.get(0), lhs.get(1) == rhs.get(1) }); + } + + /************* + * fast_cast * + *************/ + + namespace detail + { + template <class A> + inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept + { + return vcvtq_f32_s32(self); + } + + template <class A> + inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<neon>) noexcept + { + return vcvtq_f32_u32(self); + } + + template <class A> + inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<neon>) noexcept + { + return vcvtq_s32_f32(self); + } + + template <class A> + inline batch<uint32_t, A> fast_cast(batch<float, A> const& self, batch<uint32_t, A> const&, requires_arch<neon>) noexcept + { + return vcvtq_u32_f32(self); + } + + } + + /****** + * lt * + ******/ + + WRAP_BINARY_INT_EXCLUDING_64(vcltq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vcltq, detail::comp_return_type) + + template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch_bool<T, A> lt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vcltq_u8, wrap::vcltq_s8, wrap::vcltq_u16, wrap::vcltq_s16, + wrap::vcltq_u32, wrap::vcltq_s32, wrap::vcltq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> + inline batch_bool<T, A> lt(batch<T, A> 
const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return batch_bool<T, A>({ lhs.get(0) < rhs.get(0), lhs.get(1) < rhs.get(1) }); + } + + /****** + * le * + ******/ + + WRAP_BINARY_INT_EXCLUDING_64(vcleq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vcleq, detail::comp_return_type) + + template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vcleq_u8, wrap::vcleq_s8, wrap::vcleq_u16, wrap::vcleq_s16, + wrap::vcleq_u32, wrap::vcleq_s32, wrap::vcleq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> + inline batch_bool<T, A> le(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return batch_bool<T, A>({ lhs.get(0) <= rhs.get(0), lhs.get(1) <= rhs.get(1) }); + } + + /****** + * gt * + ******/ + + WRAP_BINARY_INT_EXCLUDING_64(vcgtq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vcgtq, detail::comp_return_type) + + template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vcgtq_u8, wrap::vcgtq_s8, wrap::vcgtq_u16, wrap::vcgtq_s16, + wrap::vcgtq_u32, wrap::vcgtq_s32, wrap::vcgtq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> + inline batch_bool<T, A> gt(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return batch_bool<T, A>({ lhs.get(0) > rhs.get(0), lhs.get(1) > rhs.get(1) }); + } + + /****** + * ge * + ******/ + + WRAP_BINARY_INT_EXCLUDING_64(vcgeq, detail::comp_return_type) + WRAP_BINARY_FLOAT(vcgeq, detail::comp_return_type) + + template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::excluding_int64_comp_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vcgeq_u8, wrap::vcgeq_s8, wrap::vcgeq_u16, wrap::vcgeq_s16, + wrap::vcgeq_u32, wrap::vcgeq_s32, wrap::vcgeq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> + inline batch_bool<T, A> ge(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return batch_bool<T, A>({ lhs.get(0) >= rhs.get(0), lhs.get(1) >= rhs.get(1) }); + } + + /******************* + * batch_bool_cast * + *******************/ + + template <class A, class T_out, class T_in> + inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<neon>) noexcept + { + using register_type = typename batch_bool<T_out, A>::register_type; + return register_type(self); + } + + /*************** + * bitwise_and * + ***************/ + + WRAP_BINARY_INT(vandq, detail::identity_return_type) + + namespace detail + { + inline float32x4_t bitwise_and_f32(float32x4_t lhs, 
float32x4_t rhs) noexcept + { + return vreinterpretq_f32_u32(vandq_u32(vreinterpretq_u32_f32(lhs), + vreinterpretq_u32_f32(rhs))); + } + + template <class V> + V bitwise_and_neon(V const& lhs, V const& rhs) + { + const neon_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vandq_u8, wrap::vandq_s8, wrap::vandq_u16, wrap::vandq_s16, + wrap::vandq_u32, wrap::vandq_s32, wrap::vandq_u64, wrap::vandq_s64, + bitwise_and_f32) + }; + return dispatcher.apply(lhs, rhs); + } + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> bitwise_and(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch_bool<T, A>::register_type; + return detail::bitwise_and_neon(register_type(lhs), register_type(rhs)); + } + + /************** + * bitwise_or * + **************/ + + WRAP_BINARY_INT(vorrq, detail::identity_return_type) + + namespace detail + { + inline float32x4_t bitwise_or_f32(float32x4_t lhs, float32x4_t rhs) noexcept + { + return vreinterpretq_f32_u32(vorrq_u32(vreinterpretq_u32_f32(lhs), + vreinterpretq_u32_f32(rhs))); + } + + template <class V> + inline V bitwise_or_neon(V const& lhs, V const& rhs) noexcept + { + const neon_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vorrq_u8, wrap::vorrq_s8, wrap::vorrq_u16, wrap::vorrq_s16, + wrap::vorrq_u32, wrap::vorrq_s32, wrap::vorrq_u64, wrap::vorrq_s64, + bitwise_or_f32) + }; + return dispatcher.apply(lhs, rhs); + } + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> bitwise_or(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch_bool<T, A>::register_type; + return detail::bitwise_or_neon(register_type(lhs), register_type(rhs)); + } + + /*************** + * bitwise_xor * + ***************/ + + WRAP_BINARY_INT(veorq, detail::identity_return_type) + + namespace detail + { + inline float32x4_t bitwise_xor_f32(float32x4_t lhs, float32x4_t rhs) noexcept + { + return vreinterpretq_f32_u32(veorq_u32(vreinterpretq_u32_f32(lhs), + vreinterpretq_u32_f32(rhs))); + } + + template <class V> + inline V bitwise_xor_neon(V const& lhs, V const& rhs) noexcept + { + const neon_dispatcher::binary dispatcher = { + std::make_tuple(wrap::veorq_u8, wrap::veorq_s8, wrap::veorq_u16, wrap::veorq_s16, + wrap::veorq_u32, wrap::veorq_s32, wrap::veorq_u64, wrap::veorq_s64, + bitwise_xor_f32) + }; + return dispatcher.apply(lhs, rhs); + } + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> bitwise_xor(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, 
detail::enable_neon_type_t<T> = 0> + inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch_bool<T, A>::register_type; + return detail::bitwise_xor_neon(register_type(lhs), register_type(rhs)); + } + + /******* + * neq * + *******/ + + template <class A, class T> + inline batch_bool<T, A> neq(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept + { + return bitwise_xor(lhs, rhs, A {}); + } + + /*************** + * bitwise_not * + ***************/ + + WRAP_UNARY_INT_EXCLUDING_64(vmvnq) + + namespace detail + { + inline int64x2_t bitwise_not_s64(int64x2_t arg) noexcept + { + return vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_s64(arg))); + } + + inline uint64x2_t bitwise_not_u64(uint64x2_t arg) noexcept + { + return vreinterpretq_u64_u32(vmvnq_u32(vreinterpretq_u32_u64(arg))); + } + + inline float32x4_t bitwise_not_f32(float32x4_t arg) noexcept + { + return vreinterpretq_f32_u32(vmvnq_u32(vreinterpretq_u32_f32(arg))); + } + + template <class V> + inline V bitwise_not_neon(V const& arg) noexcept + { + const neon_dispatcher::unary dispatcher = { + std::make_tuple(wrap::vmvnq_u8, wrap::vmvnq_s8, wrap::vmvnq_u16, wrap::vmvnq_s16, + wrap::vmvnq_u32, wrap::vmvnq_s32, + bitwise_not_u64, bitwise_not_s64, + bitwise_not_f32) + }; + return dispatcher.apply(arg); + } + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> bitwise_not(batch<T, A> const& arg, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + return detail::bitwise_not_neon(register_type(arg)); + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + using register_type = typename batch_bool<T, A>::register_type; + return detail::bitwise_not_neon(register_type(arg)); + } + + /****************** + * bitwise_andnot * + ******************/ + + WRAP_BINARY_INT(vbicq, detail::identity_return_type) + + namespace detail + { + inline float32x4_t bitwise_andnot_f32(float32x4_t lhs, float32x4_t rhs) noexcept + { + return vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(lhs), vreinterpretq_u32_f32(rhs))); + } + + template <class V> + inline V bitwise_andnot_neon(V const& lhs, V const& rhs) noexcept + { + const detail::neon_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vbicq_u8, wrap::vbicq_s8, wrap::vbicq_u16, wrap::vbicq_s16, + wrap::vbicq_u32, wrap::vbicq_s32, wrap::vbicq_u64, wrap::vbicq_s64, + bitwise_andnot_f32) + }; + return dispatcher.apply(lhs, rhs); + } + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> bitwise_andnot(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& lhs, batch_bool<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch_bool<T, A>::register_type; + return detail::bitwise_andnot_neon(register_type(lhs), register_type(rhs)); + } + + /******* + * min * + *******/ + + WRAP_BINARY_INT_EXCLUDING_64(vminq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vminq, detail::identity_return_type) + 
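All of these binary kernels share one dispatch trick: the dispatcher holds one wrapper per NEON register type in a std::tuple, and apply() selects the tuple element whose function-pointer type matches the deduced register type. A minimal standalone sketch of the pattern, with C++14 std::get-by-type standing in for xsimd::detail::get and plain scalars standing in for NEON registers (hypothetical code, not part of this file):

    #include <tuple>

    template <class... Ts>
    struct binary_dispatcher
    {
        // one function pointer per supported type
        std::tuple<Ts (*)(Ts, Ts)...> m_func;

        template <class U>
        U apply(U lhs, U rhs) const
        {
            // std::get by type selects the unique element of type U (*)(U, U)
            return std::get<U (*)(U, U)>(m_func)(lhs, rhs);
        }
    };

    // binary_dispatcher<int, float> d { std::make_tuple(
    //     +[](int a, int b) { return a + b; },
    //     +[](float a, float b) { return a + b; }) };
    // d.apply(1, 2) calls the int entry; d.apply(1.f, 2.f) the float entry.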
+ template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::excluding_int64_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vminq_u8, wrap::vminq_s8, wrap::vminq_u16, wrap::vminq_s16, + wrap::vminq_u32, wrap::vminq_s32, wrap::vminq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> + inline batch<T, A> min(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return { std::min(lhs.get(0), rhs.get(0)), std::min(lhs.get(1), rhs.get(1)) }; + } + + /******* + * max * + *******/ + + WRAP_BINARY_INT_EXCLUDING_64(vmaxq, detail::identity_return_type) + WRAP_BINARY_FLOAT(vmaxq, detail::identity_return_type) + + template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::excluding_int64_dispatcher::binary dispatcher = { + std::make_tuple(wrap::vmaxq_u8, wrap::vmaxq_s8, wrap::vmaxq_u16, wrap::vmaxq_s16, + wrap::vmaxq_u32, wrap::vmaxq_s32, wrap::vmaxq_f32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> + inline batch<T, A> max(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return { std::max(lhs.get(0), rhs.get(0)), std::max(lhs.get(1), rhs.get(1)) }; + } + + /******* + * abs * + *******/ + + namespace wrap + { + inline int8x16_t vabsq_s8(int8x16_t a) noexcept { return ::vabsq_s8(a); } + inline int16x8_t vabsq_s16(int16x8_t a) noexcept { return ::vabsq_s16(a); } + inline int32x4_t vabsq_s32(int32x4_t a) noexcept { return ::vabsq_s32(a); } + } + WRAP_UNARY_FLOAT(vabsq) + + namespace detail + { + inline uint8x16_t abs_u8(uint8x16_t arg) noexcept + { + return arg; + } + + inline uint16x8_t abs_u16(uint16x8_t arg) noexcept + { + return arg; + } + + inline uint32x4_t abs_u32(uint32x4_t arg) noexcept + { + return arg; + } + } + + template <class A, class T, detail::exclude_int64_neon_t<T> = 0> + inline batch<T, A> abs(batch<T, A> const& arg, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::excluding_int64_dispatcher::unary dispatcher = { + std::make_tuple(detail::abs_u8, wrap::vabsq_s8, detail::abs_u16, wrap::vabsq_s16, + detail::abs_u32, wrap::vabsq_s32, wrap::vabsq_f32) + }; + return dispatcher.apply(register_type(arg)); + } + + /******** + * rsqrt * + ********/ + + template <class A> + inline batch<float, A> rsqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept + { + return vrsqrteq_f32(arg); + } + + /******** + * sqrt * + ********/ + + template <class A> + inline batch<float, A> sqrt(batch<float, A> const& arg, requires_arch<neon>) noexcept + { + batch<float, A> sqrt_reciprocal = vrsqrteq_f32(arg); + // one iter + sqrt_reciprocal = sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal)); + batch<float, A> sqrt_approx = arg * sqrt_reciprocal * batch<float, A>(vrsqrtsq_f32(arg * sqrt_reciprocal, sqrt_reciprocal)); + batch<float, A> zero(0.f); + return select(arg == zero, zero, sqrt_approx); + } + + /******************** + * Fused operations * + 
********************/ + +#ifdef __ARM_FEATURE_FMA + template <class A> + inline batch<float, A> fma(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept + { + return vfmaq_f32(z, x, y); + } + + template <class A> + inline batch<float, A> fms(batch<float, A> const& x, batch<float, A> const& y, batch<float, A> const& z, requires_arch<neon>) noexcept + { + return vfmaq_f32(-z, x, y); + } +#endif + + /********* + * haddp * + *********/ + + template <class A> + inline batch<float, A> haddp(const batch<float, A>* row, requires_arch<neon>) noexcept + { + // row = (a,b,c,d) + float32x2_t tmp1, tmp2, tmp3; + // tmp1 = (a0 + a2, a1 + a3) + tmp1 = vpadd_f32(vget_low_f32(row[0]), vget_high_f32(row[0])); + // tmp2 = (b0 + b2, b1 + b3) + tmp2 = vpadd_f32(vget_low_f32(row[1]), vget_high_f32(row[1])); + // tmp1 = (a0..3, b0..3) + tmp1 = vpadd_f32(tmp1, tmp2); + // tmp2 = (c0 + c2, c1 + c3) + tmp2 = vpadd_f32(vget_low_f32(row[2]), vget_high_f32(row[2])); + // tmp3 = (d0 + d2, d1 + d3) + tmp3 = vpadd_f32(vget_low_f32(row[3]), vget_high_f32(row[3])); + // tmp2 = (c0..3, d0..3) + tmp2 = vpadd_f32(tmp2, tmp3); + // return = (a0..3, b0..3, c0..3, d0..3) + return vcombine_f32(tmp1, tmp2); + } + + /************** + * reciprocal * + **************/ + + template <class A> + inline batch<float, A> + reciprocal(const batch<float, A>& x, + kernel::requires_arch<neon>) noexcept + { + return vrecpeq_f32(x); + } + + /********** + * insert * + **********/ + + template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept + { + return vsetq_lane_u8(val, self, I); + } + + template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept + { + return vsetq_lane_s8(val, self, I); + } + + template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept + { + return vsetq_lane_u16(val, self, I); + } + + template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept + { + return vsetq_lane_s16(val, self, I); + } + + template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept + { + return vsetq_lane_u32(val, self, I); + } + + template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept + { + return vsetq_lane_s32(val, self, I); + } + + template <class A, class T, size_t I, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept + { + return vsetq_lane_u64(val, self, I); + } + + template <class A, class T, size_t I, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<neon>) noexcept + { + return vsetq_lane_s64(val, self, I); + } + + template <class A, size_t I> + inline batch<float, A> insert(batch<float, A> const& self, float val, index<I>, requires_arch<neon>) noexcept + { + return vsetq_lane_f32(val, self, I); 
+ } + + /******************** + * nearbyint_as_int * + *******************/ + + template <class A> + inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self, + requires_arch<neon>) noexcept + { + /* origin: https://github.com/DLTcollab/sse2neon/blob/cad518a93b326f0f644b7972d488d04eaa2b0475/sse2neon.h#L4028-L4047 */ + // Contributors to this work are: + // John W. Ratcliff <jratcliffscarab@gmail.com> + // Brandon Rowlett <browlett@nvidia.com> + // Ken Fast <kfast@gdeb.com> + // Eric van Beurden <evanbeurden@nvidia.com> + // Alexander Potylitsin <apotylitsin@nvidia.com> + // Hasindu Gamaarachchi <hasindu2008@gmail.com> + // Jim Huang <jserv@biilabs.io> + // Mark Cheng <marktwtn@biilabs.io> + // Malcolm James MacLeod <malcolm@gulden.com> + // Devin Hussey (easyaspi314) <husseydevin@gmail.com> + // Sebastian Pop <spop@amazon.com> + // Developer Ecosystem Engineering <DeveloperEcosystemEngineering@apple.com> + // Danila Kutenin <danilak@google.com> + // François Turban (JishinMaster) <francois.turban@gmail.com> + // Pei-Hsuan Hung <afcidk@gmail.com> + // Yang-Hao Yuan <yanghau@biilabs.io> + // Syoyo Fujita <syoyo@lighttransport.com> + // Brecht Van Lommel <brecht@blender.org> + + /* + * sse2neon is freely redistributable under the MIT License. + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + const auto signmask = vdupq_n_u32(0x80000000); + const auto half = vbslq_f32(signmask, self, + vdupq_n_f32(0.5f)); /* +/- 0.5 */ + const auto r_normal = vcvtq_s32_f32(vaddq_f32( + self, half)); /* round to integer: [a + 0.5]*/ + const auto r_trunc = vcvtq_s32_f32(self); /* truncate to integer: [a] */ + const auto plusone = vreinterpretq_s32_u32(vshrq_n_u32( + vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ + const auto r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), + vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ + const auto delta = vsubq_f32( + self, + vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ + const auto is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ + return vbslq_s32(is_delta_half, r_even, r_normal); + } + + /************** + * reduce_add * + **************/ + + namespace detail + { + template <class T, class A, class V> + inline T sum_batch(V const& arg) noexcept + { + T res = T(0); + for (std::size_t i = 0; i < batch<T, A>::size; ++i) + { + res += arg[i]; + } + return res; + } + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept + { + uint8x8_t tmp = vpadd_u8(vget_low_u8(arg), vget_high_u8(arg)); + tmp = vpadd_u8(tmp, tmp); + tmp = vpadd_u8(tmp, tmp); + tmp = vpadd_u8(tmp, tmp); + return vget_lane_u8(tmp, 0); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept + { + int8x8_t tmp = vpadd_s8(vget_low_s8(arg), vget_high_s8(arg)); + tmp = vpadd_s8(tmp, tmp); + tmp = vpadd_s8(tmp, tmp); + tmp = vpadd_s8(tmp, tmp); + return vget_lane_s8(tmp, 0); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept + { + uint16x4_t tmp = vpadd_u16(vget_low_u16(arg), vget_high_u16(arg)); + tmp = vpadd_u16(tmp, tmp); + tmp = vpadd_u16(tmp, tmp); + return vget_lane_u16(tmp, 0); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept + { + int16x4_t tmp = vpadd_s16(vget_low_s16(arg), vget_high_s16(arg)); + tmp = vpadd_s16(tmp, tmp); + tmp = vpadd_s16(tmp, tmp); + return vget_lane_s16(tmp, 0); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept + { + uint32x2_t tmp = vpadd_u32(vget_low_u32(arg), vget_high_u32(arg)); + tmp = vpadd_u32(tmp, tmp); + return vget_lane_u32(tmp, 0); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept + { + int32x2_t tmp = vpadd_s32(vget_low_s32(arg), vget_high_s32(arg)); + tmp = vpadd_s32(tmp, tmp); + return vget_lane_s32(tmp, 0); + } + + template <class A, class T, detail::enable_sized_integral_t<T, 8> = 0> + inline typename batch<T, A>::value_type reduce_add(batch<T, A> const& arg, requires_arch<neon>) noexcept + { + return arg.get(0) + arg.get(1); + } + + template <class A> + inline float reduce_add(batch<float, A> const& arg, requires_arch<neon>) noexcept + { + float32x2_t tmp = vpadd_f32(vget_low_f32(arg), vget_high_f32(arg)); + tmp = 
vpadd_f32(tmp, tmp); + return vget_lane_f32(tmp, 0); + } + + /************** + * reduce_max * + **************/ + + // Using generic implementation because ARM does not provide intrinsics + // for this operation + + /************** + * reduce_min * + **************/ + + // Using generic implementation because ARM does not provide intrinsics + // for this operation + + /********** + * select * + **********/ + + namespace wrap + { + inline uint8x16_t vbslq_u8(uint8x16_t a, uint8x16_t b, uint8x16_t c) noexcept { return ::vbslq_u8(a, b, c); } + inline int8x16_t vbslq_s8(uint8x16_t a, int8x16_t b, int8x16_t c) noexcept { return ::vbslq_s8(a, b, c); } + inline uint16x8_t vbslq_u16(uint16x8_t a, uint16x8_t b, uint16x8_t c) noexcept { return ::vbslq_u16(a, b, c); } + inline int16x8_t vbslq_s16(uint16x8_t a, int16x8_t b, int16x8_t c) noexcept { return ::vbslq_s16(a, b, c); } + inline uint32x4_t vbslq_u32(uint32x4_t a, uint32x4_t b, uint32x4_t c) noexcept { return ::vbslq_u32(a, b, c); } + inline int32x4_t vbslq_s32(uint32x4_t a, int32x4_t b, int32x4_t c) noexcept { return ::vbslq_s32(a, b, c); } + inline uint64x2_t vbslq_u64(uint64x2_t a, uint64x2_t b, uint64x2_t c) noexcept { return ::vbslq_u64(a, b, c); } + inline int64x2_t vbslq_s64(uint64x2_t a, int64x2_t b, int64x2_t c) noexcept { return ::vbslq_s64(a, b, c); } + inline float32x4_t vbslq_f32(uint32x4_t a, float32x4_t b, float32x4_t c) noexcept { return ::vbslq_f32(a, b, c); } + } + + namespace detail + { + template <class... T> + struct neon_select_dispatcher_impl + { + using container_type = std::tuple<T (*)(comp_return_type<T>, T, T)...>; + const container_type m_func; + + template <class U> + U apply(comp_return_type<U> cond, U lhs, U rhs) const noexcept + { + using func_type = U (*)(comp_return_type<U>, U, U); + auto func = xsimd::detail::get<func_type>(m_func); + return func(cond, lhs, rhs); + } + }; + + using neon_select_dispatcher = neon_select_dispatcher_impl<uint8x16_t, int8x16_t, + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + uint64x2_t, int64x2_t, + float32x4_t>; + } + + template <class A, class T, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& a, batch<T, A> const& b, requires_arch<neon>) noexcept + { + using bool_register_type = typename batch_bool<T, A>::register_type; + using register_type = typename batch<T, A>::register_type; + const detail::neon_select_dispatcher dispatcher = { + std::make_tuple(wrap::vbslq_u8, wrap::vbslq_s8, wrap::vbslq_u16, wrap::vbslq_s16, + wrap::vbslq_u32, wrap::vbslq_s32, wrap::vbslq_u64, wrap::vbslq_s64, + wrap::vbslq_f32) + }; + return dispatcher.apply(bool_register_type(cond), register_type(a), register_type(b)); + } + + template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0> + inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept + { + return select(batch_bool<T, A> { b... 
}, true_br, false_br, neon {}); + } + + /********** + * zip_lo * + **********/ + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + uint8x8x2_t tmp = vzip_u8(vget_low_u8(lhs), vget_low_u8(rhs)); + return vcombine_u8(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + int8x8x2_t tmp = vzip_s8(vget_low_s8(lhs), vget_low_s8(rhs)); + return vcombine_s8(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + uint16x4x2_t tmp = vzip_u16(vget_low_u16(lhs), vget_low_u16(rhs)); + return vcombine_u16(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + int16x4x2_t tmp = vzip_s16(vget_low_s16(lhs), vget_low_s16(rhs)); + return vcombine_s16(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + uint32x2x2_t tmp = vzip_u32(vget_low_u32(lhs), vget_low_u32(rhs)); + return vcombine_u32(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + int32x2x2_t tmp = vzip_s32(vget_low_s32(lhs), vget_low_s32(rhs)); + return vcombine_s32(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vcombine_u64(vget_low_u64(lhs), vget_low_u64(rhs)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vcombine_s64(vget_low_s64(lhs), vget_low_s64(rhs)); + } + + template <class A> + inline batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept + { + float32x2x2_t tmp = vzip_f32(vget_low_f32(lhs), vget_low_f32(rhs)); + return vcombine_f32(tmp.val[0], tmp.val[1]); + } + + /********** + * zip_hi * + **********/ + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + uint8x8x2_t tmp = vzip_u8(vget_high_u8(lhs), vget_high_u8(rhs)); + return vcombine_u8(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + int8x8x2_t tmp = vzip_s8(vget_high_s8(lhs), vget_high_s8(rhs)); + return vcombine_s8(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + uint16x4x2_t tmp = vzip_u16(vget_high_u16(lhs), vget_high_u16(rhs)); 
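+ // vzip interleaves two 64-bit halves; zip_lo pairs the low halves and
+ // zip_hi (here) the high ones. A hypothetical uint32 sketch:
+ //   lhs = {a0, a1, a2, a3}, rhs = {b0, b1, b2, b3}
+ //   zip_lo(lhs, rhs) -> {a0, b0, a1, b1}
+ //   zip_hi(lhs, rhs) -> {a2, b2, a3, b3}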
+ return vcombine_u16(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + int16x4x2_t tmp = vzip_s16(vget_high_s16(lhs), vget_high_s16(rhs)); + return vcombine_s16(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + uint32x2x2_t tmp = vzip_u32(vget_high_u32(lhs), vget_high_u32(rhs)); + return vcombine_u32(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + int32x2x2_t tmp = vzip_s32(vget_high_s32(lhs), vget_high_s32(rhs)); + return vcombine_s32(tmp.val[0], tmp.val[1]); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vcombine_u64(vget_high_u64(lhs), vget_high_u64(rhs)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vcombine_s64(vget_high_s64(lhs), vget_high_s64(rhs)); + } + + template <class A> + inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon>) noexcept + { + float32x2x2_t tmp = vzip_f32(vget_high_f32(lhs), vget_high_f32(rhs)); + return vcombine_f32(tmp.val[0], tmp.val[1]); + } + + /**************** + * extract_pair * + ****************/ + + namespace detail + { + template <class A, class T> + inline batch<T, A> extract_pair(batch<T, A> const&, batch<T, A> const& /*rhs*/, std::size_t, ::xsimd::detail::index_sequence<>) noexcept + { + assert(false && "extract_pair out of bounds"); + return batch<T, A> {}; + } + + template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vextq_u8(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + + template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vextq_s8(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + + template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vextq_u16(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + + template <class A, class T, size_t I, size_t... 
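+ // vextq(rhs, lhs, I) reads size lanes from the concatenation rhs:lhs
+ // starting at lane I, so extract_pair(lhs, rhs, n) yields the last
+ // (size - n) lanes of rhs followed by the first n lanes of lhs.
+ // A hypothetical uint32 sketch with n == 1:
+ //   lhs = {a0, a1, a2, a3}, rhs = {b0, b1, b2, b3}
+ //   vextq_u32(rhs, lhs, 1) -> {b1, b2, b3, a0}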
Is, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vextq_s16(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + + template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vextq_u32(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + + template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vextq_s32(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + + template <class A, class T, size_t I, size_t... Is, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vextq_u64(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + + template <class A, class T, size_t I, size_t... Is, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vextq_s64(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + + template <class A, size_t I, size_t... Is> + inline batch<float, A> extract_pair(batch<float, A> const& lhs, batch<float, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vextq_f32(rhs, lhs, I); + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + + template <class A, class T, size_t... Is> + inline batch<T, A> extract_pair_impl(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, ::xsimd::detail::index_sequence<0, Is...>) noexcept + { + if (n == 0) + { + return rhs; + } + else + { + return extract_pair(lhs, rhs, n, ::xsimd::detail::index_sequence<Is...>()); + } + } + } + + template <class A, class T> + inline batch<T, A> extract_pair(batch<T, A> const& lhs, batch<T, A> const& rhs, std::size_t n, requires_arch<neon>) noexcept + { + constexpr std::size_t size = batch<T, A>::size; + assert(n < size && "index in bounds"); + return detail::extract_pair_impl(lhs, rhs, n, ::xsimd::detail::make_index_sequence<size>()); + } + + /****************** + * bitwise_lshift * + ******************/ + + namespace detail + { + template <class A, class T> + inline batch<T, A> bitwise_lshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept + { + assert(false && "bitwise_lshift out of bounds"); + return batch<T, A> {}; + } + + template <class A, class T, int I, int... 
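+ // vshlq_n_* only accepts a compile-time immediate, so the runtime count
+ // n is dispatched through this int_sequence recursion: each level tests
+ // n against one candidate I and either emits vshlq_n with that constant
+ // or recurses on the remaining candidates. Hypothetically,
+ // bitwise_lshift(x, 3) on uint8 lanes unfolds to
+ //   if (3 == 1) ... else if (3 == 2) ... else if (3 == 3) return vshlq_n_u8(x, 3);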
Is, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshlq_n_u8(lhs, I); + } + else + { + return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshlq_n_s8(lhs, I); + } + else + { + return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshlq_n_u16(lhs, I); + } + else + { + return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshlq_n_s16(lhs, I); + } + else + { + return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshlq_n_u32(lhs, I); + } + else + { + return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshlq_n_s32(lhs, I); + } + else + { + return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshlq_n_u64(lhs, I); + } + else + { + return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshlq_n_s64(lhs, I); + } + else + { + return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int... 
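+ // bitwise_lshift_impl below peels off n == 0 (the identity) before
+ // entering the recursion. The batch-count overloads further below use
+ // vshlq_*, which shifts each lane by its own signed count; a
+ // hypothetical sketch:
+ //   lhs = {1, 1, 1, 1}, rhs = {0, 1, 2, 3}
+ //   bitwise_lshift(lhs, rhs) -> {1, 2, 4, 8}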
Is> + inline batch<T, A> bitwise_lshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept + { + if (n == 0) + { + return lhs; + } + else + { + return bitwise_lshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + } + + template <class A, class T> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept + { + constexpr int size = sizeof(typename batch<T, A>::value_type) * 8; + assert(0 <= n && n < size && "index in bounds"); + return detail::bitwise_lshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>()); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_u8(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_s8(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_u16(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_s16(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_u32(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_s32(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_u64(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> bitwise_lshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_s64(lhs, rhs); + } + + /****************** + * bitwise_rshift * + ******************/ + + namespace detail + { + template <class A, class T> + inline batch<T, A> bitwise_rshift(batch<T, A> const& /*lhs*/, int /*n*/, ::xsimd::detail::int_sequence<>) noexcept + { + assert(false && "bitwise_rshift out of bounds"); + return batch<T, A> {}; + } + + template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshrq_n_u8(lhs, I); + } + else + { + return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... 
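+ // The right-shift dispatch mirrors bitwise_lshift above. Note that
+ // vshrq_n_s* is an arithmetic shift (sign-filling) while vshrq_n_u* is
+ // logical; a hypothetical 8-bit sketch:
+ //   int8_t(-16) >> 2 -> -4  (vshrq_n_s8)
+ //   uint8_t(240) >> 2 -> 60 (vshrq_n_u8)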
Is, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshrq_n_s8(lhs, I); + } + else + { + return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshrq_n_u16(lhs, I); + } + else + { + return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshrq_n_s16(lhs, I); + } + else + { + return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshrq_n_u32(lhs, I); + } + else + { + return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshrq_n_s32(lhs, I); + } + else + { + return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_unsigned_t<T, 8> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshrq_n_u64(lhs, I); + } + else + { + return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int I, int... Is, detail::enable_sized_signed_t<T, 8> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<I, Is...>) noexcept + { + if (n == I) + { + return vshrq_n_s64(lhs, I); + } + else + { + return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + + template <class A, class T, int... 
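+ // NEON has no right-shift-by-register instruction, so the batch-count
+ // right shifts below negate the counts and reuse vshlq_* (a negative
+ // shift amount shifts right); a hypothetical uint32 sketch:
+ //   lhs = {16, 16, 16, 16}, rhs = {1, 2, 3, 4}
+ //   bitwise_rshift(lhs, rhs) == vshlq_u32(lhs, {-1, -2, -3, -4}) -> {8, 4, 2, 1}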
Is> + inline batch<T, A> bitwise_rshift_impl(batch<T, A> const& lhs, int n, ::xsimd::detail::int_sequence<0, Is...>) noexcept + { + if (n == 0) + { + return lhs; + } + else + { + return bitwise_rshift(lhs, n, ::xsimd::detail::int_sequence<Is...>()); + } + } + } + + template <class A, class T> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, int n, requires_arch<neon>) noexcept + { + constexpr int size = sizeof(typename batch<T, A>::value_type) * 8; + assert(0 <= n && n < size && "index in bounds"); + return detail::bitwise_rshift_impl(lhs, n, ::xsimd::detail::make_int_sequence<size>()); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_u8(lhs, vnegq_s8(rhs)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_s8(lhs, vnegq_s8(rhs)); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_u16(lhs, vnegq_s16(rhs)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_s16(lhs, vnegq_s16(rhs)); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<as_signed_integer_t<T>, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_u32(lhs, vnegq_s32(rhs)); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> bitwise_rshift(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + return vshlq_s32(lhs, vnegq_s32(rhs)); + } + + // Overloads of bitwise shifts accepting two batches of uint64/int64 are not available with ARMv7 + + /******* + * all * + *******/ + + template <class A, class T, detail::enable_sized_t<T, 8> = 0> + inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + uint64x1_t tmp = vand_u64(vget_low_u64(arg), vget_high_u64(arg)); + return vget_lane_u64(tmp, 0) == ~0ULL; + } + + template <class A, class T, detail::enable_sized_t<T, 1> = 0> + inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {}); + } + + template <class A, class T, detail::enable_sized_t<T, 2> = 0> + inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {}); + } + + template <class A, class T, detail::enable_sized_t<T, 4> = 0> + inline bool all(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return all(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {}); + } + + /******* + * any * + *******/ + + template <class A, class T, detail::enable_sized_t<T, 8> = 0> + inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + uint32x2_t tmp = vqmovn_u64(arg); + return vget_lane_u64(vreinterpret_u64_u32(tmp), 0) != 0; + } + + template <class A, class T, detail::enable_sized_t<T, 1> = 0> + inline bool any(batch_bool<T, A> const& 
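+ // all/any reduce through the 64-bit view: batch_bool lanes are all ones
+ // or all zeros, so reinterpreting, e.g., 16 x u8 as 2 x u64 keeps
+ // "every lane set" equivalent to "both u64 halves == ~0". For any,
+ // vqmovn_u64 saturates each 64-bit half to 32 bits, which is non-zero
+ // iff that half had any bit set; a hypothetical sketch:
+ //   mask(u8) = {0xFF, 0, ..., 0} -> u64 halves {0xFF, 0} -> any: true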
arg, requires_arch<neon>) noexcept + { + return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u8(arg)), neon {}); + } + + template <class A, class T, detail::enable_sized_t<T, 2> = 0> + inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u16(arg)), neon {}); + } + + template <class A, class T, detail::enable_sized_t<T, 4> = 0> + inline bool any(batch_bool<T, A> const& arg, requires_arch<neon>) noexcept + { + return any(batch_bool<uint64_t, A>(vreinterpretq_u64_u32(arg)), neon {}); + } + + /**************** + * bitwise_cast * + ****************/ + +#define WRAP_CAST(SUFFIX, TYPE) \ + namespace wrap \ + { \ + inline TYPE vreinterpretq_##SUFFIX##_u8(uint8x16_t a) noexcept \ + { \ + return ::vreinterpretq_##SUFFIX##_u8(a); \ + } \ + inline TYPE vreinterpretq_##SUFFIX##_s8(int8x16_t a) noexcept \ + { \ + return ::vreinterpretq_##SUFFIX##_s8(a); \ + } \ + inline TYPE vreinterpretq_##SUFFIX##_u16(uint16x8_t a) noexcept \ + { \ + return ::vreinterpretq_##SUFFIX##_u16(a); \ + } \ + inline TYPE vreinterpretq_##SUFFIX##_s16(int16x8_t a) noexcept \ + { \ + return ::vreinterpretq_##SUFFIX##_s16(a); \ + } \ + inline TYPE vreinterpretq_##SUFFIX##_u32(uint32x4_t a) noexcept \ + { \ + return ::vreinterpretq_##SUFFIX##_u32(a); \ + } \ + inline TYPE vreinterpretq_##SUFFIX##_s32(int32x4_t a) noexcept \ + { \ + return ::vreinterpretq_##SUFFIX##_s32(a); \ + } \ + inline TYPE vreinterpretq_##SUFFIX##_u64(uint64x2_t a) noexcept \ + { \ + return ::vreinterpretq_##SUFFIX##_u64(a); \ + } \ + inline TYPE vreinterpretq_##SUFFIX##_s64(int64x2_t a) noexcept \ + { \ + return ::vreinterpretq_##SUFFIX##_s64(a); \ + } \ + inline TYPE vreinterpretq_##SUFFIX##_f32(float32x4_t a) noexcept \ + { \ + return ::vreinterpretq_##SUFFIX##_f32(a); \ + } \ + } + + WRAP_CAST(u8, uint8x16_t) + WRAP_CAST(s8, int8x16_t) + WRAP_CAST(u16, uint16x8_t) + WRAP_CAST(s16, int16x8_t) + WRAP_CAST(u32, uint32x4_t) + WRAP_CAST(s32, int32x4_t) + WRAP_CAST(u64, uint64x2_t) + WRAP_CAST(s64, int64x2_t) + WRAP_CAST(f32, float32x4_t) + +#undef WRAP_CAST + + namespace detail + { + template <class R, class... T> + struct bitwise_caster_impl + { + using container_type = std::tuple<R (*)(T)...>; + container_type m_func; + + template <class U> + R apply(U rhs) const noexcept + { + using func_type = R (*)(U); + auto func = xsimd::detail::get<func_type>(m_func); + return func(rhs); + } + }; + + template <class R, class... T> + inline const bitwise_caster_impl<R, T...> make_bitwise_caster_impl(R (*... arg)(T)) noexcept + { + return { std::make_tuple(arg...) }; + } + + template <class... T> + struct type_list + { + }; + + template <class RTL, class TTL> + struct bitwise_caster; + + template <class... R, class... T> + struct bitwise_caster<type_list<R...>, type_list<T...>> + { + using container_type = std::tuple<bitwise_caster_impl<R, T...>...>; + container_type m_caster; + + template <class V, class U> + V apply(U rhs) const noexcept + { + using caster_type = bitwise_caster_impl<V, T...>; + auto caster = xsimd::detail::get<caster_type>(m_caster); + return caster.apply(rhs); + } + }; + + template <class... 
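+ // bitwise_cast dispatches through a 9 x 9 table of vreinterpretq_*
+ // function pointers: the outer tuple selects the destination register
+ // type and each bitwise_caster_impl selects the source. vreinterpretq
+ // only changes the static type; no instruction is emitted and the bits
+ // are reused as-is.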
T> + using bitwise_caster_t = bitwise_caster<type_list<T...>, type_list<T...>>; + + using neon_bitwise_caster = bitwise_caster_t<uint8x16_t, int8x16_t, + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t, + uint64x2_t, int64x2_t, + float32x4_t>; + } + + template <class A, class T, class R> + inline batch<R, A> bitwise_cast(batch<T, A> const& arg, batch<R, A> const&, requires_arch<neon>) noexcept + { + const detail::neon_bitwise_caster caster = { + std::make_tuple( + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u8_u8, wrap::vreinterpretq_u8_s8, wrap::vreinterpretq_u8_u16, wrap::vreinterpretq_u8_s16, + wrap::vreinterpretq_u8_u32, wrap::vreinterpretq_u8_s32, wrap::vreinterpretq_u8_u64, wrap::vreinterpretq_u8_s64, + wrap::vreinterpretq_u8_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s8_u8, wrap::vreinterpretq_s8_s8, wrap::vreinterpretq_s8_u16, wrap::vreinterpretq_s8_s16, + wrap::vreinterpretq_s8_u32, wrap::vreinterpretq_s8_s32, wrap::vreinterpretq_s8_u64, wrap::vreinterpretq_s8_s64, + wrap::vreinterpretq_s8_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u16_u8, wrap::vreinterpretq_u16_s8, wrap::vreinterpretq_u16_u16, wrap::vreinterpretq_u16_s16, + wrap::vreinterpretq_u16_u32, wrap::vreinterpretq_u16_s32, wrap::vreinterpretq_u16_u64, wrap::vreinterpretq_u16_s64, + wrap::vreinterpretq_u16_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s16_u8, wrap::vreinterpretq_s16_s8, wrap::vreinterpretq_s16_u16, wrap::vreinterpretq_s16_s16, + wrap::vreinterpretq_s16_u32, wrap::vreinterpretq_s16_s32, wrap::vreinterpretq_s16_u64, wrap::vreinterpretq_s16_s64, + wrap::vreinterpretq_s16_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u32_u8, wrap::vreinterpretq_u32_s8, wrap::vreinterpretq_u32_u16, wrap::vreinterpretq_u32_s16, + wrap::vreinterpretq_u32_u32, wrap::vreinterpretq_u32_s32, wrap::vreinterpretq_u32_u64, wrap::vreinterpretq_u32_s64, + wrap::vreinterpretq_u32_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s32_u8, wrap::vreinterpretq_s32_s8, wrap::vreinterpretq_s32_u16, wrap::vreinterpretq_s32_s16, + wrap::vreinterpretq_s32_u32, wrap::vreinterpretq_s32_s32, wrap::vreinterpretq_s32_u64, wrap::vreinterpretq_s32_s64, + wrap::vreinterpretq_s32_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_u64_u8, wrap::vreinterpretq_u64_s8, wrap::vreinterpretq_u64_u16, wrap::vreinterpretq_u64_s16, + wrap::vreinterpretq_u64_u32, wrap::vreinterpretq_u64_s32, wrap::vreinterpretq_u64_u64, wrap::vreinterpretq_u64_s64, + wrap::vreinterpretq_u64_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_s64_u8, wrap::vreinterpretq_s64_s8, wrap::vreinterpretq_s64_u16, wrap::vreinterpretq_s64_s16, + wrap::vreinterpretq_s64_u32, wrap::vreinterpretq_s64_s32, wrap::vreinterpretq_s64_u64, wrap::vreinterpretq_s64_s64, + wrap::vreinterpretq_s64_f32), + detail::make_bitwise_caster_impl(wrap::vreinterpretq_f32_u8, wrap::vreinterpretq_f32_s8, wrap::vreinterpretq_f32_u16, wrap::vreinterpretq_f32_s16, + wrap::vreinterpretq_f32_u32, wrap::vreinterpretq_f32_s32, wrap::vreinterpretq_f32_u64, wrap::vreinterpretq_f32_s64, + wrap::vreinterpretq_f32_f32)) + }; + using src_register_type = typename batch<T, A>::register_type; + using dst_register_type = typename batch<R, A>::register_type; + return caster.apply<dst_register_type>(src_register_type(arg)); + } + + /********* + * isnan * + *********/ + + template <class A> + inline batch_bool<float, A> isnan(batch<float, A> const& arg, requires_arch<neon>) noexcept + { + return !(arg == arg); + } + + // slide_left + 
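+ // Both slides are byte-granular vextq_u8 against a zero vector:
+ // slide_left<N> moves lanes toward higher indices, zero-filling the low
+ // bytes, and slide_right<N> the reverse. A hypothetical uint32 sketch
+ // with N == 4 (one lane):
+ //   slide_left<4>({1, 2, 3, 4})  -> {0, 1, 2, 3}
+ //   slide_right<4>({1, 2, 3, 4}) -> {2, 3, 4, 0}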
namespace detail + { + template <size_t N> + struct slider_left + { + template <class A, class T> + inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept + { + const auto left = vdupq_n_u8(0); + const auto right = bitwise_cast<uint8_t>(x).data; + const batch<uint8_t, A> res(vextq_u8(left, right, 16 - N)); + return bitwise_cast<T>(res); + } + }; + + template <> + struct slider_left<0> + { + template <class A, class T> + inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept + { + return x; + } + }; + } // namespace detail + + template <size_t N, class A, class T> + inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<neon>) noexcept + { + return detail::slider_left<N> {}(x, A {}); + } + + // slide_right + namespace detail + { + template <size_t N> + struct slider_right + { + template <class A, class T> + inline batch<T, A> operator()(batch<T, A> const& x, requires_arch<neon>) noexcept + { + const auto left = bitwise_cast<uint8_t>(x).data; + const auto right = vdupq_n_u8(0); + const batch<uint8_t, A> res(vextq_u8(left, right, N)); + return bitwise_cast<T>(res); + } + }; + + template <> + struct slider_right<16> + { + template <class A, class T> + inline batch<T, A> operator()(batch<T, A> const&, requires_arch<neon>) noexcept + { + return batch<T, A> {}; + } + }; + } // namespace detail + + template <size_t N, class A, class T> + inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<neon>) noexcept + { + return detail::slider_right<N> {}(x, A {}); + } + } + + template <class batch_type, typename batch_type::value_type... Values> + struct batch_constant; + + namespace kernel + { + /*********** + * swizzle * + ***********/ + + template <class A, class T, class I, I... idx> + inline batch<T, A> swizzle(batch<T, A> const& self, + batch_constant<batch<I, A>, idx...>, + requires_arch<neon>) noexcept + { + static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices"); + std::array<T, batch<T, A>::size> data; + self.store_aligned(data.data()); + return set(batch<T, A>(), A(), data[idx]...); + } + } +} + +#undef WRAP_BINARY_INT_EXCLUDING_64 +#undef WRAP_BINARY_INT +#undef WRAP_BINARY_FLOAT +#undef WRAP_UNARY_INT_EXCLUDING_64 +#undef WRAP_UNARY_INT +#undef WRAP_UNARY_FLOAT + +#endif