summary | refs | log | tree | commit | diff | stats
path: root/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp')
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp 68
1 files changed, 55 insertions, 13 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
index 54f09fb663..3510eb21d9 100644
--- a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
@@ -23,33 +23,39 @@
// Wrap intrinsics so we can pass them as function pointers
// - OP: intrinsics name prefix, e.g., vorrq
// - RT: type traits to deduce intrinsics return types
-#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
namespace wrap \
{ \
inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \
{ \
return ::OP##_u8(a, b); \
} \
- inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
- { \
- return ::OP##_s8(a, b); \
- } \
inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
{ \
return ::OP##_u16(a, b); \
} \
- inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
- { \
- return ::OP##_s16(a, b); \
- } \
inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
{ \
return ::OP##_u32(a, b); \
} \
- inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
- { \
- return ::OP##_s32(a, b); \
- } \
+ }
+
+#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+ WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
+ namespace wrap \
+ { \
+ inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
+ { \
+ return ::OP##_s8(a, b); \
+ } \
+ inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
+ { \
+ return ::OP##_s16(a, b); \
+ } \
+ inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
+ { \
+ return ::OP##_s32(a, b); \
+ } \
}
#define WRAP_BINARY_INT(OP, RT) \
@@ -204,6 +210,10 @@ namespace xsimd
uint32x4_t, int32x4_t,
float32x4_t>;
+ using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t,
+ uint16x8_t, int16x8_t,
+ uint32x4_t, int32x4_t>;
+
/**************************
* comparison dispatchers *
**************************/
@@ -744,6 +754,38 @@ namespace xsimd
return dispatcher.apply(register_type(lhs), register_type(rhs));
}
+ /*******
+ * avg *
+ *******/
+
+ WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type)
+
+ template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
+ inline batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
+ std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
+ /********
+ * avgr *
+ ********/
+
+ WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type)
+
+ template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type>
+ inline batch<T, A> avgr(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
+ std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32)
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs));
+ }
+
/********
* sadd *
********/