Diffstat (limited to 'third_party/xsimd/include')
16 files changed, 508 insertions, 16 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp index 8fa887dc57..7bcc4da241 100644 --- a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp @@ -26,7 +26,7 @@ namespace xsimd using namespace types; // abs - template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/> + template <class A, class T, class> inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept { if (std::is_unsigned<T>::value) @@ -45,6 +45,63 @@ namespace xsimd return hypot(z.real(), z.imag()); } + // avg + namespace detail + { + template <class A, class T> + inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, std::true_type, std::false_type) noexcept + { + return (x & y) + ((x ^ y) >> 1); + } + + template <class A, class T> + inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, std::true_type, std::true_type) noexcept + { + // Inspired by + // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c + auto t = (x & y) + ((x ^ y) >> 1); + auto t_u = bitwise_cast<typename std::make_unsigned<T>::type>(t); + auto avg = t + (bitwise_cast<T>(t_u >> (8 * sizeof(T) - 1)) & (x ^ y)); + return avg; + } + + template <class A, class T> + inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, std::false_type, std::true_type) noexcept + { + return (x + y) / 2; + } + } + + template <class A, class T> + inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, requires_arch<generic>) noexcept + { + return detail::avg(x, y, typename std::is_integral<T>::type {}, typename std::is_signed<T>::type {}); + } + + // avgr + namespace detail + { + template <class A, class T> + inline batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y, std::true_type) noexcept + { + constexpr unsigned shift = 8 * sizeof(T) - 1; + auto adj = std::is_signed<T>::value ? 
((x ^ y) & 0x1) : (((x ^ y) << shift) >> shift); + return ::xsimd::kernel::avg(x, y, A {}) + adj; + } + + template <class A, class T> + inline batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y, std::false_type) noexcept + { + return ::xsimd::kernel::avg(x, y, A {}); + } + } + + template <class A, class T> + inline batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y, requires_arch<generic>) noexcept + { + return detail::avgr(x, y, typename std::is_integral<T>::type {}); + } + // batch_cast template <class A, class T> inline batch<T, A> batch_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp index a5b07ec9da..a4881778e0 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp @@ -76,6 +76,44 @@ namespace xsimd } } + // avgr + template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type> + inline batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_avg_epu16(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type> + inline batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + // bitwise_and template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type> inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp index 94a194dab7..95fbcbd461 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp @@ -112,6 +112,44 @@ namespace xsimd } } + // avgr + template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type> + inline batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_avg_epu16(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type> + inline batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + // bitwise_lshift template 
<class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type> inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp new file mode 100644 index 0000000000..5533923020 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp @@ -0,0 +1,17 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_I8MM_NEON64_HPP +#define XSIMD_I8MM_NEON64_HPP + +#include "../types/xsimd_i8mm_neon64_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp index 0edd776741..dcd2df3fa9 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp @@ -104,6 +104,10 @@ #include "./xsimd_neon64.hpp" #endif +#if XSIMD_WITH_I8MM_NEON64 +#include "./xsimd_i8mm_neon64.hpp" +#endif + #if XSIMD_WITH_SVE #include "./xsimd_sve.hpp" #endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp index 54f09fb663..3510eb21d9 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp @@ -23,33 +23,39 @@ // Wrap intrinsics so we can pass them as function pointers // - OP: intrinsics name prefix, e.g., vorrq // - RT: type traits to deduce intrinsics return types -#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ +#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ namespace wrap \ { \ inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ { \ return ::OP##_u8(a, b); \ } \ - inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \ - { \ - return ::OP##_s8(a, b); \ - } \ inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ { \ return ::OP##_u16(a, b); \ } \ - inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \ - { \ - return ::OP##_s16(a, b); \ - } \ inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ { \ return ::OP##_u32(a, b); \ } \ - inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \ - { \ - return ::OP##_s32(a, b); \ - } \ + } + +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \ + { \ + return ::OP##_s8(a, b); \ + } \ + inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \ + { \ + return ::OP##_s16(a, b); \ + } \ + inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \ + { \ + return ::OP##_s32(a, b); \ + } \ } #define WRAP_BINARY_INT(OP, RT) \ @@ -204,6 +210,10 @@ namespace xsimd uint32x4_t, int32x4_t, float32x4_t>; + using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, + uint16x8_t, int16x8_t, + uint32x4_t, int32x4_t>; + /************************** * comparison dispatchers * **************************/ @@ -744,6 +754,38 @@ namespace xsimd 
return dispatcher.apply(register_type(lhs), register_type(rhs)); } + /******* + * avg * + *******/ + + WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type) + + template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type> + inline batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = { + std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + /******** + * avgr * + ********/ + + WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type) + + template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type> + inline batch<T, A> avgr(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept + { + using register_type = typename batch<T, A>::register_type; + const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = { + std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + /******** * sadd * ********/ diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp index bc982c7ce6..77538d1c2d 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp @@ -92,7 +92,7 @@ namespace xsimd template <class A, class T> inline batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept { - return broadcast<neon64>(val, neon {}); + return broadcast<A>(val, neon {}); } template <class A> @@ -952,6 +952,41 @@ namespace xsimd /********** * zip_lo * **********/ + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip1q_u8(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip1q_s8(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip1q_u16(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip1q_s16(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip1q_u32(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip1q_s32(lhs, rhs); + } template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept @@ -966,6 +1001,12 @@ namespace xsimd } template <class A> + inline batch<float, A> 
zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip1q_f32(lhs, rhs); + } + + template <class A> inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept { return vzip1q_f64(lhs, rhs); @@ -975,6 +1016,42 @@ namespace xsimd * zip_hi * **********/ + template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip2q_u8(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip2q_s8(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip2q_u16(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip2q_s16(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip2q_u32(lhs, rhs); + } + + template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0> + inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip2q_s32(lhs, rhs); + } + template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0> inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept { @@ -988,6 +1065,12 @@ namespace xsimd } template <class A> + inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept + { + return vzip2q_f32(lhs, rhs); + } + + template <class A> inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept { return vzip2q_f64(lhs, rhs); diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp index 39bd607be9..1cde15ffe1 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp @@ -142,6 +142,39 @@ namespace xsimd return x + y; } + template <class T, class Tp> + inline typename std::common_type<T, Tp>::type avg(T const& x, Tp const& y) noexcept + { + using common_type = typename std::common_type<T, Tp>::type; + if (std::is_floating_point<common_type>::value) + return (x + y) / 2; + else if (std::is_unsigned<common_type>::value) + { + return (x & y) + ((x ^ y) >> 1); + } + else + { + // Inspired by + // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c + auto t = (x & y) + ((x ^ y) >> 1); + auto t_u = static_cast<typename std::make_unsigned<common_type>::type>(t); + auto avg = t + (static_cast<T>(t_u >> (8 * sizeof(T) - 1)) & (x ^ y)); + return avg; + } + } + + template <class T, class Tp> + inline typename std::common_type<T, Tp>::type avgr(T const& x, Tp const& y) noexcept + { + using common_type = typename std::common_type<T, Tp>::type; + if (std::is_floating_point<common_type>::value) + return avg(x, y); + else + { + return 
avg(x, y) + ((x ^ y) & 1); + } + } + template <class T> inline T incr(T const& x) noexcept { diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp index 0a34cb1e9b..d39cc201f9 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp @@ -60,6 +60,10 @@ namespace xsimd inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept; template <class A, typename T, typename ITy, ITy... Indices> inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept; + template <class A, class T> + inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept; + template <class A, class T> + inline batch<T, A> avgr(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept; // abs template <class A> @@ -148,6 +152,44 @@ namespace xsimd return _mm_movemask_epi8(self) != 0; } + // avgr + template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type> + inline batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_avg_epu16(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type> + inline batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + // batch_bool_cast template <class A, class T_out, class T_in> inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp index 8160b2423b..ab9acdc8c3 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp @@ -37,6 +37,8 @@ namespace xsimd inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept; template <class A, typename T, typename ITy, ITy... 
Indices> inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept; + template <class A, class T> + inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept; // abs template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type> @@ -116,6 +118,44 @@ namespace xsimd return wasm_f64x2_add(self, other); } + // avgr + template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type> + inline batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_avgr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_avgr(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type> + inline batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + // all template <class A> inline bool all(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept diff --git a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp index 575459a009..ea48aa057d 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp @@ -194,7 +194,7 @@ namespace xsimd using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>; using all_rvv_architectures = arch_list<detail::rvv<512>, detail::rvv<256>, detail::rvv<128>>; - using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type; + using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<i8mm<neon64>, neon64, neon>>::type; using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list<wasm>; using all_architectures = typename detail::join<all_riscv_architectures, all_wasm_architectures, all_arm_architectures, all_x86_architectures>::type; diff --git a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp index cf5163c37e..10a074dedb 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp @@ -352,6 +352,17 @@ /** * @ingroup xsimd_config_macro * + * Set to 1 if i8mm neon64 extension is available at compile-time, to 0 otherwise. + */ +#if defined(__ARM_FEATURE_MATMUL_INT8) +#define XSIMD_WITH_I8MM_NEON64 1 +#else +#define XSIMD_WITH_I8MM_NEON64 0 +#endif + +/** + * @ingroup xsimd_config_macro + * * Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise. 
*/ #if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0 diff --git a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp index 5c8b1f38d0..89b883a396 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp @@ -18,6 +18,11 @@ #if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector)) #include <asm/hwcap.h> #include <sys/auxv.h> + +#ifndef HWCAP2_I8MM +#define HWCAP2_I8MM (1 << 13) +#endif + #endif #if defined(_MSC_VER) @@ -66,6 +71,7 @@ namespace xsimd ARCH_FIELD_EX(avx512vnni<::xsimd::avx512vbmi>, avx512vnni_vbmi) ARCH_FIELD(neon) ARCH_FIELD(neon64) + ARCH_FIELD_EX(i8mm<::xsimd::neon64>, i8mm_neon64) ARCH_FIELD(sve) ARCH_FIELD(rvv) ARCH_FIELD(wasm) @@ -83,6 +89,9 @@ namespace xsimd #if defined(__aarch64__) || defined(_M_ARM64) neon = 1; neon64 = 1; +#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18) + i8mm_neon64 = bool(getauxval(AT_HWCAP2) & HWCAP2_I8MM); +#endif #elif defined(__ARM_NEON) || defined(_M_ARM) #if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18) diff --git a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp index 4350ca0a28..6537157bc6 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp @@ -36,6 +36,8 @@ #include "xsimd_avx512dq_register.hpp" #include "xsimd_avx512f_register.hpp" +#include "xsimd_i8mm_neon64_register.hpp" + #include "xsimd_neon64_register.hpp" #include "xsimd_neon_register.hpp" diff --git a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp index 0420f0a09d..751e31d33a 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp @@ -203,6 +203,36 @@ namespace xsimd } /** + * @ingroup batch_math + * + * Computes the average of batches \c x and \c y + * @param x batch of T + * @param y batch of T + * @return the average of elements between \c x and \c y. + */ + template <class T, class A> + inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y) noexcept + { + detail::static_check_supported_config<T, A>(); + return kernel::avg<A>(x, y, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the rounded average of batches \c x and \c y + * @param x batch of T + * @param y batch of T + * @return the rounded average of elements between \c x and \c y. + */ + template <class T, class A> + inline batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y) noexcept + { + detail::static_check_supported_config<T, A>(); + return kernel::avgr<A>(x, y, A {}); + } + + /** * @ingroup batch_conversion * * Perform a static_cast from \c T_in to \c T_out on \c \c x. 
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp
new file mode 100644
index 0000000000..fc0c884d0b
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_I8MM_NEON64_REGISTER_HPP
+#define XSIMD_I8MM_NEON64_REGISTER_HPP
+
+#include "./xsimd_neon64_register.hpp"
+
+namespace xsimd
+{
+    template <typename arch>
+    struct i8mm;
+
+    /**
+     * @ingroup architectures
+     *
+     * Neon64 + i8mm instructions
+     */
+    template <>
+    struct i8mm<neon64> : neon64
+    {
+        static constexpr bool supported() noexcept { return XSIMD_WITH_I8MM_NEON64; }
+        static constexpr bool available() noexcept { return true; }
+        static constexpr unsigned version() noexcept { return generic::version(8, 2, 0); }
+        static constexpr char const* name() noexcept { return "i8mm+neon64"; }
+    };
+
+#if XSIMD_WITH_I8MM_NEON64
+    namespace types
+    {
+
+        XSIMD_DECLARE_SIMD_REGISTER_ALIAS(i8mm<neon64>, neon64);
+
+    }
+#endif
+
+}
+#endif
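Usage sketch (not part of the patch): the entry points added to xsimd_api.hpp can be called like any other batch operation. A minimal example, assuming a build whose default architecture is one of the targets patched above (SSE2, AVX2, AVX512BW, NEON, or WASM); the values in the comments follow the unsigned semantics implemented by the kernels:

    #include "xsimd/xsimd.hpp"
    #include <cstdint>
    #include <cstdio>

    int main()
    {
        using batch_u8 = xsimd::batch<uint8_t>; // default architecture for this build

        batch_u8 a(200), b(101);

        batch_u8 lo = xsimd::avg(a, b);  // truncating average: every lane holds 150
        batch_u8 hi = xsimd::avgr(a, b); // rounded average:    every lane holds 151

        uint8_t out_lo[batch_u8::size], out_hi[batch_u8::size];
        lo.store_unaligned(out_lo);
        hi.store_unaligned(out_hi);
        std::printf("avg=%u avgr=%u\n", unsigned(out_lo[0]), unsigned(out_hi[0]));
        return 0;
    }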
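The generic and scalar kernels above rely on the same overflow-free identity: (x & y) + ((x ^ y) >> 1) is the floored average computed without ever forming x + y, and the extra terms turn that into truncation toward zero (signed avg) or round-half-up (unsigned avgr). A scalar restatement of those expressions, with hypothetical test values:

    #include <cassert>
    #include <cstdint>

    int main()
    {
        // Unsigned lanes: floored average without forming the (potentially wrapping) sum,
        // then +1 when the sum is odd to get the rounded variant (matches _mm_avg_epu8).
        uint8_t x = 200, y = 101;
        uint8_t lo = (x & y) + ((x ^ y) >> 1); // 150
        uint8_t hi = lo + ((x ^ y) & 1);       // 151
        assert(lo == 150 && hi == 151);

        // Signed lanes: t is the floored average; adding the sign bit of t masked with
        // the low bit of x ^ y converts flooring into truncation toward zero.
        int8_t a = -3, b = -4;
        int8_t t = (a & b) + ((a ^ b) >> 1);            // floor(-3.5) == -4
        int8_t avg = t + ((uint8_t(t) >> 7) & (a ^ b)); // trunc(-3.5) == -3
        assert(avg == -3);
        return 0;
    }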
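The i8mm<neon64> extension introduced above participates in both compile-time and run-time dispatch. A sketch of how it can be probed, assuming xsimd::available_architectures() (the helper populated by the ARCH_FIELD table that xsimd_cpuid.hpp extends) keeps its current shape:

    #include "xsimd/xsimd.hpp"
    #include <cstdio>

    int main()
    {
        // Compile-time: 1 only when the compiler defines __ARM_FEATURE_MATMUL_INT8.
        std::printf("compiled with i8mm+neon64: %d\n", int(XSIMD_WITH_I8MM_NEON64));

        // Run-time: on aarch64 Linux the new field is read from getauxval(AT_HWCAP2) & HWCAP2_I8MM;
        // it stays 0 on every other platform.
        auto features = xsimd::available_architectures();
        std::printf("neon64: %u, i8mm+neon64: %u\n",
                    unsigned(features.neon64), unsigned(features.i8mm_neon64));
        return 0;
    }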