From fbaf0bb26397aa498eb9156f06d5a6fe34dd7dd8 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 19 Apr 2024 03:14:29 +0200 Subject: Merging upstream version 125.0.1. Signed-off-by: Daniel Baumann --- .../xsimd/arch/generic/xsimd_generic_math.hpp | 59 ++++++++++++++- .../xsimd/include/xsimd/arch/xsimd_avx2.hpp | 38 ++++++++++ .../xsimd/include/xsimd/arch/xsimd_avx512bw.hpp | 38 ++++++++++ .../xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp | 17 +++++ third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp | 4 + .../xsimd/include/xsimd/arch/xsimd_neon.hpp | 68 +++++++++++++---- .../xsimd/include/xsimd/arch/xsimd_neon64.hpp | 85 +++++++++++++++++++++- .../xsimd/include/xsimd/arch/xsimd_scalar.hpp | 33 +++++++++ .../xsimd/include/xsimd/arch/xsimd_sse2.hpp | 42 +++++++++++ .../xsimd/include/xsimd/arch/xsimd_wasm.hpp | 40 ++++++++++ .../xsimd/include/xsimd/config/xsimd_arch.hpp | 2 +- .../xsimd/include/xsimd/config/xsimd_config.hpp | 11 +++ .../xsimd/include/xsimd/config/xsimd_cpuid.hpp | 9 +++ .../include/xsimd/types/xsimd_all_registers.hpp | 2 + .../xsimd/include/xsimd/types/xsimd_api.hpp | 30 ++++++++ .../xsimd/types/xsimd_i8mm_neon64_register.hpp | 46 ++++++++++++ 16 files changed, 508 insertions(+), 16 deletions(-) create mode 100644 third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp create mode 100644 third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp (limited to 'third_party/xsimd/include') diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp index 8fa887dc57..7bcc4da241 100644 --- a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp @@ -26,7 +26,7 @@ namespace xsimd using namespace types; // abs - template ::value, void>::type*/> + template inline batch abs(batch const& self, requires_arch) noexcept { if (std::is_unsigned::value) @@ -45,6 +45,63 @@ namespace xsimd return hypot(z.real(), z.imag()); } + // avg + namespace detail + { + template + inline batch avg(batch const& x, batch const& y, std::true_type, std::false_type) noexcept + { + return (x & y) + ((x ^ y) >> 1); + } + + template + inline batch avg(batch const& x, batch const& y, std::true_type, std::true_type) noexcept + { + // Inspired by + // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c + auto t = (x & y) + ((x ^ y) >> 1); + auto t_u = bitwise_cast::type>(t); + auto avg = t + (bitwise_cast(t_u >> (8 * sizeof(T) - 1)) & (x ^ y)); + return avg; + } + + template + inline batch avg(batch const& x, batch const& y, std::false_type, std::true_type) noexcept + { + return (x + y) / 2; + } + } + + template + inline batch avg(batch const& x, batch const& y, requires_arch) noexcept + { + return detail::avg(x, y, typename std::is_integral::type {}, typename std::is_signed::type {}); + } + + // avgr + namespace detail + { + template + inline batch avgr(batch const& x, batch const& y, std::true_type) noexcept + { + constexpr unsigned shift = 8 * sizeof(T) - 1; + auto adj = std::is_signed::value ? ((x ^ y) & 0x1) : (((x ^ y) << shift) >> shift); + return ::xsimd::kernel::avg(x, y, A {}) + adj; + } + + template + inline batch avgr(batch const& x, batch const& y, std::false_type) noexcept + { + return ::xsimd::kernel::avg(x, y, A {}); + } + } + + template + inline batch avgr(batch const& x, batch const& y, requires_arch) noexcept + { + return detail::avgr(x, y, typename std::is_integral::type {}); + } + // batch_cast template inline batch batch_cast(batch const& self, batch const&, requires_arch) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp index a5b07ec9da..a4881778e0 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp @@ -76,6 +76,44 @@ namespace xsimd } } + // avgr + template ::value, void>::type> + inline batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm256_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm256_avg_epu16(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template ::value, void>::type> + inline batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + // bitwise_and template ::value, void>::type> inline batch bitwise_and(batch const& self, batch const& other, requires_arch) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp index 94a194dab7..95fbcbd461 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp @@ -112,6 +112,44 @@ namespace xsimd } } + // avgr + template ::value, void>::type> + inline batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm512_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm512_avg_epu16(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template ::value, void>::type> + inline batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + // bitwise_lshift template ::value, void>::type> inline batch bitwise_lshift(batch const& self, int32_t other, requires_arch) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp new file mode 100644 index 0000000000..5533923020 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp @@ -0,0 +1,17 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_I8MM_NEON64_HPP +#define XSIMD_I8MM_NEON64_HPP + +#include "../types/xsimd_i8mm_neon64_register.hpp" + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp index 0edd776741..dcd2df3fa9 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp @@ -104,6 +104,10 @@ #include "./xsimd_neon64.hpp" #endif +#if XSIMD_WITH_I8MM_NEON64 +#include "./xsimd_i8mm_neon64.hpp" +#endif + #if XSIMD_WITH_SVE #include "./xsimd_sve.hpp" #endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp index 54f09fb663..3510eb21d9 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp @@ -23,33 +23,39 @@ // Wrap intrinsics so we can pass them as function pointers // - OP: intrinsics name prefix, e.g., vorrq // - RT: type traits to deduce intrinsics return types -#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ +#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ namespace wrap \ { \ inline RT OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \ { \ return ::OP##_u8(a, b); \ } \ - inline RT OP##_s8(int8x16_t a, int8x16_t b) noexcept \ - { \ - return ::OP##_s8(a, b); \ - } \ inline RT OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \ { \ return ::OP##_u16(a, b); \ } \ - inline RT OP##_s16(int16x8_t a, int16x8_t b) noexcept \ - { \ - return ::OP##_s16(a, b); \ - } \ inline RT OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \ { \ return ::OP##_u32(a, b); \ } \ - inline RT OP##_s32(int32x4_t a, int32x4_t b) noexcept \ - { \ - return ::OP##_s32(a, b); \ - } \ + } + +#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \ + WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \ + namespace wrap \ + { \ + inline RT OP##_s8(int8x16_t a, int8x16_t b) noexcept \ + { \ + return ::OP##_s8(a, b); \ + } \ + inline RT OP##_s16(int16x8_t a, int16x8_t b) noexcept \ + { \ + return ::OP##_s16(a, b); \ + } \ + inline RT OP##_s32(int32x4_t a, int32x4_t b) noexcept \ + { \ + return ::OP##_s32(a, b); \ + } \ } #define WRAP_BINARY_INT(OP, RT) \ @@ -204,6 +210,10 @@ namespace xsimd uint32x4_t, int32x4_t, float32x4_t>; + using excluding_int64f32_dispatcher = neon_dispatcher_impl; + /************************** * comparison dispatchers * **************************/ @@ -744,6 +754,38 @@ namespace xsimd return dispatcher.apply(register_type(lhs), register_type(rhs)); } + /******* + * avg * + *******/ + + WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type) + + template ::value && sizeof(T) != 8), void>::type> + inline batch avg(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + using register_type = typename batch::register_type; + const detail::neon_dispatcher_impl::binary dispatcher = { + std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + + /******** + * avgr * + ********/ + + WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type) + + template ::value && sizeof(T) != 8), void>::type> + inline batch avgr(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + using register_type = typename batch::register_type; + const detail::neon_dispatcher_impl::binary dispatcher = { + std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32) + }; + return dispatcher.apply(register_type(lhs), register_type(rhs)); + } + /******** * sadd * ********/ diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp index bc982c7ce6..77538d1c2d 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp @@ -92,7 +92,7 @@ namespace xsimd template inline batch broadcast(T val, requires_arch) noexcept { - return broadcast(val, neon {}); + return broadcast(val, neon {}); } template @@ -952,6 +952,41 @@ namespace xsimd /********** * zip_lo * **********/ + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_u8(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_s8(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_u16(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_s16(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_u32(lhs, rhs); + } + + template = 0> + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_s32(lhs, rhs); + } template = 0> inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept @@ -965,6 +1000,12 @@ namespace xsimd return vzip1q_s64(lhs, rhs); } + template + inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip1q_f32(lhs, rhs); + } + template inline batch zip_lo(batch const& lhs, batch const& rhs, requires_arch) noexcept { @@ -975,6 +1016,42 @@ namespace xsimd * zip_hi * **********/ + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_u8(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_s8(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_u16(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_s16(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_u32(lhs, rhs); + } + + template = 0> + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_s32(lhs, rhs); + } + template = 0> inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { @@ -987,6 +1064,12 @@ namespace xsimd return vzip2q_s64(lhs, rhs); } + template + inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept + { + return vzip2q_f32(lhs, rhs); + } + template inline batch zip_hi(batch const& lhs, batch const& rhs, requires_arch) noexcept { diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp index 39bd607be9..1cde15ffe1 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp @@ -142,6 +142,39 @@ namespace xsimd return x + y; } + template + inline typename std::common_type::type avg(T const& x, Tp const& y) noexcept + { + using common_type = typename std::common_type::type; + if (std::is_floating_point::value) + return (x + y) / 2; + else if (std::is_unsigned::value) + { + return (x & y) + ((x ^ y) >> 1); + } + else + { + // Inspired by + // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c + auto t = (x & y) + ((x ^ y) >> 1); + auto t_u = static_cast::type>(t); + auto avg = t + (static_cast(t_u >> (8 * sizeof(T) - 1)) & (x ^ y)); + return avg; + } + } + + template + inline typename std::common_type::type avgr(T const& x, Tp const& y) noexcept + { + using common_type = typename std::common_type::type; + if (std::is_floating_point::value) + return avg(x, y); + else + { + return avg(x, y) + ((x ^ y) & 1); + } + } + template inline T incr(T const& x) noexcept { diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp index 0a34cb1e9b..d39cc201f9 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp @@ -60,6 +60,10 @@ namespace xsimd inline batch insert(batch const& self, T val, index, requires_arch) noexcept; template inline batch shuffle(batch const& x, batch const& y, batch_constant, Indices...>, requires_arch) noexcept; + template + inline batch avg(batch const&, batch const&, requires_arch) noexcept; + template + inline batch avgr(batch const&, batch const&, requires_arch) noexcept; // abs template @@ -148,6 +152,44 @@ namespace xsimd return _mm_movemask_epi8(self) != 0; } + // avgr + template ::value, void>::type> + inline batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return _mm_avg_epu8(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return _mm_avg_epu16(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template ::value, void>::type> + inline batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + // batch_bool_cast template inline batch_bool batch_bool_cast(batch_bool const& self, batch_bool const&, requires_arch) noexcept diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp index 8160b2423b..ab9acdc8c3 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp @@ -37,6 +37,8 @@ namespace xsimd inline batch insert(batch const& self, T val, index, requires_arch) noexcept; template inline batch shuffle(batch const& x, batch const& y, batch_constant, Indices...>, requires_arch) noexcept; + template + inline batch avg(batch const&, batch const&, requires_arch) noexcept; // abs template ::value && std::is_signed::value, void>::type> @@ -116,6 +118,44 @@ namespace xsimd return wasm_f64x2_add(self, other); } + // avgr + template ::value, void>::type> + inline batch avgr(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + return wasm_u8x16_avgr(self, other); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + return wasm_u16x8_avgr(self, other); + } + else + { + return avgr(self, other, generic {}); + } + } + + // avg + template ::value, void>::type> + inline batch avg(batch const& self, batch const& other, requires_arch) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 1) + { + auto adj = ((self ^ other) << 7) >> 7; + return avgr(self, other, A {}) - adj; + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 2) + { + auto adj = ((self ^ other) << 15) >> 15; + return avgr(self, other, A {}) - adj; + } + else + { + return avg(self, other, generic {}); + } + } + // all template inline bool all(batch_bool const& self, requires_arch) noexcept diff --git a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp index 575459a009..ea48aa057d 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp @@ -194,7 +194,7 @@ namespace xsimd using all_sve_architectures = arch_list, detail::sve<256>, detail::sve<128>>; using all_rvv_architectures = arch_list, detail::rvv<256>, detail::rvv<128>>; - using all_arm_architectures = typename detail::join>::type; + using all_arm_architectures = typename detail::join, neon64, neon>>::type; using all_riscv_architectures = all_rvv_architectures; using all_wasm_architectures = arch_list; using all_architectures = typename detail::join::type; diff --git a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp index cf5163c37e..10a074dedb 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp @@ -349,6 +349,17 @@ #define XSIMD_WITH_NEON64 0 #endif +/** + * @ingroup xsimd_config_macro + * + * Set to 1 if i8mm neon64 extension is available at compile-time, to 0 otherwise. + */ +#if defined(__ARM_FEATURE_MATMUL_INT8) +#define XSIMD_WITH_I8MM_NEON64 1 +#else +#define XSIMD_WITH_I8MM_NEON64 0 +#endif + /** * @ingroup xsimd_config_macro * diff --git a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp index 5c8b1f38d0..89b883a396 100644 --- a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp +++ b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp @@ -18,6 +18,11 @@ #if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector)) #include #include + +#ifndef HWCAP2_I8MM +#define HWCAP2_I8MM (1 << 13) +#endif + #endif #if defined(_MSC_VER) @@ -66,6 +71,7 @@ namespace xsimd ARCH_FIELD_EX(avx512vnni<::xsimd::avx512vbmi>, avx512vnni_vbmi) ARCH_FIELD(neon) ARCH_FIELD(neon64) + ARCH_FIELD_EX(i8mm<::xsimd::neon64>, i8mm_neon64) ARCH_FIELD(sve) ARCH_FIELD(rvv) ARCH_FIELD(wasm) @@ -83,6 +89,9 @@ namespace xsimd #if defined(__aarch64__) || defined(_M_ARM64) neon = 1; neon64 = 1; +#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18) + i8mm_neon64 = bool(getauxval(AT_HWCAP2) & HWCAP2_I8MM); +#endif #elif defined(__ARM_NEON) || defined(_M_ARM) #if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18) diff --git a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp index 4350ca0a28..6537157bc6 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp @@ -36,6 +36,8 @@ #include "xsimd_avx512dq_register.hpp" #include "xsimd_avx512f_register.hpp" +#include "xsimd_i8mm_neon64_register.hpp" + #include "xsimd_neon64_register.hpp" #include "xsimd_neon_register.hpp" diff --git a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp index 0420f0a09d..751e31d33a 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp @@ -202,6 +202,36 @@ namespace xsimd return kernel::atanh(x, A {}); } + /** + * @ingroup batch_math + * + * Computes the average of batches \c x and \c y + * @param x batch of T + * @param y batch of T + * @return the average of elements between \c x and \c y. + */ + template + inline batch avg(batch const& x, batch const& y) noexcept + { + detail::static_check_supported_config(); + return kernel::avg(x, y, A {}); + } + + /** + * @ingroup batch_math + * + * Computes the rounded average of batches \c x and \c y + * @param x batch of T + * @param y batch of T + * @return the rounded average of elements between \c x and \c y. + */ + template + inline batch avgr(batch const& x, batch const& y) noexcept + { + detail::static_check_supported_config(); + return kernel::avgr(x, y, A {}); + } + /** * @ingroup batch_conversion * diff --git a/third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp new file mode 100644 index 0000000000..fc0c884d0b --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp @@ -0,0 +1,46 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_I8MM_NEON64_REGISTER_HPP +#define XSIMD_I8MM_NEON64_REGISTER_HPP + +#include "./xsimd_neon64_register.hpp" + +namespace xsimd +{ + template + struct i8mm; + + /** + * @ingroup architectures + * + * Neon64 + i8mm instructions + */ + template <> + struct i8mm : neon64 + { + static constexpr bool supported() noexcept { return XSIMD_WITH_I8MM_NEON64; } + static constexpr bool available() noexcept { return true; } + static constexpr unsigned version() noexcept { return generic::version(8, 2, 0); } + static constexpr char const* name() noexcept { return "i8mm+neon64"; } + }; + +#if XSIMD_WITH_I8MM_NEON64 + namespace types + { + + XSIMD_DECLARE_SIMD_REGISTER_ALIAS(i8mm, neon64); + + } +#endif + +} +#endif -- cgit v1.2.3