summary | refs | log | tree | commit | diff | stats
path: root/third_party/xsimd
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/xsimd')
-rw-r--r-- third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp 59
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp 38
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp 38
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp 17
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp 4
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp 68
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp 85
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp 33
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp 42
-rw-r--r-- third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp 40
-rw-r--r-- third_party/xsimd/include/xsimd/config/xsimd_arch.hpp 2
-rw-r--r-- third_party/xsimd/include/xsimd/config/xsimd_config.hpp 11
-rw-r--r-- third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp 9
-rw-r--r-- third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp 2
-rw-r--r-- third_party/xsimd/include/xsimd/types/xsimd_api.hpp 30
-rw-r--r-- third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp 46
-rw-r--r-- third_party/xsimd/moz.yaml 4
17 files changed, 510 insertions, 18 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
index 8fa887dc57..7bcc4da241 100644
--- a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp
@@ -26,7 +26,7 @@ namespace xsimd
using namespace types;
// abs
- template <class A, class T, class /*=typename std::enable_if<std::is_integral<T>::value, void>::type*/>
+ template <class A, class T, class>
inline batch<T, A> abs(batch<T, A> const& self, requires_arch<generic>) noexcept
{
if (std::is_unsigned<T>::value)
@@ -45,6 +45,63 @@ namespace xsimd
return hypot(z.real(), z.imag());
}
+ // avg: per-category helpers; the trailing tag arguments are <std::is_integral<T>, std::is_signed<T>>
+ namespace detail
+ {
+ template <class A, class T>
+ inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, std::true_type, std::false_type) noexcept
+ {
+ return (x & y) + ((x ^ y) >> 1); // unsigned integers: shared bits plus half of the differing bits, never forms x + y
+ }
+
+ template <class A, class T>
+ inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, std::true_type, std::true_type) noexcept
+ {
+ // Inspired by
+ // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c
+ auto t = (x & y) + ((x ^ y) >> 1); // same formula as the unsigned helper, applied to the signed batch
+ auto t_u = bitwise_cast<typename std::make_unsigned<T>::type>(t); // reinterpret t as unsigned so the shift below can isolate its top bit
+ auto avg = t + (bitwise_cast<T>(t_u >> (8 * sizeof(T) - 1)) & (x ^ y)); // adds bit 0 of x ^ y to t when t's top (sign) bit is set
+ return avg;
+ }
+
+ template <class A, class T>
+ inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, std::false_type, std::true_type) noexcept
+ {
+ return (x + y) / 2; // non-integral (floating-point) T: plain arithmetic mean
+ }
+ }
+
+ template <class A, class T> // generic avg kernel: element-wise average of x and y
+ inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y, requires_arch<generic>) noexcept
+ {
+ return detail::avg(x, y, typename std::is_integral<T>::type {}, typename std::is_signed<T>::type {}); // tags select the detail::avg overload for T's category
+ }
+
+ // avgr: helpers selected on std::is_integral<T>; both are built on the arch's avg kernel
+ namespace detail
+ {
+ template <class A, class T>
+ inline batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y, std::true_type) noexcept
+ {
+ constexpr unsigned shift = 8 * sizeof(T) - 1; // index of the top bit of T
+ auto adj = std::is_signed<T>::value ? ((x ^ y) & 0x1) : (((x ^ y) << shift) >> shift); // bit 0 of x ^ y, via a mask (signed) or a shift up/down pair (unsigned)
+ return ::xsimd::kernel::avg(x, y, A {}) + adj; // NOTE(review): for signed T, detail::avg already adds this bit when its intermediate is negative - confirm the double adjustment is intended
+ }
+
+ template <class A, class T>
+ inline batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y, std::false_type) noexcept
+ {
+ return ::xsimd::kernel::avg(x, y, A {}); // non-integral T: no adjustment, same result as avg
+ }
+ }
+
+ template <class A, class T> // generic avgr kernel: avg plus an odd-sum adjustment for integral T
+ inline batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y, requires_arch<generic>) noexcept
+ {
+ return detail::avgr(x, y, typename std::is_integral<T>::type {}); // tag selects the integral or non-integral helper
+ }
+
// batch_cast
template <class A, class T>
inline batch<T, A> batch_cast(batch<T, A> const& self, batch<T, A> const&, requires_arch<generic>) noexcept
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
index a5b07ec9da..a4881778e0 100644
--- a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp
@@ -76,6 +76,44 @@ namespace xsimd
}
}
+ // avgr: AVX2 kernel for unsigned T; 8- and 16-bit lanes map to a single intrinsic
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ inline batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_avg_epu8(self, other); // per-lane (a + b + 1) >> 1 on unsigned 8-bit lanes
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_avg_epu16(self, other); // same operation on unsigned 16-bit lanes
+ }
+ else
+ {
+ return avgr(self, other, generic {}); // wider lanes: defer to the generic kernel
+ }
+ }
+
+ // avg: AVX2 kernel for unsigned T; 8- and 16-bit lanes are derived from avgr
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ inline batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto adj = ((self ^ other) << 7) >> 7; // bit 0 of self ^ other: moved to the top bit of the 8-bit lane and back
+ return avgr(self, other, A {}) - adj; // subtract the bit that avgr's rounding added when the sum is odd
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto adj = ((self ^ other) << 15) >> 15; // same extraction for 16-bit lanes
+ return avgr(self, other, A {}) - adj;
+ }
+ else
+ {
+ return avg(self, other, generic {}); // wider lanes: defer to the generic kernel
+ }
+ }
+
// bitwise_and
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx2>) noexcept
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp
index 94a194dab7..95fbcbd461 100644
--- a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp
@@ -112,6 +112,44 @@ namespace xsimd
}
}
+ // avgr: AVX-512BW kernel for unsigned T; 8- and 16-bit lanes map to a single intrinsic
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ inline batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm512_avg_epu8(self, other); // per-lane (a + b + 1) >> 1 on unsigned 8-bit lanes
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm512_avg_epu16(self, other); // same operation on unsigned 16-bit lanes
+ }
+ else
+ {
+ return avgr(self, other, generic {}); // wider lanes: defer to the generic kernel
+ }
+ }
+
+ // avg: AVX-512BW kernel for unsigned T; 8- and 16-bit lanes are derived from avgr
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ inline batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx512bw>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto adj = ((self ^ other) << 7) >> 7; // bit 0 of self ^ other: moved to the top bit of the 8-bit lane and back
+ return avgr(self, other, A {}) - adj; // subtract the bit that avgr's rounding added when the sum is odd
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto adj = ((self ^ other) << 15) >> 15; // same extraction for 16-bit lanes
+ return avgr(self, other, A {}) - adj;
+ }
+ else
+ {
+ return avg(self, other, generic {}); // wider lanes: defer to the generic kernel
+ }
+ }
+
// bitwise_lshift
template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx512bw>) noexcept
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp
new file mode 100644
index 0000000000..5533923020
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_i8mm_neon64.hpp
@@ -0,0 +1,17 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_I8MM_NEON64_HPP
+#define XSIMD_I8MM_NEON64_HPP
+
+#include "../types/xsimd_i8mm_neon64_register.hpp"
+
+#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp
index 0edd776741..dcd2df3fa9 100644
--- a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp
@@ -104,6 +104,10 @@
#include "./xsimd_neon64.hpp"
#endif
+#if XSIMD_WITH_I8MM_NEON64
+#include "./xsimd_i8mm_neon64.hpp"
+#endif
+
#if XSIMD_WITH_SVE
#include "./xsimd_sve.hpp"
#endif
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
index 54f09fb663..3510eb21d9 100644
--- a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp
@@ -23,33 +23,39 @@
// Wrap intrinsics so we can pass them as function pointers
// - OP: intrinsics name prefix, e.g., vorrq
// - RT: type traits to deduce intrinsics return types
-#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+#define WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
namespace wrap \
{ \
inline RT<uint8x16_t> OP##_u8(uint8x16_t a, uint8x16_t b) noexcept \
{ \
return ::OP##_u8(a, b); \
} \
- inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
- { \
- return ::OP##_s8(a, b); \
- } \
inline RT<uint16x8_t> OP##_u16(uint16x8_t a, uint16x8_t b) noexcept \
{ \
return ::OP##_u16(a, b); \
} \
- inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
- { \
- return ::OP##_s16(a, b); \
- } \
inline RT<uint32x4_t> OP##_u32(uint32x4_t a, uint32x4_t b) noexcept \
{ \
return ::OP##_u32(a, b); \
} \
- inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
- { \
- return ::OP##_s32(a, b); \
- } \
+ }
+
+#define WRAP_BINARY_INT_EXCLUDING_64(OP, RT) \
+ WRAP_BINARY_UINT_EXCLUDING_64(OP, RT) \
+ namespace wrap \
+ { \
+ inline RT<int8x16_t> OP##_s8(int8x16_t a, int8x16_t b) noexcept \
+ { \
+ return ::OP##_s8(a, b); \
+ } \
+ inline RT<int16x8_t> OP##_s16(int16x8_t a, int16x8_t b) noexcept \
+ { \
+ return ::OP##_s16(a, b); \
+ } \
+ inline RT<int32x4_t> OP##_s32(int32x4_t a, int32x4_t b) noexcept \
+ { \
+ return ::OP##_s32(a, b); \
+ } \
}
#define WRAP_BINARY_INT(OP, RT) \
@@ -204,6 +210,10 @@ namespace xsimd
uint32x4_t, int32x4_t,
float32x4_t>;
+ using excluding_int64f32_dispatcher = neon_dispatcher_impl<uint8x16_t, int8x16_t, // dispatcher over 8-bit integer registers...
+ uint16x8_t, int16x8_t, // ...16-bit integer registers...
+ uint32x4_t, int32x4_t>; // ...and 32-bit integer registers; no 64-bit integer and no float32 entry
+
/**************************
* comparison dispatchers *
**************************/
@@ -744,6 +754,38 @@ namespace xsimd
return dispatcher.apply(register_type(lhs), register_type(rhs));
}
+ /*******
+ * avg *
+ *******/
+
+ WRAP_BINARY_UINT_EXCLUDING_64(vhaddq, detail::identity_return_type) // defines wrap::vhaddq_u8 / _u16 / _u32
+
+ template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type> // only unsigned T narrower than 64 bits
+ inline batch<T, A> avg(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
+ std::make_tuple(wrap::vhaddq_u8, wrap::vhaddq_u16, wrap::vhaddq_u32) // halving-add wrappers, one per unsigned register type
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs)); // presumably picks the wrapper matching register_type - see neon_dispatcher_impl
+ }
+
+ /********
+ * avgr *
+ ********/
+
+ WRAP_BINARY_UINT_EXCLUDING_64(vrhaddq, detail::identity_return_type) // defines wrap::vrhaddq_u8 / _u16 / _u32
+
+ template <class A, class T, class = typename std::enable_if<(std::is_unsigned<T>::value && sizeof(T) != 8), void>::type> // only unsigned T narrower than 64 bits
+ inline batch<T, A> avgr(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon>) noexcept
+ {
+ using register_type = typename batch<T, A>::register_type;
+ const detail::neon_dispatcher_impl<uint8x16_t, uint16x8_t, uint32x4_t>::binary dispatcher = {
+ std::make_tuple(wrap::vrhaddq_u8, wrap::vrhaddq_u16, wrap::vrhaddq_u32) // rounding halving-add wrappers, one per unsigned register type
+ };
+ return dispatcher.apply(register_type(lhs), register_type(rhs)); // presumably picks the wrapper matching register_type - see neon_dispatcher_impl
+ }
+
/********
* sadd *
********/
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
index bc982c7ce6..77538d1c2d 100644
--- a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp
@@ -92,7 +92,7 @@ namespace xsimd
template <class A, class T>
inline batch<T, A> broadcast(T val, requires_arch<neon64>) noexcept
{
- return broadcast<neon64>(val, neon {});
+ return broadcast<A>(val, neon {});
}
template <class A>
@@ -952,6 +952,41 @@ namespace xsimd
/**********
* zip_lo *
**********/
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> // zip_lo overloads for 8/16/32-bit integers: one ZIP1 intrinsic per element type
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_u8(lhs, rhs); // interleave the low halves of lhs and rhs, unsigned 8-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_s8(lhs, rhs); // signed 8-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_u16(lhs, rhs); // unsigned 16-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_s16(lhs, rhs); // signed 16-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_u32(lhs, rhs); // unsigned 32-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip1q_s32(lhs, rhs); // signed 32-bit lanes
+ }
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> zip_lo(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
@@ -966,6 +1001,12 @@ namespace xsimd
}
template <class A>
+ inline batch<float, A> zip_lo(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept // zip_lo for float batches
+ {
+ return vzip1q_f32(lhs, rhs); // interleave the low halves of lhs and rhs, 32-bit float lanes
+ }
+
+ template <class A>
inline batch<double, A> zip_lo(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vzip1q_f64(lhs, rhs);
@@ -975,6 +1016,42 @@ namespace xsimd
* zip_hi *
**********/
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 1> = 0> // zip_hi overloads for 8/16/32-bit integers: one ZIP2 intrinsic per element type
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_u8(lhs, rhs); // interleave the high halves of lhs and rhs, unsigned 8-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 1> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_s8(lhs, rhs); // signed 8-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 2> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_u16(lhs, rhs); // unsigned 16-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 2> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_s16(lhs, rhs); // signed 16-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_unsigned_t<T, 4> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_u32(lhs, rhs); // unsigned 32-bit lanes
+ }
+
+ template <class A, class T, detail::enable_sized_signed_t<T, 4> = 0>
+ inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
+ {
+ return vzip2q_s32(lhs, rhs); // signed 32-bit lanes
+ }
+
template <class A, class T, detail::enable_sized_unsigned_t<T, 8> = 0>
inline batch<T, A> zip_hi(batch<T, A> const& lhs, batch<T, A> const& rhs, requires_arch<neon64>) noexcept
{
@@ -988,6 +1065,12 @@ namespace xsimd
}
template <class A>
+ inline batch<float, A> zip_hi(batch<float, A> const& lhs, batch<float, A> const& rhs, requires_arch<neon64>) noexcept // zip_hi for float batches
+ {
+ return vzip2q_f32(lhs, rhs); // interleave the high halves of lhs and rhs, 32-bit float lanes
+ }
+
+ template <class A>
inline batch<double, A> zip_hi(batch<double, A> const& lhs, batch<double, A> const& rhs, requires_arch<neon64>) noexcept
{
return vzip2q_f64(lhs, rhs);
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp
index 39bd607be9..1cde15ffe1 100644
--- a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp
@@ -142,6 +142,39 @@ namespace xsimd
return x + y;
}
+ template <class T, class Tp> // scalar avg: average of x and y, computed in their common type
+ inline typename std::common_type<T, Tp>::type avg(T const& x, Tp const& y) noexcept
+ {
+ using common_type = typename std::common_type<T, Tp>::type;
+ if (std::is_floating_point<common_type>::value) // NOTE(review): plain `if`, so the integer branches below are still instantiated for floating-point operands
+ return (x + y) / 2;
+ else if (std::is_unsigned<common_type>::value)
+ {
+ return (x & y) + ((x ^ y) >> 1); // shared bits plus half of the differing bits, never forms x + y
+ }
+ else
+ {
+ // Inspired by
+ // https://stackoverflow.com/questions/5697500/take-the-average-of-two-signed-numbers-in-c
+ auto t = (x & y) + ((x ^ y) >> 1);
+ auto t_u = static_cast<typename std::make_unsigned<common_type>::type>(t);
+ auto avg = t + (static_cast<common_type>(t_u >> (8 * sizeof(common_type) - 1)) & (x ^ y)); // sign bit taken at common_type width (not T) so mixed-width operands get a 0/1 correction
+ return avg;
+ }
+ }
+
+ template <class T, class Tp> // scalar avgr: avg(x, y), plus bit 0 of x ^ y for non-floating-point operands
+ inline typename std::common_type<T, Tp>::type avgr(T const& x, Tp const& y) noexcept
+ {
+ using common_type = typename std::common_type<T, Tp>::type;
+ if (std::is_floating_point<common_type>::value) // NOTE(review): plain `if`, so `x ^ y` below is still instantiated for floating-point operands - confirm such callers are not expected
+ return avg(x, y);
+ else
+ {
+ return avg(x, y) + ((x ^ y) & 1); // (x ^ y) & 1 is 1 exactly when x + y is odd
+ }
+ }
+
template <class T>
inline T incr(T const& x) noexcept
{
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp
index 0a34cb1e9b..d39cc201f9 100644
--- a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp
@@ -60,6 +60,10 @@ namespace xsimd
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
+ template <class A, class T> // forward declaration: generic avg, called as the fallback by the sse2 avg kernel in this file
+ inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
+ template <class A, class T> // forward declaration: generic avgr, called as the fallback by the sse2 avgr kernel in this file
+ inline batch<T, A> avgr(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept;
// abs
template <class A>
@@ -148,6 +152,44 @@ namespace xsimd
return _mm_movemask_epi8(self) != 0;
}
+ // avgr: SSE2 kernel for unsigned T; 8- and 16-bit lanes map to a single intrinsic
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ inline batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm_avg_epu8(self, other); // per-lane (a + b + 1) >> 1 on unsigned 8-bit lanes
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm_avg_epu16(self, other); // same operation on unsigned 16-bit lanes
+ }
+ else
+ {
+ return avgr(self, other, generic {}); // wider lanes: defer to the generic kernel
+ }
+ }
+
+ // avg: SSE2 kernel for unsigned T; 8- and 16-bit lanes are derived from avgr
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ inline batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<sse2>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto adj = ((self ^ other) << 7) >> 7; // bit 0 of self ^ other: moved to the top bit of the 8-bit lane and back
+ return avgr(self, other, A {}) - adj; // subtract the bit that avgr's rounding added when the sum is odd
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto adj = ((self ^ other) << 15) >> 15; // same extraction for 16-bit lanes
+ return avgr(self, other, A {}) - adj;
+ }
+ else
+ {
+ return avg(self, other, generic {}); // wider lanes: defer to the generic kernel
+ }
+ }
+
// batch_bool_cast
template <class A, class T_out, class T_in>
inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<sse2>) noexcept
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp
index 8160b2423b..ab9acdc8c3 100644
--- a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp
@@ -37,6 +37,8 @@ namespace xsimd
inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
template <class A, typename T, typename ITy, ITy... Indices>
inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept;
+ template <class A, class T> inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept; // fwd: generic fallback used by the wasm avg kernel
+ template <class A, class T> inline batch<T, A> avgr(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept; // fwd: generic fallback used by the wasm avgr kernel (was not declared before use)
// abs
template <class A, class T, typename std::enable_if<std::is_integral<T>::value && std::is_signed<T>::value, void>::type>
@@ -116,6 +118,44 @@ namespace xsimd
return wasm_f64x2_add(self, other);
}
+ // avgr: WebAssembly SIMD kernel for unsigned T; 8- and 16-bit lanes map to a single intrinsic
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ inline batch<T, A> avgr(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return wasm_u8x16_avgr(self, other); // native rounding average, 16 unsigned 8-bit lanes
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return wasm_u16x8_avgr(self, other); // native rounding average, 8 unsigned 16-bit lanes
+ }
+ else
+ {
+ return avgr(self, other, generic {}); // wider lanes: defer to the generic kernel
+ }
+ }
+
+ // avg: WebAssembly SIMD kernel for unsigned T; 8- and 16-bit lanes are derived from avgr
+ template <class A, class T, class = typename std::enable_if<std::is_unsigned<T>::value, void>::type>
+ inline batch<T, A> avg(batch<T, A> const& self, batch<T, A> const& other, requires_arch<wasm>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ auto adj = ((self ^ other) << 7) >> 7; // bit 0 of self ^ other: moved to the top bit of the 8-bit lane and back
+ return avgr(self, other, A {}) - adj; // subtract the bit that avgr's rounding added when the sum is odd
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ auto adj = ((self ^ other) << 15) >> 15; // same extraction for 16-bit lanes
+ return avgr(self, other, A {}) - adj;
+ }
+ else
+ {
+ return avg(self, other, generic {}); // wider lanes: defer to the generic kernel
+ }
+ }
+
// all
template <class A>
inline bool all(batch_bool<float, A> const& self, requires_arch<wasm>) noexcept
diff --git a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp
index 575459a009..ea48aa057d 100644
--- a/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp
+++ b/third_party/xsimd/include/xsimd/config/xsimd_arch.hpp
@@ -194,7 +194,7 @@ namespace xsimd
using all_sve_architectures = arch_list<detail::sve<512>, detail::sve<256>, detail::sve<128>>;
using all_rvv_architectures = arch_list<detail::rvv<512>, detail::rvv<256>, detail::rvv<128>>;
- using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<neon64, neon>>::type;
+ using all_arm_architectures = typename detail::join<all_sve_architectures, arch_list<i8mm<neon64>, neon64, neon>>::type;
using all_riscv_architectures = all_rvv_architectures;
using all_wasm_architectures = arch_list<wasm>;
using all_architectures = typename detail::join<all_riscv_architectures, all_wasm_architectures, all_arm_architectures, all_x86_architectures>::type;
diff --git a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp
index cf5163c37e..10a074dedb 100644
--- a/third_party/xsimd/include/xsimd/config/xsimd_config.hpp
+++ b/third_party/xsimd/include/xsimd/config/xsimd_config.hpp
@@ -352,6 +352,17 @@
/**
* @ingroup xsimd_config_macro
*
+ * Set to 1 if i8mm neon64 extension is available at compile-time, to 0 otherwise.
+ */
+#if defined(__ARM_FEATURE_MATMUL_INT8) // feature-test macro expected from the compiler when the Int8 matrix-multiply extension is enabled
+#define XSIMD_WITH_I8MM_NEON64 1
+#else
+#define XSIMD_WITH_I8MM_NEON64 0
+#endif // __ARM_FEATURE_MATMUL_INT8
+
+/**
+ * @ingroup xsimd_config_macro
+ *
* Set to 1 if SVE is available and bit width is pre-set at compile-time, to 0 otherwise.
*/
#if defined(__ARM_FEATURE_SVE) && defined(__ARM_FEATURE_SVE_BITS) && __ARM_FEATURE_SVE_BITS > 0
diff --git a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp
index 5c8b1f38d0..89b883a396 100644
--- a/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp
+++ b/third_party/xsimd/include/xsimd/config/xsimd_cpuid.hpp
@@ -18,6 +18,11 @@
#if defined(__linux__) && (defined(__ARM_NEON) || defined(_M_ARM) || defined(__riscv_vector))
#include <asm/hwcap.h>
#include <sys/auxv.h>
+
+#ifndef HWCAP2_I8MM // fallback for system headers that do not define the constant
+#define HWCAP2_I8MM (1 << 13) // I8MM bit in the AT_HWCAP2 auxiliary-vector word
+#endif // HWCAP2_I8MM
+
#endif
#if defined(_MSC_VER)
@@ -66,6 +71,7 @@ namespace xsimd
ARCH_FIELD_EX(avx512vnni<::xsimd::avx512vbmi>, avx512vnni_vbmi)
ARCH_FIELD(neon)
ARCH_FIELD(neon64)
+ ARCH_FIELD_EX(i8mm<::xsimd::neon64>, i8mm_neon64)
ARCH_FIELD(sve)
ARCH_FIELD(rvv)
ARCH_FIELD(wasm)
@@ -83,6 +89,9 @@ namespace xsimd
#if defined(__aarch64__) || defined(_M_ARM64)
neon = 1;
neon64 = 1;
+#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
+ i8mm_neon64 = bool(getauxval(AT_HWCAP2) & HWCAP2_I8MM);
+#endif
#elif defined(__ARM_NEON) || defined(_M_ARM)
#if defined(__linux__) && (!defined(__ANDROID_API__) || __ANDROID_API__ >= 18)
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp
index 4350ca0a28..6537157bc6 100644
--- a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp
+++ b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp
@@ -36,6 +36,8 @@
#include "xsimd_avx512dq_register.hpp"
#include "xsimd_avx512f_register.hpp"
+#include "xsimd_i8mm_neon64_register.hpp"
+
#include "xsimd_neon64_register.hpp"
#include "xsimd_neon_register.hpp"
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp
index 0420f0a09d..751e31d33a 100644
--- a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp
+++ b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp
@@ -203,6 +203,36 @@ namespace xsimd
}
/**
+ * @ingroup batch_math
+ *
+ * Computes the average of batches \c x and \c y
+ * @param x batch of T
+ * @param y batch of T
+ * @return the average of elements between \c x and \c y.
+ */
+ template <class T, class A> // public entry point: forwards to the avg kernel selected by architecture tag A
+ inline batch<T, A> avg(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ {
+ detail::static_check_supported_config<T, A>(); // presumably a compile-time check that batch<T, A> is a supported combination
+ return kernel::avg<A>(x, y, A {}); // the A {} tag drives kernel overload selection
+ }
+
+ /**
+ * @ingroup batch_math
+ *
+ * Computes the rounded average of batches \c x and \c y
+ * @param x batch of T
+ * @param y batch of T
+ * @return the rounded average of elements between \c x and \c y.
+ */
+ template <class T, class A> // public entry point: forwards to the avgr kernel selected by architecture tag A
+ inline batch<T, A> avgr(batch<T, A> const& x, batch<T, A> const& y) noexcept
+ {
+ detail::static_check_supported_config<T, A>(); // presumably a compile-time check that batch<T, A> is a supported combination
+ return kernel::avgr<A>(x, y, A {}); // the A {} tag drives kernel overload selection
+ }
+
+ /**
* @ingroup batch_conversion
*
* Perform a static_cast from \c T_in to \c T_out on \c \c x.
diff --git a/third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp
new file mode 100644
index 0000000000..fc0c884d0b
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/types/xsimd_i8mm_neon64_register.hpp
@@ -0,0 +1,46 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_I8MM_NEON64_REGISTER_HPP
+#define XSIMD_I8MM_NEON64_REGISTER_HPP
+
+#include "./xsimd_neon64_register.hpp"
+
+namespace xsimd
+{
+ template <typename arch>
+ struct i8mm;
+
+ /**
+ * @ingroup architectures
+ *
+ * Neon64 + i8mm instructions
+ */
+ template <> // architecture tag for neon64 with the i8mm extension; derives from neon64
+ struct i8mm<neon64> : neon64
+ {
+ static constexpr bool supported() noexcept { return XSIMD_WITH_I8MM_NEON64; } // compile-time availability flag
+ static constexpr bool available() noexcept { return true; } // NOTE(review): unconditionally true; no runtime probing happens in this struct
+ static constexpr unsigned version() noexcept { return generic::version(8, 2, 0); } // version triple (8, 2, 0) packed by generic::version
+ static constexpr char const* name() noexcept { return "i8mm+neon64"; } // architecture name string
+ };
+
+#if XSIMD_WITH_I8MM_NEON64 // register aliases are only declared when the extension is enabled at compile time
+ namespace types
+ {
+
+ XSIMD_DECLARE_SIMD_REGISTER_ALIAS(i8mm<neon64>, neon64); // presumably makes i8mm<neon64> reuse neon64's register types - see the macro definition
+
+ }
+#endif // XSIMD_WITH_I8MM_NEON64
+
+}
+#endif
diff --git a/third_party/xsimd/moz.yaml b/third_party/xsimd/moz.yaml
index 33b7a20dba..7bd3d2fd13 100644
--- a/third_party/xsimd/moz.yaml
+++ b/third_party/xsimd/moz.yaml
@@ -10,8 +10,8 @@ origin:
url: https://github.com/QuantStack/xsimd
- release: 2af7de3641ba38324375699ce261f20557c02dc9 (2024-01-09T11:30:28Z).
- revision: 2af7de3641ba38324375699ce261f20557c02dc9
+ release: ce58d62666c315140eb54042498d93114edbaa68 (2024-02-27T16:05:37Z).
+ revision: ce58d62666c315140eb54042498d93114edbaa68
license: BSD-3-Clause