author    | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-12 05:43:14 +0000
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-06-12 05:43:14 +0000
commit    | 8dd16259287f58f9273002717ec4d27e97127719 (patch)
tree      | 3863e62a53829a84037444beab3abd4ed9dfc7d0 /third_party/xsimd
parent    | Releasing progress-linux version 126.0.1-1~progress7.99u1. (diff)
Merging upstream version 127.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'third_party/xsimd')
22 files changed, 1227 insertions, 310 deletions
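Note: the bulk of the diff below is a mechanical API migration in the bundled xsimd. `batch_constant` and `batch_bool_constant` now take the scalar type and the architecture as separate template parameters (`batch_constant<T, A, Values...>` instead of `batch_constant<batch<T, A>, Values...>`), `make_batch_constant` and `make_batch_bool_constant` follow the same convention, and explicit casts of a constant mask to `batch<T, A>` are replaced by its `as_batch()` member. The sketch below contrasts the two spellings; it is a minimal illustration assuming an AVX2 build, and the function name, index values, and `xsimd::avx2` target are illustrative choices rather than code taken from this diff.

```cpp
#include <xsimd/xsimd.hpp>

// New spelling (after this merge): scalar type and architecture are separate
// template parameters of the compile-time index mask.
using mask_t = xsimd::batch_constant<uint32_t, xsimd::avx2, 3, 2, 1, 0, 7, 6, 5, 4>;

// Old spelling (before this merge), for comparison:
//   using mask_t = xsimd::batch_constant<xsimd::batch<uint32_t, xsimd::avx2>, 3, 2, 1, 0, 7, 6, 5, 4>;
//   auto m = (xsimd::batch<uint32_t, xsimd::avx2>)mask_t {};  // explicit cast to a runtime batch

// Reverse the four floats inside each 128-bit half of an AVX2 register.
xsimd::batch<float, xsimd::avx2> reverse_halves(xsimd::batch<float, xsimd::avx2> v)
{
    // swizzle accepts the constant mask directly; kernels that need the mask
    // as a runtime vector now call mask_t {}.as_batch() instead of casting.
    return xsimd::swizzle(v, mask_t {});
}
```

The merge also adds a new emulated backend (`xsimd_emulated.hpp`), whose `emulated<N>` kernels implement each operation with scalar loops over array storage.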
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp index 7bcc4da241..a7b8e2f90d 100644 --- a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_math.hpp @@ -2064,7 +2064,7 @@ namespace xsimd inline T reduce(Op op, batch<T, A> const& self, std::integral_constant<unsigned, Lvl>) noexcept { using index_type = as_unsigned_integer_t<T>; - batch<T, A> split = swizzle(self, make_batch_constant<batch<index_type, A>, split_high<index_type, Lvl / 2>>()); + batch<T, A> split = swizzle(self, make_batch_constant<index_type, A, split_high<index_type, Lvl / 2>>()); return reduce(op, op(split, self), std::integral_constant<unsigned, Lvl / 2>()); } } diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp index e9e9065832..f92f6b48b4 100644 --- a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp +++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp @@ -21,10 +21,10 @@ namespace xsimd { - template <class batch_type, typename batch_type::value_type... Values> + template <typename T, class A, T... Values> struct batch_constant; - template <class batch_type, bool... Values> + template <typename T, class A, bool... Values> struct batch_bool_constant; namespace kernel @@ -180,7 +180,7 @@ namespace xsimd } }; batch<T, A> tmp(val); - return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp); + return select(make_batch_bool_constant<T, A, index_mask>(), self, tmp); } // get @@ -295,7 +295,7 @@ namespace xsimd } }; - return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {}); + return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {}); } template <size_t N, class A, class T> @@ -316,7 +316,7 @@ namespace xsimd } }; - return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {}); + return swizzle(self, make_batch_constant<as_unsigned_integer_t<T>, A, rotate_generator>(), A {}); } template <size_t N, class A, class T> @@ -412,6 +412,12 @@ namespace xsimd return true; } + template <typename ITy> + constexpr bool is_zip_lo(size_t, ITy) + { + return false; + } + template <typename ITy0, typename ITy1, typename... ITys> constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices) { @@ -423,6 +429,12 @@ namespace xsimd return true; } + template <typename ITy> + constexpr bool is_zip_hi(size_t, ITy) + { + return false; + } + template <typename ITy0, typename ITy1, typename... ITys> constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices) { @@ -443,19 +455,19 @@ namespace xsimd } template <class A, typename T, typename ITy, ITy... Indices> - inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept + inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept { constexpr size_t bsize = sizeof...(Indices); // Detect common patterns XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...)) { - return swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? 
0 /* never happens */ : Indices)...>()); + return swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>()); } XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...)) { - return swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>()); + return swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>()); } XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...)) @@ -470,7 +482,7 @@ namespace xsimd XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...)) { - return select(batch_bool_constant<batch<T, A>, (Indices < bsize)...>(), x, y); + return select(batch_bool_constant<T, A, (Indices < bsize)...>(), x, y); } #if defined(__has_builtin) @@ -491,9 +503,9 @@ namespace xsimd #else // Use a generic_pattern. It is suboptimal but clang optimizes this // pretty well. - batch<T, A> x_lane = swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>()); - batch<T, A> y_lane = swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>()); - batch_bool_constant<batch<T, A>, (Indices < bsize)...> select_x_lane; + batch<T, A> x_lane = swizzle(x, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>()); + batch<T, A> y_lane = swizzle(y, batch_constant<ITy, A, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>()); + batch_bool_constant<T, A, (Indices < bsize)...> select_x_lane; return select(select_x_lane, x_lane, y_lane); #endif } @@ -530,7 +542,7 @@ namespace xsimd // swizzle template <class A, class T, class ITy, ITy... Vs> - inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept + inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<ITy, A, Vs...> mask, requires_arch<generic>) noexcept { return { swizzle(self.real(), mask), swizzle(self.imag(), mask) }; } diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp index 5ec1e02d48..66bcb45022 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp @@ -1161,22 +1161,22 @@ namespace xsimd return detail::merge_sse(res_low, res_hi); } template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept { return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {}); } template <class A, bool... 
Values> - inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept + inline batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept { - constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask(); + constexpr auto mask = batch_bool_constant<float, A, Values...>::mask(); return _mm256_blend_ps(false_br, true_br, mask); } template <class A, bool... Values> - inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept + inline batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept { - constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask(); + constexpr auto mask = batch_bool_constant<double, A, Values...>::mask(); return _mm256_blend_pd(false_br, true_br, mask); } @@ -1238,7 +1238,7 @@ namespace xsimd // shuffle template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7> - inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept + inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); // shuffle within lane @@ -1253,7 +1253,7 @@ namespace xsimd } template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3> - inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept + inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept { constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3); // shuffle within lane @@ -1504,7 +1504,7 @@ namespace xsimd // swizzle (constant mask) template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7> - inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept + inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept { // duplicate low and high part of input __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1)); @@ -1514,14 +1514,14 @@ namespace xsimd __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1); // normalize mask - batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask; + batch_constant<uint32_t, A, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask; // permute within each lane - __m256 r0 = _mm256_permutevar_ps(low_low, (batch<uint32_t, A>)half_mask); - __m256 r1 = _mm256_permutevar_ps(hi_hi, (batch<uint32_t, A>)half_mask); + __m256 r0 = _mm256_permutevar_ps(low_low, 
half_mask.as_batch()); + __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask.as_batch()); // mask to choose the right lane - batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask; + batch_bool_constant<uint32_t, A, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask; // blend the two permutes constexpr auto mask = blend_mask.mask(); @@ -1529,7 +1529,7 @@ namespace xsimd } template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3> - inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept + inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx>) noexcept { // duplicate low and high part of input __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1)); @@ -1539,14 +1539,14 @@ namespace xsimd __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1); // normalize mask - batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask; + batch_constant<uint64_t, A, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask; // permute within each lane - __m256d r0 = _mm256_permutevar_pd(low_low, (batch<uint64_t, A>)half_mask); - __m256d r1 = _mm256_permutevar_pd(hi_hi, (batch<uint64_t, A>)half_mask); + __m256d r0 = _mm256_permutevar_pd(low_low, half_mask.as_batch()); + __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask.as_batch()); // mask to choose the right lane - batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask; + batch_bool_constant<uint64_t, A, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask; // blend the two permutes constexpr auto mask = blend_mask.mask(); @@ -1564,7 +1564,7 @@ namespace xsimd uint32_t V7, detail::enable_sized_integral_t<T, 4> = 0> inline batch<T, A> swizzle(batch<T, A> const& self, - batch_constant<batch<uint32_t, A>, + batch_constant<uint32_t, A, V0, V1, V2, @@ -1588,7 +1588,7 @@ namespace xsimd detail::enable_sized_integral_t<T, 8> = 0> inline batch<T, A> swizzle(batch<T, A> const& self, - batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask, + batch_constant<uint64_t, A, V0, V1, V2, V3> const& mask, requires_arch<avx>) noexcept { return bitwise_cast<T>( diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp index a4881778e0..c052e0f269 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx2.hpp @@ -729,9 +729,9 @@ namespace xsimd } } template <class A, class T, bool... 
Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx2>) noexcept { - constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask(); + constexpr int mask = batch_bool_constant<T, A, Values...>::mask(); // FIXME: for some reason mask here is not considered as an immediate, // but it's okay for _mm256_blend_epi32 // case 2: return _mm256_blend_epi16(false_br, true_br, mask); @@ -912,36 +912,36 @@ namespace xsimd // swizzle (constant mask) template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7> - inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept + inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept { - return _mm256_permutevar8x32_ps(self, (batch<uint32_t, A>)mask); + return _mm256_permutevar8x32_ps(self, mask.as_batch()); } template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3> - inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept + inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept { constexpr auto mask = detail::shuffle(V0, V1, V2, V3); return _mm256_permute4x64_pd(self, mask); } template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3> - inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx2>) noexcept + inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3>, requires_arch<avx2>) noexcept { constexpr auto mask = detail::shuffle(V0, V1, V2, V3); return _mm256_permute4x64_epi64(self, mask); } template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3> - inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept + inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1, V2, V3> mask, requires_arch<avx2>) noexcept { return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, avx2 {})); } template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7> - inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept + inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept { - return _mm256_permutevar8x32_epi32(self, (batch<uint32_t, A>)mask); + return _mm256_permutevar8x32_epi32(self, mask.as_batch()); } template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7> - inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, 
batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept + inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<avx2>) noexcept { return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, avx2 {})); } diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp index 95fbcbd461..db72b73ca5 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512bw.hpp @@ -617,27 +617,27 @@ namespace xsimd // swizzle (static version) template <class A, uint16_t... Vs> - inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept + inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept { - return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {}); + return swizzle(self, mask.as_batch(), avx512bw {}); } template <class A, uint16_t... Vs> - inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept + inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept { - return swizzle(self, (batch<uint16_t, A>)mask, avx512bw {}); + return swizzle(self, mask.as_batch(), avx512bw {}); } template <class A, uint8_t... Vs> - inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept + inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept { - return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {}); + return swizzle(self, mask.as_batch(), avx512bw {}); } template <class A, uint8_t... 
Vs> - inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, Vs...> mask, requires_arch<avx512bw>) noexcept + inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, Vs...> mask, requires_arch<avx512bw>) noexcept { - return swizzle(self, (batch<uint8_t, A>)mask, avx512bw {}); + return swizzle(self, mask.as_batch(), avx512bw {}); } // zip_hi diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp index 7ee4610135..d94c681015 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx512f.hpp @@ -1422,8 +1422,8 @@ namespace xsimd template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type> inline T reduce_max(batch<T, A> const& self, requires_arch<avx512f>) noexcept { - constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask; - batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self); + constexpr batch_constant<uint64_t, A, 5, 6, 7, 8, 0, 0, 0, 0> mask; + batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self); batch<T, A> acc = max(self, step); __m256i low = _mm512_castsi512_si256(acc); return reduce_max(batch<T, avx2>(low)); @@ -1433,8 +1433,8 @@ namespace xsimd template <class A, class T, class _ = typename std::enable_if<(sizeof(T) == 1), void>::type> inline T reduce_min(batch<T, A> const& self, requires_arch<avx512f>) noexcept { - constexpr batch_constant<batch<uint64_t, A>, 5, 6, 7, 8, 0, 0, 0, 0> mask; - batch<T, A> step = _mm512_permutexvar_epi64((batch<uint64_t, A>)mask, self); + constexpr batch_constant<uint64_t, A, 5, 6, 7, 8, 0, 0, 0, 0> mask; + batch<T, A> step = _mm512_permutexvar_epi64(mask.as_batch(), self); batch<T, A> acc = min(self, step); __m256i low = _mm512_castsi512_si256(acc); return reduce_min(batch<T, avx2>(low)); @@ -1571,7 +1571,7 @@ namespace xsimd } template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx512f>) noexcept { return select(batch_bool<T, A> { Values... 
}, true_br, false_br, avx512f {}); } @@ -1709,7 +1709,7 @@ namespace xsimd // shuffle template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7, ITy I8, ITy I9, ITy I10, ITy I11, ITy I12, ITy I13, ITy I14, ITy I15> inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, - batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask, + batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7, I8, I9, I10, I11, I12, I13, I14, I15> mask, requires_arch<avx512f>) noexcept { constexpr uint32_t smask = (I0 & 0x3) | ((I1 & 0x3) << 2) | ((I2 & 0x3) << 4) | ((I3 & 0x3) << 6); @@ -1726,7 +1726,7 @@ namespace xsimd } template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7> - inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept + inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx512f>) noexcept { constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3) | ((I4 & 0x1) << 4) | ((I5 & 0x1) << 5) | ((I6 & 0x1) << 6) | ((I7 & 0x1) << 7); // shuffle within lane @@ -1917,39 +1917,39 @@ namespace xsimd // swizzle (constant version) template <class A, uint32_t... Vs> - inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept + inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept { - return swizzle(self, (batch<uint32_t, A>)mask, avx512f {}); + return swizzle(self, mask.as_batch(), avx512f {}); } template <class A, uint64_t... Vs> - inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept + inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept { - return swizzle(self, (batch<uint64_t, A>)mask, avx512f {}); + return swizzle(self, mask.as_batch(), avx512f {}); } template <class A, uint64_t... Vs> - inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept + inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept { - return swizzle(self, (batch<uint64_t, A>)mask, avx512f {}); + return swizzle(self, mask.as_batch(), avx512f {}); } template <class A, uint64_t... Vs> - inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept + inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, Vs...> mask, requires_arch<avx512f>) noexcept { - return swizzle(self, (batch<uint64_t, A>)mask, avx512f {}); + return swizzle(self, mask.as_batch(), avx512f {}); } template <class A, uint32_t... 
Vs> - inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept + inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept { - return swizzle(self, (batch<uint32_t, A>)mask, avx512f {}); + return swizzle(self, mask.as_batch(), avx512f {}); } template <class A, uint32_t... Vs> - inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept + inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, Vs...> mask, requires_arch<avx512f>) noexcept { - return swizzle(self, (batch<uint32_t, A>)mask, avx512f {}); + return swizzle(self, mask.as_batch(), avx512f {}); } namespace detail @@ -1973,14 +1973,14 @@ namespace xsimd uint16_t I24, uint16_t I25, uint16_t I26, uint16_t I27, uint16_t I28, uint16_t I29, uint16_t I30, uint16_t I31> struct fold_batch_constant { - using type = batch_constant<batch<uint32_t, A>, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2, + using type = batch_constant<uint32_t, A, I0 / 2, I2 / 2, I4 / 2, I6 / 2, I8 / 2, I10 / 2, I12 / 2, I14 / 2, I16 / 2, I18 / 2, I20 / 2, I22 / 2, I24 / 2, I26 / 2, I28 / 2, I30 / 2>; }; } template <class A, uint16_t... Idx, class _ = typename std::enable_if<detail::is_pair_of_contiguous_indices<uint16_t, A, Idx...>::value, void>::type> - inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, Idx...>, requires_arch<avx512f>) noexcept + inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, Idx...>, requires_arch<avx512f>) noexcept { constexpr typename detail::fold_batch_constant<A, Idx...>::type mask32; return _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self); @@ -1988,13 +1988,13 @@ namespace xsimd template <class A> inline batch<uint16_t, A> - swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept + swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, (uint16_t)1, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1, (uint16_t)0, (uint16_t)1>, requires_arch<avx512f>) noexcept { // FIXME: this sequence is very inefficient, but it's here to catch // a pattern generated by detail::reduce from xsimd_generic_math.hpp. // The whole pattern is actually decently folded by GCC and Clang, // so bare with it. 
- constexpr batch_constant<batch<uint32_t, A>, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32; + constexpr batch_constant<uint32_t, A, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0> mask32; auto tmp = _mm512_permutexvar_epi32(static_cast<batch<uint32_t, A>>(mask32), self); alignas(A::alignment()) uint16_t buffer[32]; @@ -2005,7 +2005,7 @@ namespace xsimd template <class A, uint16_t... Vs> inline batch<int16_t, A> - swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, Vs...> mask, requires_arch<avx512f>) noexcept + swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, Vs...> mask, requires_arch<avx512f>) noexcept { return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, avx512f {})); } diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_emulated.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_emulated.hpp new file mode 100644 index 0000000000..ac3dd4fef3 --- /dev/null +++ b/third_party/xsimd/include/xsimd/arch/xsimd_emulated.hpp @@ -0,0 +1,757 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. * + ****************************************************************************/ + +#ifndef XSIMD_EMULATED_HPP +#define XSIMD_EMULATED_HPP + +#include <complex> +#include <limits> +#include <numeric> +#include <type_traits> + +#include "../arch/xsimd_scalar.hpp" + +#include "../types/xsimd_emulated_register.hpp" +#include "../types/xsimd_utils.hpp" + +namespace xsimd +{ + template <typename T, class A, bool... Values> + struct batch_bool_constant; + + template <class T_out, class T_in, class A> + inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept; + + template <typename T, class A, T... Values> + struct batch_constant; + + namespace kernel + { + using namespace types; + + // fwd + template <class A, class T, size_t I> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept; + template <class A, typename T, typename ITy, ITy... Indices> + inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept; + + namespace detail + { + template <size_t I, class F, class... Bs> + auto emulated_apply(F func, Bs const&... bs) -> decltype(func(bs.data[I]...)) + { + return func(bs.data[I]...); + } + + template <class F, class B, class... Bs, size_t... Is> + auto emulated_apply(F func, ::xsimd::detail::index_sequence<Is...>, B const& b, Bs const&... bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size> + { + return { emulated_apply<Is>(func, b, bs...)... }; + } + + template <class B, class F, class... Bs> + auto emulated_apply(F func, B const& b, Bs const&... 
bs) -> std::array<decltype(func(b.data[0], bs.data[0]...)), B::size> + { + return emulated_apply(func, ::xsimd::detail::make_index_sequence<B::size>(), b, bs...); + } + } + + // abs + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> abs(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::abs(v); }, + self); + } + + // add + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::add(v0, v1); }, + self, other); + } + + // all + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline bool all(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return std::all_of(self.data.begin(), self.data.end(), [](T v) + { return bool(v); }); + } + + // any + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline bool any(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return std::any_of(self.data.begin(), self.data.end(), [](T v) + { return bool(v); }); + } + + // batch_bool_cast + template <class A, class T_out, class T_in, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size> + inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<emulated<N>>) noexcept + { + return { self.data }; + } + + // bitwise_and + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::bitwise_and(v0, v1); }, + self, other); + } + + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::bitwise_and(v0, v1); }, + self, other); + } + + // bitwise_andnot + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::bitwise_andnot(v0, v1); }, + self, other); + } + + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::bitwise_andnot(v0, v1); }, + self, other); + } + + // bitwise_lshift + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([other](T v) + { return xsimd::bitwise_lshift(v, other); }, + self); + } + + // bitwise_not + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::bitwise_not(v); }, + self); + } + + template <class A, class T, size_t 
N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](bool v) + { return xsimd::bitwise_not(v); }, + self); + } + + // bitwise_or + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::bitwise_or(v0, v1); }, + self, other); + } + + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::bitwise_or(v0, v1); }, + self, other); + } + + // bitwise_rshift + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([other](T v) + { return xsimd::bitwise_rshift(v, other); }, + self); + } + + // bitwise_xor + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::bitwise_xor(v0, v1); }, + self, other); + } + + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::bitwise_xor(v0, v1); }, + self, other); + } + + // bitwise_cast + template <class A, class T_in, class T_out, size_t N = 8 * sizeof(T_in) * batch<T_in, A>::size> + inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self, batch<T_out, A> const&, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T_out, A>::size; + std::array<T_out, size> result; + char* raw_data = reinterpret_cast<char*>(result.data()); + const char* raw_input = reinterpret_cast<const char*>(self.data.data()); + memcpy(raw_data, raw_input, size * sizeof(T_out)); + return result; + } + + // broadcast + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + batch<T, A> inline broadcast(T val, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> r; + std::fill(r.begin(), r.end(), val); + return r; + } + + // store_complex + namespace detail + { + // complex_low + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> complex_low(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> result; + for (size_t i = 0; i < size / 2; ++i) + { + result[2 * i] = self.real().data[i]; + result[1 + 2 * i] = self.imag().data[i]; + } + return result; + } + // complex_high + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> complex_high(batch<std::complex<T>, A> const& self, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> result; + for (size_t i = 0; i < size / 2; ++i) + { + result[2 * i] = 
self.real().data[i + size / 2]; + result[1 + 2 * i] = self.imag().data[i + size / 2]; + } + return result; + } + } + + // decr_if + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept + { + return self - batch<T, A>(mask.data); + } + + // div + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> div(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::div(v0, v1); }, + self, other); + } + + // fast_cast + namespace detail + { + template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size> + inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](int32_t v) + { return float(v); }, + self); + } + + template <class A, size_t N = 8 * sizeof(float) * batch<float, A>::size> + inline batch<float, A> fast_cast(batch<uint32_t, A> const& self, batch<float, A> const&, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](uint32_t v) + { return float(v); }, + self); + } + + template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size> + inline batch<double, A> fast_cast(batch<int64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](int64_t v) + { return double(v); }, + self); + } + + template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size> + inline batch<double, A> fast_cast(batch<uint64_t, A> const& self, batch<double, A> const&, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](uint64_t v) + { return double(v); }, + self); + } + + template <class A, size_t N = 8 * sizeof(int32_t) * batch<int32_t, A>::size> + inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](float v) + { return int32_t(v); }, + self); + } + + template <class A, size_t N = 8 * sizeof(double) * batch<double, A>::size> + inline batch<int64_t, A> fast_cast(batch<double, A> const& self, batch<int64_t, A> const&, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](double v) + { return int64_t(v); }, + self); + } + } + + // eq + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, emulated<N>> eq(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::eq(v0, v1); }, + self, other); + } + + template <class A, class T, size_t N = 8 * sizeof(T) * batch_bool<T, A>::size> + inline batch_bool<T, emulated<N>> eq(batch_bool<T, emulated<N>> const& self, batch_bool<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::eq(v0, v1); }, + self, other); + } + + // from_bool + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> from_bool(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](bool v) + { return T(v); }, + self); + } + + // from_mask + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline 
batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<bool, size> vmask; + for (size_t i = 0; i < size; ++i) + vmask[i] = (mask >> i) & 1u; + return vmask; + } + + // ge + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, emulated<N>> ge(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::ge(v0, v1); }, + self, other); + } + + // gt + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, emulated<N>> gt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::gt(v0, v1); }, + self, other); + } + + // haddp + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> haddp(batch<T, A> const* row, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> r; + for (size_t i = 0; i < size; ++i) + r[i] = std::accumulate(row[i].data.begin() + 1, row[i].data.end(), row[i].data.front()); + return r; + } + + // incr_if + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<emulated<N>>) noexcept + { + return self + batch<T, A>(mask.data); + } + + // insert + template <class A, class T, size_t I, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<emulated<N>>) noexcept + { + batch<T, A> other = self; + other.data[I] = val; + return other; + } + + // isnan + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size, class = typename std::enable_if<std::is_floating_point<T>::value, void>::type> + inline batch_bool<T, A> isnan(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::isnan(v); }, + self); + } + + // load_aligned + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> res; + std::copy(mem, mem + size, res.begin()); + return res; + } + + // load_unaligned + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> res; + std::copy(mem, mem + size, res.begin()); + return res; + } + + // load_complex + namespace detail + { + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<std::complex<T>, A> load_complex(batch<T, A> const& hi, batch<T, A> const& lo, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> real, imag; + for (size_t i = 0; i < size / 2; ++i) + { + real[i] = hi.data[2 * i]; + imag[i] = hi.data[1 + 2 * i]; + } + for (size_t i = 0; i < size / 2; ++i) + { + real[size / 2 + i] = lo.data[2 * i]; + imag[size / 2 + i] = lo.data[1 + 2 * i]; + } + return { real, imag }; + } + } + + // le + template <class A, class T, 
size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, emulated<N>> le(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::le(v0, v1); }, + self, other); + } + + // lt + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, emulated<N>> lt(batch<T, emulated<N>> const& self, batch<T, emulated<N>> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::lt(v0, v1); }, + self, other); + } + + // mask + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + uint64_t res = 0; + for (size_t i = 0; i < size; ++i) + res |= (self.data[i] ? 1u : 0u) << i; + return res; + } + + // max + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::max(v0, v1); }, + self, other); + } + + // min + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::min(v0, v1); }, + self, other); + } + + // mul + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> mul(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::mul(v0, v1); }, + self, other); + } + + // nearbyint_as_int + template <class A, typename T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<as_integer_t<T>, A> nearbyint_as_int(batch<T, A> const& self, + requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::nearbyint_as_int(v); }, + self); + } + + // neg + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> neg(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::neg(v); }, + self); + } + + // neq + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::neq(v0, v1); }, + self, other); + } + + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](bool v0, bool v1) + { return xsimd::neq(v0, v1); }, + self, other); + } + + // reduce_add + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline T reduce_add(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> buffer; + self.store_unaligned(buffer.data()); + return std::accumulate(buffer.begin() + 1, buffer.end(), *buffer.begin()); + } + + // reduce_max + template <class A, class T, size_t N = 8 * sizeof(T) 
* batch<T, A>::size> + inline T reduce_max(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y) + { return xsimd::max(x, y); }); + } + + // reduce_min + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline T reduce_min(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return std::accumulate(self.data.begin() + 1, self.data.end(), *self.data.begin(), [](T const& x, T const& y) + { return xsimd::min(x, y); }); + } + + // rsqrt + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> rsqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::rsqrt(v); }, + self); + } + + // select + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](bool c, T t, T f) + { return xsimd::select(c, t, f); }, + cond, true_br, false_br); + } + + template <class A, class T, bool... Values> + inline batch<T, A> select(batch_bool_constant<T, A, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + static_assert(sizeof...(Values) == size, "consistent init"); + return select((batch_bool<T, A>)cond, true_br, false_br, emulated<8 * sizeof(T) * size> {}); + } + + // shuffle + template <class A, typename T, class ITy, ITy... Is> + inline batch<T, A> shuffle(batch<T, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<batch<T, A>::size>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + batch<ITy, A> bmask = mask; + std::array<T, size> res; + for (size_t i = 0; i < size; ++i) + res[i] = bmask.data[i] < size ? 
x.data[bmask.data[i]] : y.data[bmask.data[i] - size]; + return res; + } + + // sqrt + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> sqrt(batch<T, A> const& self, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v) + { return xsimd::sqrt(v); }, + self); + } + + // slide_left + template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> result; + char* raw_data = reinterpret_cast<char*>(result.data()); + memset(raw_data, 0, M); + memcpy(raw_data + M, reinterpret_cast<const char*>(x.data.data()), sizeof(T) * result.size() - M); + return result; + } + + // slide_right + template <size_t M, class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + std::array<T, size> result; + char* raw_data = reinterpret_cast<char*>(result.data()); + memcpy(raw_data, reinterpret_cast<const char*>(x.data.data()) + M, sizeof(T) * result.size() - M); + memset(raw_data + sizeof(T) * result.size() - M, 0, M); + return result; + } + + // sadd + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::sadd(v0, v1); }, + self, other); + } + + // set + template <class A, class T, size_t N, class... Values> + inline batch<T, emulated<N>> set(batch<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init"); + return { typename batch<T, emulated<N>>::register_type { static_cast<T>(values)... } }; + } + + template <class A, class T, size_t N, class... Values> + inline batch_bool<T, emulated<N>> set(batch_bool<T, emulated<N>> const&, requires_arch<emulated<N>>, Values... values) noexcept + { + static_assert(sizeof...(Values) == batch<T, emulated<N>>::size, "consistent init"); + return { std::array<bool, sizeof...(Values)> { static_cast<bool>(values)... } }; + } + + // ssub + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::ssub(v0, v1); }, + self, other); + } + + // store_aligned + template <class A, class T, size_t N> + inline void store_aligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept + { + std::copy(self.data.begin(), self.data.end(), mem); + } + + // store_unaligned + template <class A, class T, size_t N> + inline void store_unaligned(T* mem, batch<T, emulated<N>> const& self, requires_arch<emulated<N>>) noexcept + { + std::copy(self.data.begin(), self.data.end(), mem); + } + + // sub + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + return detail::emulated_apply([](T v0, T v1) + { return xsimd::sub(v0, v1); }, + self, other); + } + + // swizzle + + template <class A, typename T, class ITy, ITy... 
Is> + inline batch<T, A> swizzle(batch<T, A> const& self, batch_constant<ITy, A, Is...> mask, requires_arch<emulated<8 * sizeof(T) * batch<T, A>::size>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + batch<ITy, A> bmask = mask; + std::array<T, size> res; + for (size_t i = 0; i < size; ++i) + res[i] = self.data[bmask.data[i]]; + return res; + } + + // zip_hi + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + // Note: irregular behavior for odd numbers. + std::array<T, size> res; + if (size % 2) + { + for (size_t i = 0; i < size; ++i) + res[i] = (i % 2 ? self : other).data[size / 2 + i / 2]; + } + else + { + for (size_t i = 0; i < size; ++i) + res[i] = (i % 2 ? other : self).data[size / 2 + i / 2]; + } + return res; + } + + // zip_lo + template <class A, class T, size_t N = 8 * sizeof(T) * batch<T, A>::size> + inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<emulated<N>>) noexcept + { + constexpr size_t size = batch<T, A>::size; + // Note: irregular behavior for odd numbers. + std::array<T, size> res; + for (size_t i = 0; i < size; ++i) + res[i] = (i % 2 ? other : self).data[i / 2]; + return res; + } + } +} + +#endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp index dcd2df3fa9..5b714b2991 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_isa.hpp @@ -16,6 +16,10 @@ #include "./xsimd_generic_fwd.hpp" +#if XSIMD_WITH_EMULATED +#include "./xsimd_emulated.hpp" +#endif + #if XSIMD_WITH_SSE2 #include "./xsimd_sse2.hpp" #endif diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp index 3510eb21d9..b0edae8633 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon.hpp @@ -146,7 +146,7 @@ inline float32x4_t vreinterpretq_f32_f32(float32x4_t arg) noexcept { return arg; namespace xsimd { - template <class batch_type, bool... Values> + template <typename T, class A, bool... Values> struct batch_bool_constant; namespace kernel @@ -1743,7 +1743,7 @@ namespace xsimd } template <class A, class T, bool... b, detail::enable_neon_type_t<T> = 0> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<neon>) noexcept { return select(batch_bool<T, A> { b... }, true_br, false_br, neon {}); } @@ -2717,7 +2717,7 @@ namespace xsimd } } - template <class batch_type, typename batch_type::value_type... Values> + template <typename T, class A, T... Values> struct batch_constant; namespace kernel @@ -2728,7 +2728,7 @@ namespace xsimd template <class A, class T, class I, I... 
idx> inline batch<T, A> swizzle(batch<T, A> const& self, - batch_constant<batch<I, A>, idx...>, + batch_constant<I, A, idx...>, requires_arch<neon>) noexcept { static_assert(batch<T, A>::size == sizeof...(idx), "valid swizzle indices"); diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp index 77538d1c2d..2469b14f37 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_neon64.hpp @@ -21,7 +21,7 @@ namespace xsimd { - template <class batch_type, bool... Values> + template <typename T, class A, bool... Values> struct batch_bool_constant; namespace kernel @@ -942,7 +942,7 @@ namespace xsimd } template <class A, bool... b> - inline batch<double, A> select(batch_bool_constant<batch<double, A>, b...> const&, + inline batch<double, A> select(batch_bool_constant<double, A, b...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<neon64>) noexcept @@ -1243,7 +1243,7 @@ namespace xsimd } } - template <class batch_type, typename batch_type::value_type... Values> + template <typename T, class A, T... Values> struct batch_constant; namespace kernel @@ -1354,42 +1354,40 @@ namespace xsimd template <class CB1, class CB2, class IS> struct index_burst_impl; - template <class B1, class B2, typename B2::value_type... V, - typename B2::value_type... incr> - struct index_burst_impl<batch_constant<B1>, batch_constant<B2, V...>, - integer_sequence<typename B2::value_type, incr...>> + template <typename T1, class A, typename T2, T2... V, + T2... incr> + struct index_burst_impl<batch_constant<T1, A>, batch_constant<T2, A, V...>, + integer_sequence<T2, incr...>> { - using type = batch_constant<B2, V...>; + using type = batch_constant<T2, A, V...>; }; - template <class B1, typename B1::value_type V0, typename B1::value_type... V1, - class B2, typename B2::value_type... V2, - typename B2::value_type... incr> - struct index_burst_impl<batch_constant<B1, V0, V1...>, batch_constant<B2, V2...>, - integer_sequence<typename B2::value_type, incr...>> + template <typename T1, class A, T1 V0, T1... V1, + typename T2, T2... V2, T2... incr> + struct index_burst_impl<batch_constant<T1, A, V0, V1...>, batch_constant<T2, A, V2...>, + integer_sequence<T2, incr...>> { - using value_type = typename B2::value_type; - using next_input = batch_constant<B1, V1...>; - using next_output = batch_constant<B2, V2..., (V0 + incr)...>; - using type = typename index_burst_impl<next_input, next_output, integer_sequence<value_type, incr...>>::type; + using next_input = batch_constant<T1, A, V1...>; + using next_output = batch_constant<T2, A, V2..., (V0 + incr)...>; + using type = typename index_burst_impl<next_input, next_output, integer_sequence<T2, incr...>>::type; }; template <class B, class T> struct index_burst; - template <class B, typename B::value_type... V, class T> - struct index_burst<batch_constant<B, V...>, T> + template <typename Tp, class A, Tp... 
V, typename T> + struct index_burst<batch_constant<Tp, A, V...>, T> { - static constexpr size_t mul = sizeof(typename B::value_type) / sizeof(T); - using input = batch_constant<B, (mul * V)...>; - using output = batch_constant<batch<T, typename B::arch_type>>; + static constexpr size_t mul = sizeof(Tp) / sizeof(T); + using input = batch_constant<Tp, A, (mul * V)...>; + using output = batch_constant<T, A>; using type = typename index_burst_impl<input, output, make_integer_sequence<T, mul>>::type; }; - template <class B, class T> + template <class B, typename T> using index_burst_t = typename index_burst<B, T>::type; - template <class T, class B> + template <typename T, class B> inline index_burst_t<B, T> burst_index(B) { return index_burst_t<B, T>(); @@ -1399,7 +1397,7 @@ namespace xsimd template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, - batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx, + batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx, requires_arch<neon64>) noexcept { return vqtbl1q_u8(self, batch<uint8_t, A>(idx)); @@ -1408,7 +1406,7 @@ namespace xsimd template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, - batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx, + batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> idx, requires_arch<neon64>) noexcept { return vqtbl1q_s8(self, batch<uint8_t, A>(idx)); @@ -1416,7 +1414,7 @@ namespace xsimd template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, - batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx, + batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx, requires_arch<neon64>) noexcept { using batch_type = batch<uint8_t, A>; @@ -1425,7 +1423,7 @@ namespace xsimd template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, - batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> idx, + batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> idx, requires_arch<neon64>) noexcept { using batch_type = batch<int8_t, A>; @@ -1434,7 +1432,7 @@ namespace xsimd template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, - batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, + batch_constant<uint32_t, A, V0, V1, V2, V3> idx, requires_arch<neon64>) noexcept { using batch_type = batch<uint8_t, A>; @@ -1443,7 +1441,7 @@ namespace xsimd template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, - batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, + batch_constant<uint32_t, A, V0, V1, V2, V3> idx, requires_arch<neon64>) noexcept { using 
batch_type = batch<int8_t, A>; @@ -1452,7 +1450,7 @@ namespace xsimd template <class A, uint64_t V0, uint64_t V1> inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, - batch_constant<batch<uint64_t, A>, V0, V1> idx, + batch_constant<uint64_t, A, V0, V1> idx, requires_arch<neon64>) noexcept { using batch_type = batch<uint8_t, A>; @@ -1461,7 +1459,7 @@ namespace xsimd template <class A, uint64_t V0, uint64_t V1> inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, - batch_constant<batch<uint64_t, A>, V0, V1> idx, + batch_constant<uint64_t, A, V0, V1> idx, requires_arch<neon64>) noexcept { using batch_type = batch<int8_t, A>; @@ -1470,7 +1468,7 @@ namespace xsimd template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> inline batch<float, A> swizzle(batch<float, A> const& self, - batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, + batch_constant<uint32_t, A, V0, V1, V2, V3> idx, requires_arch<neon64>) noexcept { using batch_type = batch<uint8_t, A>; @@ -1479,7 +1477,7 @@ namespace xsimd template <class A, uint64_t V0, uint64_t V1> inline batch<double, A> swizzle(batch<double, A> const& self, - batch_constant<batch<uint64_t, A>, V0, V1> idx, + batch_constant<uint64_t, A, V0, V1> idx, requires_arch<neon64>) noexcept { using batch_type = batch<uint8_t, A>; @@ -1488,7 +1486,7 @@ namespace xsimd template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> inline batch<std::complex<float>, A> swizzle(batch<std::complex<float>, A> const& self, - batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> idx, + batch_constant<uint32_t, A, V0, V1, V2, V3> idx, requires_arch<neon64>) noexcept { return batch<std::complex<float>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); @@ -1496,7 +1494,7 @@ namespace xsimd template <class A, uint64_t V0, uint64_t V1> inline batch<std::complex<double>, A> swizzle(batch<std::complex<double>, A> const& self, - batch_constant<batch<uint64_t, A>, V0, V1> idx, + batch_constant<uint64_t, A, V0, V1> idx, requires_arch<neon64>) noexcept { return batch<std::complex<double>>(swizzle(self.real(), idx, A()), swizzle(self.imag(), idx, A())); diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp index 98d1de9ce3..2b8cebe5c9 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_rvv.hpp @@ -284,7 +284,7 @@ namespace xsimd { - template <class batch_type, typename batch_type::value_type... Values> + template <typename T, class A, T... Values> struct batch_constant; namespace kernel @@ -1150,7 +1150,7 @@ namespace xsimd // swizzle template <class A, class T, class I, I... idx> - inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...>, requires_arch<rvv>) noexcept + inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...>, requires_arch<rvv>) noexcept { static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices"); const batch<I, A> indices { idx... }; @@ -1159,11 +1159,11 @@ namespace xsimd template <class A, class T, class I, I... 
idx> inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, - batch_constant<batch<I, A>, idx...>, + batch_constant<I, A, idx...>, requires_arch<rvv>) noexcept { - const auto real = swizzle(self.real(), batch_constant<batch<I, A>, idx...> {}, rvv {}); - const auto imag = swizzle(self.imag(), batch_constant<batch<I, A>, idx...> {}, rvv {}); + const auto real = swizzle(self.real(), batch_constant<I, A, idx...> {}, rvv {}); + const auto imag = swizzle(self.imag(), batch_constant<I, A, idx...> {}, rvv {}); return batch<std::complex<T>>(real, imag); } @@ -1188,7 +1188,7 @@ namespace xsimd } template <class A, class T, bool... b> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<rvv>) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<rvv>) noexcept { return select(batch_bool<T, A> { b... }, true_br, false_br, rvv {}); } diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp index 1cde15ffe1..38b9f841df 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_scalar.hpp @@ -86,6 +86,57 @@ namespace xsimd using std::tgamma; using std::trunc; + inline signed char abs(signed char v) + { + return v < 0 ? -v : v; + } + + namespace detail + { + // Use templated type here to prevent automatic instantiation that may + // ends up in a warning + template <typename char_type> + inline char abs(char_type v, std::true_type) + { + return v; + } + template <typename char_type> + inline char abs(char_type v, std::false_type) + { + return v < 0 ? -v : v; + } + } + + inline char abs(char v) + { + return detail::abs(v, std::is_unsigned<char>::type {}); + } + + inline short abs(short v) + { + return v < 0 ? 
-v : v; + } + inline unsigned char abs(unsigned char v) + { + return v; + } + inline unsigned short abs(unsigned short v) + { + return v; + } + inline unsigned int abs(unsigned int v) + { + return v; + } + inline unsigned long abs(unsigned long v) + { + return v; + } + inline unsigned long long abs(unsigned long long v) + { + return v; + } + #ifndef _WIN32 using std::isfinite; using std::isinf; @@ -137,7 +188,7 @@ namespace xsimd #endif template <class T, class Tp> - inline auto add(T const& x, Tp const& y) noexcept -> decltype(x + y) + inline typename std::common_type<T, Tp>::type add(T const& x, Tp const& y) noexcept { return x + y; } @@ -209,52 +260,32 @@ namespace xsimd return x & y; } - inline float bitwise_and(float x, float y) noexcept - { - uint32_t ix, iy; - std::memcpy((void*)&ix, (void*)&x, sizeof(float)); - std::memcpy((void*)&iy, (void*)&y, sizeof(float)); - uint32_t ir = bitwise_and(ix, iy); - float r; - std::memcpy((void*)&r, (void*)&ir, sizeof(float)); - return r; - } - - inline double bitwise_and(double x, double y) noexcept + template <class T_out, class T_in> + inline T_out bitwise_cast(T_in x) noexcept { - uint64_t ix, iy; - std::memcpy((void*)&ix, (void*)&x, sizeof(double)); - std::memcpy((void*)&iy, (void*)&y, sizeof(double)); - uint64_t ir = bitwise_and(ix, iy); - double r; - std::memcpy((void*)&r, (void*)&ir, sizeof(double)); + static_assert(sizeof(T_in) == sizeof(T_out), "bitwise_cast between types of the same size"); + T_out r; + std::memcpy((void*)&r, (void*)&x, sizeof(T_in)); return r; } - template <class T> - inline typename std::enable_if<std::is_integral<T>::value, T>::type - bitwise_andnot(T x, T y) noexcept - { - return x & ~y; - } - - inline float bitwise_andnot(float x, float y) noexcept + inline float bitwise_and(float x, float y) noexcept { uint32_t ix, iy; std::memcpy((void*)&ix, (void*)&x, sizeof(float)); std::memcpy((void*)&iy, (void*)&y, sizeof(float)); - uint32_t ir = bitwise_andnot(ix, iy); + uint32_t ir = bitwise_and(ix, iy); float r; std::memcpy((void*)&r, (void*)&ir, sizeof(float)); return r; } - inline double bitwise_andnot(double x, double y) noexcept + inline double bitwise_and(double x, double y) noexcept { uint64_t ix, iy; std::memcpy((void*)&ix, (void*)&x, sizeof(double)); std::memcpy((void*)&iy, (void*)&y, sizeof(double)); - uint64_t ir = bitwise_andnot(ix, iy); + uint64_t ir = bitwise_and(ix, iy); double r; std::memcpy((void*)&r, (void*)&ir, sizeof(double)); return r; @@ -281,6 +312,11 @@ namespace xsimd return ~x; } + inline bool bitwise_not(bool x) noexcept + { + return !x; + } + inline float bitwise_not(float x) noexcept { uint32_t ix; @@ -302,6 +338,12 @@ namespace xsimd } template <class T> + inline typename std::enable_if<std::is_scalar<T>::value, T>::type bitwise_andnot(T x, T y) noexcept + { + return bitwise_and(x, bitwise_not(y)); + } + + template <class T> inline typename std::enable_if<std::is_integral<T>::value, T>::type bitwise_or(T x, T y) noexcept { @@ -360,7 +402,7 @@ namespace xsimd } template <class T, class Tp> - inline auto div(T const& x, Tp const& y) noexcept -> decltype(x / y) + inline typename std::common_type<T, Tp>::type div(T const& x, Tp const& y) noexcept { return x / y; } @@ -372,13 +414,13 @@ namespace xsimd } template <class T, class Tp> - inline auto mul(T const& x, Tp const& y) noexcept -> decltype(x * y) + inline typename std::common_type<T, Tp>::type mul(T const& x, Tp const& y) noexcept { return x * y; } template <class T> - inline auto neg(T const& x) noexcept -> decltype(-x) + inline T neg(T const& 
x) noexcept { return -x; } @@ -776,9 +818,9 @@ namespace xsimd } template <class T, class = typename std::enable_if<std::is_scalar<T>::value>::type> - inline bool bitofsign(T const& x) noexcept + inline T bitofsign(T const& x) noexcept { - return x < T(0); + return T(x < T(0)); } template <class T> @@ -842,7 +884,7 @@ namespace xsimd } template <class T, class Tp> - inline auto sub(T const& x, Tp const& y) noexcept -> decltype(x - y) + inline typename std::common_type<T, Tp>::type sub(T const& x, Tp const& y) noexcept { return x - y; } diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp index d39cc201f9..d733a8c362 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse2.hpp @@ -20,13 +20,13 @@ namespace xsimd { - template <class batch_type, bool... Values> + template <typename T, class A, bool... Values> struct batch_bool_constant; template <class T_out, class T_in, class A> inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept; - template <class batch_type, typename batch_type::value_type... Values> + template <typename T, class A, T... Values> struct batch_constant; namespace kernel @@ -59,7 +59,7 @@ namespace xsimd template <class A, class T, size_t I> inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept; template <class A, typename T, typename ITy, ITy... Indices> - inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept; + inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept; template <class A, class T> inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept; template <class A, class T> @@ -1216,6 +1216,43 @@ namespace xsimd return _mm_cvtss_f32(tmp1); } + template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type> + inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept + { + XSIMD_IF_CONSTEXPR(sizeof(T) == 4) + { + __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); + __m128i tmp2 = _mm_add_epi32(self, tmp1); + __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); + __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); + return _mm_cvtsi128_si32(tmp4); + } + else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) + { + __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); + __m128i tmp2 = _mm_add_epi64(self, tmp1); +#if defined(__x86_64__) + return _mm_cvtsi128_si64(tmp2); +#else + __m128i m; + _mm_storel_epi64(&m, tmp2); + int64_t i; + std::memcpy(&i, &m, sizeof(i)); + return i; +#endif + } + else + { + return hadd(self, generic {}); + } + } + + template <class A> + inline double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept + { + return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); + } + // reduce_max template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type> inline T reduce_max(batch<T, A> const& self, requires_arch<sse2>) noexcept @@ -1260,42 +1297,6 @@ namespace xsimd return acc3.get(0); } - template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type> - inline T reduce_add(batch<T, A> const& self, requires_arch<sse2>) noexcept - { - XSIMD_IF_CONSTEXPR(sizeof(T) == 4) - { - __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); - __m128i tmp2 = 
_mm_add_epi32(self, tmp1); - __m128i tmp3 = _mm_shuffle_epi32(tmp2, 0x01); - __m128i tmp4 = _mm_add_epi32(tmp2, tmp3); - return _mm_cvtsi128_si32(tmp4); - } - else XSIMD_IF_CONSTEXPR(sizeof(T) == 8) - { - __m128i tmp1 = _mm_shuffle_epi32(self, 0x0E); - __m128i tmp2 = _mm_add_epi64(self, tmp1); -#if defined(__x86_64__) - return _mm_cvtsi128_si64(tmp2); -#else - __m128i m; - _mm_storel_epi64(&m, tmp2); - int64_t i; - std::memcpy(&i, &m, sizeof(i)); - return i; -#endif - } - else - { - return hadd(self, generic {}); - } - } - template <class A> - inline double reduce_add(batch<double, A> const& self, requires_arch<sse2>) noexcept - { - return _mm_cvtsd_f64(_mm_add_sd(self, _mm_unpackhi_pd(self, self))); - } - // rsqrt template <class A> inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<sse2>) noexcept @@ -1321,7 +1322,7 @@ namespace xsimd return _mm_or_si128(_mm_and_si128(cond, true_br), _mm_andnot_si128(cond, false_br)); } template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse2>) noexcept { return select(batch_bool<T, A> { Values... }, true_br, false_br, sse2 {}); } @@ -1333,7 +1334,7 @@ namespace xsimd // shuffle template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3> - inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept + inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3> mask, requires_arch<sse2>) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3); // shuffle within lane @@ -1347,7 +1348,7 @@ namespace xsimd } template <class A, class ITy, ITy I0, ITy I1> - inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1> mask, requires_arch<sse2>) noexcept + inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1> mask, requires_arch<sse2>) noexcept { constexpr uint32_t smask = detail::mod_shuffle(I0, I1); // shuffle within lane @@ -1600,41 +1601,41 @@ namespace xsimd // swizzle template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> - inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept + inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); return _mm_shuffle_ps(self, self, index); } template <class A, uint64_t V0, uint64_t V1> - inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<sse2>) noexcept + inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1); return _mm_shuffle_pd(self, self, index); } template <class A, uint64_t V0, uint64_t V1> - inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, 
A>, V0, V1>, requires_arch<sse2>) noexcept + inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<sse2>) noexcept { constexpr uint32_t index = detail::shuffle(2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1); return _mm_shuffle_epi32(self, index); } template <class A, uint64_t V0, uint64_t V1> - inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<sse2>) noexcept + inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<sse2>) noexcept { return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), mask, sse2 {})); } template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> - inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<sse2>) noexcept + inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<sse2>) noexcept { constexpr uint32_t index = detail::shuffle(V0, V1, V2, V3); return _mm_shuffle_epi32(self, index); } template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> - inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept + inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<sse2>) noexcept { return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, sse2 {})); } diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp index 165a191e42..18cfe38cef 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse4_1.hpp @@ -284,9 +284,9 @@ namespace xsimd } template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sse4_1>) noexcept { - constexpr int mask = batch_bool_constant<batch<T, A>, Values...>::mask(); + constexpr int mask = batch_bool_constant<T, A, Values...>::mask(); XSIMD_IF_CONSTEXPR(sizeof(T) == 2) { return _mm_blend_epi16(false_br, true_br, mask); @@ -304,19 +304,19 @@ namespace xsimd } else { - return select(batch_bool_constant<batch<T, A>, Values...>(), true_br, false_br, ssse3 {}); + return select(batch_bool_constant<T, A, Values...>(), true_br, false_br, ssse3 {}); } } template <class A, bool... Values> - inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept + inline batch<float, A> select(batch_bool_constant<float, A, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<sse4_1>) noexcept { - constexpr int mask = batch_bool_constant<batch<float, A>, Values...>::mask(); + constexpr int mask = batch_bool_constant<float, A, Values...>::mask(); return _mm_blend_ps(false_br, true_br, mask); } template <class A, bool... 
Values> - inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept + inline batch<double, A> select(batch_bool_constant<double, A, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<sse4_1>) noexcept { - constexpr int mask = batch_bool_constant<batch<double, A>, Values...>::mask(); + constexpr int mask = batch_bool_constant<double, A, Values...>::mask(); return _mm_blend_pd(false_br, true_br, mask); } diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp index b6ea119213..d4c0b171cb 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_ssse3.hpp @@ -140,32 +140,32 @@ namespace xsimd // swizzle (constant mask) template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> - inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept + inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<ssse3>) noexcept { - constexpr batch_constant<batch<uint8_t, A>, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1, + constexpr batch_constant<uint8_t, A, 2 * V0, 2 * V0 + 1, 2 * V1, 2 * V1 + 1, 2 * V2, 2 * V2 + 1, 2 * V3, 2 * V3 + 1, 2 * V4, 2 * V4 + 1, 2 * V5, 2 * V5 + 1, 2 * V6, 2 * V6 + 1, 2 * V7, 2 * V7 + 1> mask8; - return _mm_shuffle_epi8(self, (batch<uint8_t, A>)mask8); + return _mm_shuffle_epi8(self, mask8.as_batch()); } template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> - inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept + inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<ssse3>) noexcept { return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, ssse3 {})); } template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> - inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept + inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept { - return swizzle(self, (batch<uint8_t, A>)mask, ssse3 {}); + return swizzle(self, mask.as_batch(), ssse3 {}); } template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> - inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept + inline batch<int8_t, A> 
swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<ssse3>) noexcept { - return swizzle(self, (batch<uint8_t, A>)mask, ssse3 {}); + return swizzle(self, mask.as_batch(), ssse3 {}); } } diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp index 3177c97b28..553f026cc3 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_sve.hpp @@ -20,7 +20,7 @@ namespace xsimd { - template <class batch_type, typename batch_type::value_type... Values> + template <typename T, class A, T... Values> struct batch_constant; namespace kernel @@ -739,19 +739,19 @@ namespace xsimd // swizzle (static) template <class A, class T, class I, I... idx> - inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<batch<I, A>, idx...> indices, requires_arch<sve>) noexcept + inline batch<T, A> swizzle(batch<T, A> const& arg, batch_constant<I, A, idx...> indices, requires_arch<sve>) noexcept { static_assert(batch<T, A>::size == sizeof...(idx), "invalid swizzle indices"); - return swizzle(arg, (batch<I, A>)indices, sve {}); + return swizzle(arg, indices.as_batch(), sve {}); } template <class A, class T, class I, I... idx> inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& arg, - batch_constant<batch<I, A>, idx...> indices, + batch_constant<I, A, idx...> indices, requires_arch<sve>) noexcept { static_assert(batch<std::complex<T>, A>::size == sizeof...(idx), "invalid swizzle indices"); - return swizzle(arg, (batch<I, A>)indices, sve {}); + return swizzle(arg, indices.as_batch(), sve {}); } /************* @@ -811,7 +811,7 @@ namespace xsimd } template <class A, class T, bool... b> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, b...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<sve>) noexcept { return select(batch_bool<T, A> { b... }, true_br, false_br, sve {}); } diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp index ab9acdc8c3..050b1a08fc 100644 --- a/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp +++ b/third_party/xsimd/include/xsimd/arch/xsimd_wasm.hpp @@ -19,13 +19,13 @@ namespace xsimd { - template <class batch_type, bool... Values> + template <typename T, class A, bool... Values> struct batch_bool_constant; template <class T_out, class T_in, class A> inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& x) noexcept; - template <class batch_type, typename batch_type::value_type... Values> + template <typename T, class A, T... Values> struct batch_constant; namespace kernel @@ -36,7 +36,7 @@ namespace xsimd template <class A, class T, size_t I> inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept; template <class A, typename T, typename ITy, ITy... 
Indices> - inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept; + inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<ITy, A, Indices...>, requires_arch<generic>) noexcept; template <class A, class T> inline batch<T, A> avg(batch<T, A> const&, batch<T, A> const&, requires_arch<generic>) noexcept; @@ -1275,7 +1275,7 @@ namespace xsimd return wasm_v128_or(wasm_v128_and(cond, true_br), wasm_v128_andnot(false_br, cond)); } template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<wasm>) noexcept { return select(batch_bool<T, A> { Values... }, true_br, false_br, wasm {}); } @@ -1287,13 +1287,13 @@ namespace xsimd // shuffle template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3> - inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3>, requires_arch<wasm>) noexcept + inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<ITy, A, I0, I1, I2, I3>, requires_arch<wasm>) noexcept { return wasm_i32x4_shuffle(x, y, I0, I1, I2, I3); } template <class A, class ITy, ITy I0, ITy I1> - inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1>, requires_arch<wasm>) noexcept + inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<ITy, A, I0, I1>, requires_arch<wasm>) noexcept { return wasm_i64x2_shuffle(x, y, I0, I1); } @@ -1515,63 +1515,63 @@ namespace xsimd // swizzle template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> - inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept + inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept { return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); } template <class A, uint64_t V0, uint64_t V1> - inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept + inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept { return wasm_i64x2_shuffle(self, self, V0, V1); } template <class A, uint64_t V0, uint64_t V1> - inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1>, requires_arch<wasm>) noexcept + inline batch<uint64_t, A> swizzle(batch<uint64_t, A> const& self, batch_constant<uint64_t, A, V0, V1>, requires_arch<wasm>) noexcept { return wasm_i64x2_shuffle(self, self, V0, V1); } template <class A, uint64_t V0, uint64_t V1> - inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1> mask, requires_arch<wasm>) noexcept + inline batch<int64_t, A> swizzle(batch<int64_t, A> const& self, batch_constant<uint64_t, A, V0, V1> mask, requires_arch<wasm>) noexcept { return bitwise_cast<int64_t>(swizzle(bitwise_cast<uint64_t>(self), 
mask, wasm {})); } template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> - inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3>, requires_arch<wasm>) noexcept + inline batch<uint32_t, A> swizzle(batch<uint32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3>, requires_arch<wasm>) noexcept { return wasm_i32x4_shuffle(self, self, V0, V1, V2, V3); } template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3> - inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept + inline batch<int32_t, A> swizzle(batch<int32_t, A> const& self, batch_constant<uint32_t, A, V0, V1, V2, V3> mask, requires_arch<wasm>) noexcept { return bitwise_cast<int32_t>(swizzle(bitwise_cast<uint32_t>(self), mask, wasm {})); } template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> - inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept + inline batch<uint16_t, A> swizzle(batch<uint16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<wasm>) noexcept { return wasm_i16x8_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7); } template <class A, uint16_t V0, uint16_t V1, uint16_t V2, uint16_t V3, uint16_t V4, uint16_t V5, uint16_t V6, uint16_t V7> - inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<batch<uint16_t, A>, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept + inline batch<int16_t, A> swizzle(batch<int16_t, A> const& self, batch_constant<uint16_t, A, V0, V1, V2, V3, V4, V5, V6, V7> mask, requires_arch<wasm>) noexcept { return bitwise_cast<int16_t>(swizzle(bitwise_cast<uint16_t>(self), mask, wasm {})); } template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> - inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept + inline batch<uint8_t, A> swizzle(batch<uint8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15>, requires_arch<wasm>) noexcept { return wasm_i8x16_shuffle(self, self, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15); } template <class A, uint8_t V0, uint8_t V1, uint8_t V2, uint8_t V3, uint8_t V4, uint8_t V5, uint8_t V6, uint8_t V7, uint8_t V8, uint8_t V9, uint8_t V10, uint8_t V11, uint8_t V12, uint8_t V13, uint8_t V14, uint8_t V15> - inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<batch<uint8_t, A>, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept + inline batch<int8_t, A> swizzle(batch<int8_t, A> const& self, batch_constant<uint8_t, A, V0, V1, V2, V3, V4, V5, V6, V7, V8, V9, V10, V11, V12, V13, V14, V15> mask, requires_arch<wasm>) noexcept { return bitwise_cast<int8_t>(swizzle(bitwise_cast<uint8_t>(self), mask, wasm {})); } diff --git a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp index 
6537157bc6..6d024a1677 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_all_registers.hpp @@ -46,3 +46,7 @@ #include "xsimd_rvv_register.hpp" #include "xsimd_wasm_register.hpp" + +#if XSIMD_WITH_EMULATED +#include "xsimd_emulated_register.hpp" +#endif diff --git a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp index 751e31d33a..138c9642d0 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_api.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_api.hpp @@ -2031,7 +2031,7 @@ namespace xsimd * @return the result of the selection. */ template <class T, class A, bool... Values> - inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept + inline batch<T, A> select(batch_bool_constant<T, A, Values...> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br) noexcept { detail::static_check_supported_config<T, A>(); return kernel::select<A>(cond, true_br, false_br, A {}); @@ -2047,7 +2047,7 @@ namespace xsimd * element of \c x and \c y. Each element of the mask index the vector that * would be formed by the concatenation of \c x and \c y. For instance * \code{.cpp} - * batch_constant<batch<uint32_t, sse2>, 0, 4, 3, 7> + * batch_constant<uint32_t, sse2, 0, 4, 3, 7> * \endcode * Picks \c x[0], \c y[0], \c x[3], \c y[3] * @@ -2055,7 +2055,7 @@ namespace xsimd */ template <class T, class A, class Vt, Vt... Values> inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type - shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<Vt, A>, Values...> mask) noexcept + shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<Vt, A, Values...> mask) noexcept { static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); detail::static_check_supported_config<T, A>(); @@ -2210,19 +2210,22 @@ namespace xsimd template <class To, class A = default_arch, class From> inline void store_as(To* dst, batch<From, A> const& src, aligned_mode) noexcept { - kernel::store_aligned(dst, src, A {}); + detail::static_check_supported_config<From, A>(); + kernel::store_aligned<A>(dst, src, A {}); } template <class A = default_arch, class From> inline void store_as(bool* dst, batch_bool<From, A> const& src, aligned_mode) noexcept { - kernel::store(src, dst, A {}); + detail::static_check_supported_config<From, A>(); + kernel::store<A>(src, dst, A {}); } template <class To, class A = default_arch, class From> inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, aligned_mode) noexcept { - kernel::store_complex_aligned(dst, src, A {}); + detail::static_check_supported_config<std::complex<From>, A>(); + kernel::store_complex_aligned<A>(dst, src, A {}); } #ifdef XSIMD_ENABLE_XTL_COMPLEX @@ -2244,25 +2247,29 @@ namespace xsimd template <class To, class A = default_arch, class From> inline void store_as(To* dst, batch<From, A> const& src, unaligned_mode) noexcept { - kernel::store_unaligned(dst, src, A {}); + detail::static_check_supported_config<From, A>(); + kernel::store_unaligned<A>(dst, src, A {}); } template <class A = default_arch, class From> inline void store_as(bool* dst, batch_bool<From, A> const& src, unaligned_mode) noexcept { - kernel::store(src, dst, A {}); + detail::static_check_supported_config<From, A>(); + kernel::store<A>(src, dst, A {}); } template <class To, class A = default_arch, class From> 
inline void store_as(std::complex<To>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept { - kernel::store_complex_unaligned(dst, src, A {}); + detail::static_check_supported_config<std::complex<From>, A>(); + kernel::store_complex_unaligned<A>(dst, src, A {}); } #ifdef XSIMD_ENABLE_XTL_COMPLEX template <class To, class A = default_arch, class From, bool i3ec> inline void store_as(xtl::xcomplex<To, To, i3ec>* dst, batch<std::complex<From>, A> const& src, unaligned_mode) noexcept { + detail::static_check_supported_config<std::complex<From>, A>(); store_as(reinterpret_cast<std::complex<To>*>(dst), src, unaligned_mode()); } #endif @@ -2350,14 +2357,14 @@ namespace xsimd */ template <class T, class A, class Vt, Vt... Values> inline typename std::enable_if<std::is_arithmetic<T>::value, batch<T, A>>::type - swizzle(batch<T, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept + swizzle(batch<T, A> const& x, batch_constant<Vt, A, Values...> mask) noexcept { static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); detail::static_check_supported_config<T, A>(); return kernel::swizzle<A>(x, mask, A {}); } template <class T, class A, class Vt, Vt... Values> - inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<batch<Vt, A>, Values...> mask) noexcept + inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& x, batch_constant<Vt, A, Values...> mask) noexcept { static_assert(sizeof(T) == sizeof(Vt), "consistent mask"); detail::static_check_supported_config<T, A>(); diff --git a/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp index 0de9c8ad42..cb28220441 100644 --- a/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp +++ b/third_party/xsimd/include/xsimd/types/xsimd_batch_constant.hpp @@ -25,17 +25,24 @@ namespace xsimd * @tparam batch_type the type of the associated batch values. * @tparam Values boolean constant represented by this batch **/ - template <class batch_type, bool... Values> + template <typename T, class A, bool... Values> struct batch_bool_constant { - - public: + using batch_type = batch_bool<T, A>; static constexpr std::size_t size = sizeof...(Values); - using arch_type = typename batch_type::arch_type; using value_type = bool; static_assert(sizeof...(Values) == batch_type::size, "consistent batch size"); - constexpr operator batch_bool<typename batch_type::value_type, arch_type>() const noexcept { return { Values... }; } + public: + /** + * @brief Generate a batch of @p batch_type from this @p batch_bool_constant + */ + constexpr batch_type as_batch_bool() const noexcept { return { Values... }; } + + /** + * @brief Generate a batch of @p batch_type from this @p batch_bool_constant + */ + constexpr operator batch_type() const noexcept { return as_batch_bool(); } constexpr bool get(size_t i) const noexcept { @@ -70,14 +77,14 @@ namespace xsimd }; template <class F, class SelfPack, class OtherPack, size_t... Indices> - static constexpr batch_bool_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...> + static constexpr batch_bool_constant<T, A, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...> apply(detail::index_sequence<Indices...>) { return {}; } template <class F, bool... 
OtherValues> - static constexpr auto apply(batch_bool_constant<batch_type, Values...>, batch_bool_constant<batch_type, OtherValues...>) + static constexpr auto apply(batch_bool_constant<T, A, Values...>, batch_bool_constant<T, A, OtherValues...>) -> decltype(apply<F, std::tuple<std::integral_constant<bool, Values>...>, std::tuple<std::integral_constant<bool, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>())) { static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches"); @@ -85,12 +92,12 @@ namespace xsimd } public: -#define MAKE_BINARY_OP(OP, NAME) \ - template <bool... OtherValues> \ - constexpr auto operator OP(batch_bool_constant<batch_type, OtherValues...> other) const \ - -> decltype(apply<NAME>(*this, other)) \ - { \ - return apply<NAME>(*this, other); \ +#define MAKE_BINARY_OP(OP, NAME) \ + template <bool... OtherValues> \ + constexpr auto operator OP(batch_bool_constant<T, A, OtherValues...> other) const \ + -> decltype(apply<NAME>(*this, other)) \ + { \ + return apply<NAME>(*this, other); \ } MAKE_BINARY_OP(|, logical_or) @@ -101,12 +108,12 @@ namespace xsimd #undef MAKE_BINARY_OP - constexpr batch_bool_constant<batch_type, !Values...> operator!() const + constexpr batch_bool_constant<T, A, !Values...> operator!() const { return {}; } - constexpr batch_bool_constant<batch_type, !Values...> operator~() const + constexpr batch_bool_constant<T, A, !Values...> operator~() const { return {}; } @@ -120,88 +127,93 @@ namespace xsimd * @tparam batch_type the type of the associated batch values. * @tparam Values constants represented by this batch **/ - template <class batch_type, typename batch_type::value_type... Values> + template <typename T, class A, T... Values> struct batch_constant { static constexpr std::size_t size = sizeof...(Values); - using arch_type = typename batch_type::arch_type; + using batch_type = batch<T, A>; using value_type = typename batch_type::value_type; static_assert(sizeof...(Values) == batch_type::size, "consistent batch size"); /** * @brief Generate a batch of @p batch_type from this @p batch_constant */ - inline operator batch_type() const noexcept { return { Values... }; } + inline batch_type as_batch() const noexcept { return { Values... }; } + + /** + * @brief Generate a batch of @p batch_type from this @p batch_constant + */ + inline operator batch_type() const noexcept { return as_batch(); } /** * @brief Get the @p i th element of this @p batch_constant */ - constexpr value_type get(size_t i) const noexcept + constexpr T get(size_t i) const noexcept { - return get(i, std::array<value_type, size> { Values... }); + return get(i, std::array<T, size> { Values... 
}); } private: - constexpr value_type get(size_t i, std::array<value_type, size> const& values) const noexcept + constexpr T get(size_t i, std::array<T, size> const& values) const noexcept { return values[i]; } struct arithmetic_add { - constexpr value_type operator()(value_type x, value_type y) const { return x + y; } + constexpr T operator()(T x, T y) const { return x + y; } }; struct arithmetic_sub { - constexpr value_type operator()(value_type x, value_type y) const { return x - y; } + constexpr T operator()(T x, T y) const { return x - y; } }; struct arithmetic_mul { - constexpr value_type operator()(value_type x, value_type y) const { return x * y; } + constexpr T operator()(T x, T y) const { return x * y; } }; struct arithmetic_div { - constexpr value_type operator()(value_type x, value_type y) const { return x / y; } + constexpr T operator()(T x, T y) const { return x / y; } }; struct arithmetic_mod { - constexpr value_type operator()(value_type x, value_type y) const { return x % y; } + constexpr T operator()(T x, T y) const { return x % y; } }; struct binary_and { - constexpr value_type operator()(value_type x, value_type y) const { return x & y; } + constexpr T operator()(T x, T y) const { return x & y; } }; struct binary_or { - constexpr value_type operator()(value_type x, value_type y) const { return x | y; } + constexpr T operator()(T x, T y) const { return x | y; } }; struct binary_xor { - constexpr value_type operator()(value_type x, value_type y) const { return x ^ y; } + constexpr T operator()(T x, T y) const { return x ^ y; } }; template <class F, class SelfPack, class OtherPack, size_t... Indices> - static constexpr batch_constant<batch_type, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...> + static constexpr batch_constant<T, A, F()(std::tuple_element<Indices, SelfPack>::type::value, std::tuple_element<Indices, OtherPack>::type::value)...> apply(detail::index_sequence<Indices...>) { return {}; } - template <class F, value_type... OtherValues> - static constexpr auto apply(batch_constant<batch_type, Values...>, batch_constant<batch_type, OtherValues...>) - -> decltype(apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>())) + template <class F, T... OtherValues> + static constexpr auto apply(batch_constant<T, A, Values...>, batch_constant<T, A, OtherValues...>) + -> decltype(apply<F, std::tuple<std::integral_constant<T, Values>...>, std::tuple<std::integral_constant<T, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>())) { static_assert(sizeof...(Values) == sizeof...(OtherValues), "compatible constant batches"); - return apply<F, std::tuple<std::integral_constant<value_type, Values>...>, std::tuple<std::integral_constant<value_type, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()); + return apply<F, std::tuple<std::integral_constant<T, Values>...>, std::tuple<std::integral_constant<T, OtherValues>...>>(detail::make_index_sequence<sizeof...(Values)>()); } public: -#define MAKE_BINARY_OP(OP, NAME) \ - template <value_type... OtherValues> \ - constexpr auto operator OP(batch_constant<batch_type, OtherValues...> other) const \ - -> decltype(apply<NAME>(*this, other)) \ - { \ - return apply<NAME>(*this, other); \ +#define MAKE_BINARY_OP(OP, NAME) \ + template <T... 
OtherValues> \ + constexpr auto operator OP(batch_constant<T, A, OtherValues...> other) const \ + -> decltype(apply<NAME>(*this, other)) \ + { \ + return apply<NAME>(*this, other); \ } MAKE_BINARY_OP(+, arithmetic_add) @@ -215,17 +227,17 @@ namespace xsimd #undef MAKE_BINARY_OP - constexpr batch_constant<batch_type, (value_type)-Values...> operator-() const + constexpr batch_constant<T, A, (T)-Values...> operator-() const { return {}; } - constexpr batch_constant<batch_type, (value_type) + Values...> operator+() const + constexpr batch_constant<T, A, (T) + Values...> operator+() const { return {}; } - constexpr batch_constant<batch_type, (value_type)~Values...> operator~() const + constexpr batch_constant<T, A, (T)~Values...> operator~() const { return {}; } @@ -233,15 +245,15 @@ namespace xsimd namespace detail { - template <class batch_type, class G, std::size_t... Is> + template <typename T, class A, class G, std::size_t... Is> inline constexpr auto make_batch_constant(detail::index_sequence<Is...>) noexcept - -> batch_constant<batch_type, (typename batch_type::value_type)G::get(Is, sizeof...(Is))...> + -> batch_constant<T, A, (T)G::get(Is, sizeof...(Is))...> { return {}; } - template <class batch_type, class G, std::size_t... Is> + template <typename T, class A, class G, std::size_t... Is> inline constexpr auto make_batch_bool_constant(detail::index_sequence<Is...>) noexcept - -> batch_bool_constant<batch_type, G::get(Is, sizeof...(Is))...> + -> batch_bool_constant<T, A, G::get(Is, sizeof...(Is))...> { return {}; } @@ -268,19 +280,19 @@ namespace xsimd * }; * @endcode */ - template <class batch_type, class G> - inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>())) + template <typename T, class A, class G> + inline constexpr auto make_batch_constant() noexcept -> decltype(detail::make_batch_constant<T, A, G>(detail::make_index_sequence<batch<T, A>::size>())) { - return detail::make_batch_constant<batch_type, G>(detail::make_index_sequence<batch_type::size>()); + return detail::make_batch_constant<T, A, G>(detail::make_index_sequence<batch<T, A>::size>()); } - template <class batch_type, class G> + template <typename T, class A, class G> inline constexpr auto make_batch_bool_constant() noexcept - -> decltype(detail::make_batch_bool_constant<batch_type, G>( - detail::make_index_sequence<batch_type::size>())) + -> decltype(detail::make_batch_bool_constant<T, A, G>( + detail::make_index_sequence<batch<T, A>::size>())) { - return detail::make_batch_bool_constant<batch_type, G>( - detail::make_index_sequence<batch_type::size>()); + return detail::make_batch_bool_constant<T, A, G>( + detail::make_index_sequence<batch<T, A>::size>()); } } // namespace xsimd diff --git a/third_party/xsimd/include/xsimd/types/xsimd_emulated_register.hpp b/third_party/xsimd/include/xsimd/types/xsimd_emulated_register.hpp new file mode 100644 index 0000000000..b05d718143 --- /dev/null +++ b/third_party/xsimd/include/xsimd/types/xsimd_emulated_register.hpp @@ -0,0 +1,80 @@ +/*************************************************************************** + * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and * + * Martin Renou * + * Copyright (c) QuantStack * + * Copyright (c) Serge Guelton * + * * + * Distributed under the terms of the BSD 3-Clause License. * + * * + * The full license is in the file LICENSE, distributed with this software. 
* + ****************************************************************************/ + +#ifndef XSIMD_EMULATED_REGISTER_HPP +#define XSIMD_EMULATED_REGISTER_HPP + +#include "./xsimd_generic_arch.hpp" +#include "./xsimd_register.hpp" + +namespace xsimd +{ + /** + * @ingroup architectures + * + * emulated instructions + */ + template <size_t N> + struct emulated : generic + { + static constexpr bool supported() noexcept { return true; } + static constexpr bool available() noexcept { return true; } + static constexpr bool requires_alignment() noexcept { return false; } + static constexpr std::size_t alignment() noexcept { return 8; } + static constexpr char const* name() noexcept { return "emulated"; } + }; + + namespace types + { + template <size_t N> + struct simd_emulated_bool_register + { + using register_type = std::array<bool, N>; + register_type data; + simd_emulated_bool_register() = default; + simd_emulated_bool_register(register_type r) { data = r; } + operator register_type() const noexcept { return data; } + }; + template <typename T, size_t N> + struct get_bool_simd_register<T, emulated<N>> + { + using type = simd_emulated_bool_register<N / (8 * sizeof(T))>; + }; + + template <typename T, size_t N> + struct simd_register<T, emulated<N>> + { + static_assert(N % (8 * sizeof(T)) == 0, "bit width must be a multiple of scalar width"); + using register_type = std::array<T, N / (8 * sizeof(T))>; + register_type data; + inline operator register_type() const noexcept + { + return data; + } + }; + template <typename T, size_t N> + struct has_simd_register<T, emulated<N>> : std::is_scalar<T> + { + }; + template <typename T, size_t N> + struct has_simd_register<std::complex<T>, emulated<N>> : std::true_type + { + }; +#ifdef XSIMD_ENABLE_XTL_COMPLEX + template <typename T, bool i3ec, size_t N> + struct has_simd_register<xtl::complex<T, T, i3ec>, emulated<N>> : std::true_type + { + }; +#endif + } +} + +#endif diff --git a/third_party/xsimd/moz.yaml b/third_party/xsimd/moz.yaml index 6385b68fa5..b88c749f67 100644 --- a/third_party/xsimd/moz.yaml +++ b/third_party/xsimd/moz.yaml @@ -10,8 +10,8 @@ origin: url: https://github.com/QuantStack/xsimd - release: 7080469620c2145fbedf4ef8950406066e1ca2d6 (2024-03-17T21:35:00Z). - revision: 7080469620c2145fbedf4ef8950406066e1ca2d6 + release: be9dcb5df413a893fb6646fa950eeb4aeac70ffc (2024-04-20T09:35:04Z). + revision: be9dcb5df413a893fb6646fa950eeb4aeac70ffc license: BSD-3-Clause |
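
The recurring change throughout this diff is a migration of the constant-mask types: batch_constant and batch_bool_constant now take the element type and architecture as separate template parameters (batch_constant<T, A, Values...>) instead of the batch type (batch_constant<batch<T, A>, Values...>), and the new as_batch() / as_batch_bool() members replace explicit casts to batch. A minimal sketch of what caller code looks like after the migration, assuming an SSE2 target; the 4-lane mask values and helper names are placeholders, not taken from this patch:

    #include "xsimd/xsimd.hpp"
    #include <cstdint>

    using arch = xsimd::sse2; // assumption: 4 x 32-bit lanes

    // Before this update the mask would have been spelled
    //   xsimd::batch_constant<xsimd::batch<uint32_t, arch>, 3, 2, 1, 0>.
    using reverse_mask = xsimd::batch_constant<uint32_t, arch, 3, 2, 1, 0>;

    inline xsimd::batch<float, arch> reverse(xsimd::batch<float, arch> x)
    {
        return xsimd::swizzle(x, reverse_mask {}); // constant-index permute
    }

    inline xsimd::batch<uint32_t, arch> mask_as_vector()
    {
        return reverse_mask {}.as_batch();         // was: (batch<uint32_t, arch>)mask
    }

The same spelling applies to the generator helpers, which now read make_batch_constant<uint32_t, arch, Generator>() and make_batch_bool_constant<uint32_t, arch, Generator>() rather than taking the batch type as their first parameter.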
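
This update also introduces an emulated<N> architecture (xsimd_emulated_register.hpp plus the emulated kernels above) that backs an N-bit register with a std::array and evaluates each operation lane by lane in scalar code. A small usage sketch, assuming the build defines XSIMD_WITH_EMULATED so xsimd_isa.hpp and xsimd_all_registers.hpp pull the emulated headers in, and that the arithmetic operators are covered by the emulated kernels defined earlier in xsimd_emulated.hpp:

    #include "xsimd/xsimd.hpp"
    #include <array>
    #include <iostream>

    int main()
    {
        using arch = xsimd::emulated<128>;     // 128-bit register backed by std::array
        using vec = xsimd::batch<float, arch>; // 128 / (8 * sizeof(float)) = 4 lanes

        vec a { 1.f, 2.f, 3.f, 4.f };
        vec b { 4.f, 3.f, 2.f, 1.f };
        vec c = a + b;                         // each lane computed in plain scalar code

        std::array<float, vec::size> out;
        c.store_unaligned(out.data());
        for (float v : out)
            std::cout << v << ' ';             // prints: 5 5 5 5
        std::cout << '\n';
    }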