Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp')
-rw-r--r--    third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp    1739
1 file changed, 1739 insertions(+), 0 deletions(-)
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp
new file mode 100644
index 0000000000..5ec1e02d48
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_avx.hpp
@@ -0,0 +1,1739 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_AVX_HPP
+#define XSIMD_AVX_HPP
+
+#include <complex>
+#include <limits>
+#include <type_traits>
+
+#include "../types/xsimd_avx_register.hpp"
+
+namespace xsimd
+{
+
+ namespace kernel
+ {
+ using namespace types;
+
+ // fwd
+ template <class A, class T, size_t I>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept;
+
+ namespace detail
+ {
+ inline void split_avx(__m256i val, __m128i& low, __m128i& high) noexcept
+ {
+ low = _mm256_castsi256_si128(val);
+ high = _mm256_extractf128_si256(val, 1);
+ }
+ inline void split_avx(__m256 val, __m128& low, __m128& high) noexcept
+ {
+ low = _mm256_castps256_ps128(val);
+ high = _mm256_extractf128_ps(val, 1);
+ }
+ inline void split_avx(__m256d val, __m128d& low, __m128d& high) noexcept
+ {
+ low = _mm256_castpd256_pd128(val);
+ high = _mm256_extractf128_pd(val, 1);
+ }
+ inline __m256i merge_sse(__m128i low, __m128i high) noexcept
+ {
+ return _mm256_insertf128_si256(_mm256_castsi128_si256(low), high, 1);
+ }
+ inline __m256 merge_sse(__m128 low, __m128 high) noexcept
+ {
+ return _mm256_insertf128_ps(_mm256_castps128_ps256(low), high, 1);
+ }
+ inline __m256d merge_sse(__m128d low, __m128d high) noexcept
+ {
+ return _mm256_insertf128_pd(_mm256_castpd128_pd256(low), high, 1);
+ }
+ template <class F>
+ inline __m256i fwd_to_sse(F f, __m256i self) noexcept
+ {
+ __m128i self_low, self_high;
+ split_avx(self, self_low, self_high);
+ __m128i res_low = f(self_low);
+ __m128i res_high = f(self_high);
+ return merge_sse(res_low, res_high);
+ }
+ template <class F>
+ inline __m256i fwd_to_sse(F f, __m256i self, __m256i other) noexcept
+ {
+ __m128i self_low, self_high, other_low, other_high;
+ split_avx(self, self_low, self_high);
+ split_avx(other, other_low, other_high);
+ __m128i res_low = f(self_low, other_low);
+ __m128i res_high = f(self_high, other_high);
+ return merge_sse(res_low, res_high);
+ }
+ template <class F>
+ inline __m256i fwd_to_sse(F f, __m256i self, int32_t other) noexcept
+ {
+ __m128i self_low, self_high;
+ split_avx(self, self_low, self_high);
+ __m128i res_low = f(self_low, other);
+ __m128i res_high = f(self_high, other);
+ return merge_sse(res_low, res_high);
+ }
+ }
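+
+ // Illustrative note (not part of the upstream sources): AVX1 lacks most
+ // 256-bit integer instructions, so the helpers above split a 256-bit
+ // integer register into two 128-bit halves, run an SSE kernel on each
+ // half, and merge the results back. A hypothetical element-wise
+ // increment could be forwarded like this:
+ //
+ //     __m256i plus_one = detail::fwd_to_sse(
+ //         [](__m128i half) noexcept
+ //         { return _mm_add_epi32(half, _mm_set1_epi32(1)); },
+ //         input); // `input` is an assumed __m256i of int32 lanes
+ //
+ // Most integer kernels below reuse this pattern.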
+
+ // abs
+ template <class A>
+ inline batch<float, A> abs(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ __m256 sign_mask = _mm256_set1_ps(-0.f); // -0.f = 1 << 31
+ return _mm256_andnot_ps(sign_mask, self);
+ }
+ template <class A>
+ inline batch<double, A> abs(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ __m256d sign_mask = _mm256_set1_pd(-0.); // -0. = 1 << 63
+ return _mm256_andnot_pd(sign_mask, self);
+ }
+
+ // add
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> add(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return add(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A>
+ inline batch<float, A> add(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_add_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> add(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_add_pd(self, other);
+ }
+
+ // all
+ template <class A>
+ inline bool all(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_testc_ps(self, batch_bool<float, A>(true)) != 0;
+ }
+ template <class A>
+ inline bool all(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_testc_pd(self, batch_bool<double, A>(true)) != 0;
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline bool all(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_testc_si256(self, batch_bool<T, A>(true)) != 0;
+ }
+
+ // any
+ template <class A>
+ inline bool any(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return !_mm256_testz_ps(self, self);
+ }
+ template <class A>
+ inline bool any(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return !_mm256_testz_pd(self, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline bool any(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return !_mm256_testz_si256(self, self);
+ }
+
+ // batch_bool_cast
+ template <class A, class T_out, class T_in>
+ inline batch_bool<T_out, A> batch_bool_cast(batch_bool<T_in, A> const& self, batch_bool<T_out, A> const&, requires_arch<avx>) noexcept
+ {
+ return { bitwise_cast<T_out>(batch<T_in, A>(self.data)).data };
+ }
+
+ // bitwise_and
+ template <class A>
+ inline batch<float, A> bitwise_and(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_and(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_pd(self, other);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> bitwise_and(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_and(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_and_pd(self, other);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_and(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_and(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_and(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // bitwise_andnot
+ template <class A>
+ inline batch<float, A> bitwise_andnot(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_ps(other, self);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_andnot(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_pd(other, self);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> bitwise_andnot(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_ps(other, self);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_andnot(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_andnot_pd(other, self);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_andnot(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_andnot(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_andnot(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // bitwise_lshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_lshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
+ { return bitwise_lshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
+ self, other);
+ }
+
+ // bitwise_not
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_not(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s) noexcept
+ { return bitwise_not(batch<T, sse4_2>(s), sse4_2 {}); },
+ self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_not(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s) noexcept
+ { return bitwise_not(batch_bool<T, sse4_2>(s), sse4_2 {}); },
+ self);
+ }
+
+ // bitwise_or
+ template <class A>
+ inline batch<float, A> bitwise_or(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_or(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_pd(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_or(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_or(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_or_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_or(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_or(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_or(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_or(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // bitwise_rshift
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_rshift(batch<T, A> const& self, int32_t other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, int32_t o) noexcept
+ { return bitwise_rshift(batch<T, sse4_2>(s), o, sse4_2 {}); },
+ self, other);
+ }
+
+ // bitwise_xor
+ template <class A>
+ inline batch<float, A> bitwise_xor(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_xor(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_xor(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_xor(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_xor(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_xor(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+ self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> bitwise_xor(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return bitwise_xor(batch_bool<T, sse4_2>(s), batch_bool<T, sse4_2>(o), sse4_2 {}); },
+ self, other);
+ }
+
+ // bitwise_cast
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<float, A> bitwise_cast(batch<T, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castsi256_ps(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<double, A> bitwise_cast(batch<T, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castsi256_pd(self);
+ }
+ template <class A, class T, class Tp, class = typename std::enable_if<std::is_integral<typename std::common_type<T, Tp>::type>::value, void>::type>
+ inline batch<Tp, A> bitwise_cast(batch<T, A> const& self, batch<Tp, A> const&, requires_arch<avx>) noexcept
+ {
+ return batch<Tp, A>(self.data);
+ }
+ template <class A>
+ inline batch<double, A> bitwise_cast(batch<float, A> const& self, batch<double, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_pd(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<float, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_si256(self);
+ }
+ template <class A>
+ inline batch<float, A> bitwise_cast(batch<double, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castpd_ps(self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> bitwise_cast(batch<double, A> const& self, batch<T, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_castpd_si256(self);
+ }
+
+ // bitwise_not
+ template <class A>
+ inline batch<float, A> bitwise_not(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch<double, A> bitwise_not(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch_bool<float, A> bitwise_not(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(-1)));
+ }
+ template <class A>
+ inline batch_bool<double, A> bitwise_not(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi32(-1)));
+ }
+
+ // broadcast
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> broadcast(T val, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_set1_epi8(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_set1_epi16(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_set1_epi32(val);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_set1_epi64x(val);
+ }
+ else
+ {
+ assert(false && "unsupported");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<float, A> broadcast(float val, requires_arch<avx>) noexcept
+ {
+ return _mm256_set1_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> broadcast(double val, requires_arch<avx>) noexcept
+ {
+ return _mm256_set1_pd(val);
+ }
+
+ // ceil
+ template <class A>
+ inline batch<float, A> ceil(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_ceil_ps(self);
+ }
+ template <class A>
+ inline batch<double, A> ceil(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_ceil_pd(self);
+ }
+
+ namespace detail
+ {
+ // On clang, _mm256_extractf128_ps is built upon __builtin_shufflevector,
+ // which requires the index parameter to be a constant
+ template <int index, class B>
+ inline B get_half_complex_f(const B& real, const B& imag) noexcept
+ {
+ __m128 tmp0 = _mm256_extractf128_ps(real, index);
+ __m128 tmp1 = _mm256_extractf128_ps(imag, index);
+ __m128 tmp2 = _mm_unpackhi_ps(tmp0, tmp1);
+ tmp0 = _mm_unpacklo_ps(tmp0, tmp1);
+ __m256 res = real;
+ res = _mm256_insertf128_ps(res, tmp0, 0);
+ res = _mm256_insertf128_ps(res, tmp2, 1);
+ return res;
+ }
+ template <int index, class B>
+ inline B get_half_complex_d(const B& real, const B& imag) noexcept
+ {
+ __m128d tmp0 = _mm256_extractf128_pd(real, index);
+ __m128d tmp1 = _mm256_extractf128_pd(imag, index);
+ __m128d tmp2 = _mm_unpackhi_pd(tmp0, tmp1);
+ tmp0 = _mm_unpacklo_pd(tmp0, tmp1);
+ __m256d res = real;
+ res = _mm256_insertf128_pd(res, tmp0, 0);
+ res = _mm256_insertf128_pd(res, tmp2, 1);
+ return res;
+ }
+
+ // complex_low
+ template <class A>
+ inline batch<float, A> complex_low(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_f<0>(self.real(), self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_low(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_d<0>(self.real(), self.imag());
+ }
+
+ // complex_high
+ template <class A>
+ inline batch<float, A> complex_high(batch<std::complex<float>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_f<1>(self.real(), self.imag());
+ }
+ template <class A>
+ inline batch<double, A> complex_high(batch<std::complex<double>, A> const& self, requires_arch<avx>) noexcept
+ {
+ return get_half_complex_d<1>(self.real(), self.imag());
+ }
+ }
+
+ // fast_cast
+ namespace detail
+ {
+ template <class A>
+ inline batch<float, A> fast_cast(batch<int32_t, A> const& self, batch<float, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtepi32_ps(self);
+ }
+
+ template <class A>
+ inline batch<int32_t, A> fast_cast(batch<float, A> const& self, batch<int32_t, A> const&, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvttps_epi32(self);
+ }
+ }
+
+ // decr_if
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> decr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return self + batch<T, A>(mask.data);
+ }
+
+ // div
+ template <class A>
+ inline batch<float, A> div(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_div_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> div(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_div_pd(self, other);
+ }
+
+ // eq
+ template <class A>
+ inline batch_bool<float, A> eq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_EQ_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> eq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_EQ_OQ);
+ }
+ template <class A>
+ inline batch_bool<float, A> eq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self != other);
+ }
+ template <class A>
+ inline batch_bool<double, A> eq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self != other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return eq(batch<T, sse4_2>(s), batch<T, sse4_2>(o), sse4_2 {}); },
+ self, other);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> eq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self != other);
+ }
+
+ // floor
+ template <class A>
+ inline batch<float, A> floor(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_floor_ps(self);
+ }
+ template <class A>
+ inline batch<double, A> floor(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_floor_pd(self);
+ }
+
+ // from_mask
+ template <class A>
+ inline batch_bool<float, A> from_mask(batch_bool<float, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut32[] = {
+ 0x0000000000000000ul,
+ 0x00000000FFFFFFFFul,
+ 0xFFFFFFFF00000000ul,
+ 0xFFFFFFFFFFFFFFFFul,
+ };
+ assert(!(mask & ~0xFFul) && "inbound mask");
+ return _mm256_castsi256_ps(_mm256_setr_epi64x(lut32[mask & 0x3], lut32[(mask >> 2) & 0x3], lut32[(mask >> 4) & 0x3], lut32[mask >> 6]));
+ }
+ template <class A>
+ inline batch_bool<double, A> from_mask(batch_bool<double, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ {
+ alignas(A::alignment()) static const uint64_t lut64[][4] = {
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul },
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul },
+ { 0x0000000000000000ul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ { 0x0000000000000000ul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ { 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul, 0xFFFFFFFFFFFFFFFFul },
+ };
+ assert(!(mask & ~0xFul) && "inbound mask");
+ return _mm256_castsi256_pd(_mm256_load_si256((const __m256i*)lut64[mask]));
+ }
+ template <class T, class A, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> from_mask(batch_bool<T, A> const&, uint64_t mask, requires_arch<avx>) noexcept
+ {
+ alignas(A::alignment()) static const uint32_t lut32[] = {
+ 0x00000000,
+ 0x000000FF,
+ 0x0000FF00,
+ 0x0000FFFF,
+ 0x00FF0000,
+ 0x00FF00FF,
+ 0x00FFFF00,
+ 0x00FFFFFF,
+ 0xFF000000,
+ 0xFF0000FF,
+ 0xFF00FF00,
+ 0xFF00FFFF,
+ 0xFFFF0000,
+ 0xFFFF00FF,
+ 0xFFFFFF00,
+ 0xFFFFFFFF,
+ };
+ alignas(A::alignment()) static const uint64_t lut64[] = {
+ 0x0000000000000000ul,
+ 0x000000000000FFFFul,
+ 0x00000000FFFF0000ul,
+ 0x00000000FFFFFFFFul,
+ 0x0000FFFF00000000ul,
+ 0x0000FFFF0000FFFFul,
+ 0x0000FFFFFFFF0000ul,
+ 0x0000FFFFFFFFFFFFul,
+ 0xFFFF000000000000ul,
+ 0xFFFF00000000FFFFul,
+ 0xFFFF0000FFFF0000ul,
+ 0xFFFF0000FFFFFFFFul,
+ 0xFFFFFFFF00000000ul,
+ 0xFFFFFFFF0000FFFFul,
+ 0xFFFFFFFFFFFF0000ul,
+ 0xFFFFFFFFFFFFFFFFul,
+ };
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ assert(!(mask & ~0xFFFFFFFFul) && "inbound mask");
+ return _mm256_setr_epi32(lut32[mask & 0xF], lut32[(mask >> 4) & 0xF],
+ lut32[(mask >> 8) & 0xF], lut32[(mask >> 12) & 0xF],
+ lut32[(mask >> 16) & 0xF], lut32[(mask >> 20) & 0xF],
+ lut32[(mask >> 24) & 0xF], lut32[mask >> 28]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ assert(!(mask & ~0xFFFFul) && "inbound mask");
+ return _mm256_setr_epi64x(lut64[mask & 0xF], lut64[(mask >> 4) & 0xF], lut64[(mask >> 8) & 0xF], lut64[(mask >> 12) & 0xF]);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_castps_si256(from_mask(batch_bool<float, A> {}, mask, avx {}));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_castpd_si256(from_mask(batch_bool<double, A> {}, mask, avx {}));
+ }
+ }
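+
+ // Illustrative note (not part of the upstream sources): the lookup tables
+ // above expand the scalar bit mask four lanes at a time. For 8-bit lanes,
+ // a nibble of 0b0101 maps to lut32[0x5] == 0x00FF00FF, i.e. lanes 0 and 2
+ // of that four-lane group become all-ones. A scalar sketch of the same
+ // expansion (hypothetical helper, for exposition only):
+ //
+ //     uint32_t expand_nibble_u8(uint64_t mask, unsigned group)
+ //     {
+ //         uint32_t out = 0;
+ //         for (unsigned i = 0; i < 4; ++i)
+ //             if ((mask >> (4 * group + i)) & 1)
+ //                 out |= 0xFFu << (8 * i);
+ //         return out;
+ //     }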
+
+ // haddp
+ template <class A>
+ inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<avx>) noexcept
+ {
+ // row = (a,b,c,d,e,f,g,h)
+ // tmp0 = (a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7)
+ __m256 tmp0 = _mm256_hadd_ps(row[0], row[1]);
+ // tmp1 = (c0+c1, c2+c3, d0+d1, d2+d3, c4+c5, c6+c7, d4+d5, d6+d7)
+ __m256 tmp1 = _mm256_hadd_ps(row[2], row[3]);
+ // tmp1 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
+ // a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7)
+ tmp1 = _mm256_hadd_ps(tmp0, tmp1);
+ // tmp0 = (e0+e1, e2+e3, f0+f1, f2+f3, e4+e5, e6+e7, f4+f5, f6+f7)
+ tmp0 = _mm256_hadd_ps(row[4], row[5]);
+ // tmp2 = (g0+g1, g2+g3, h0+h1, h2+h3, g4+g5, g6+g7, h4+h5, h6+h7)
+ __m256 tmp2 = _mm256_hadd_ps(row[6], row[7]);
+ // tmp2 = (e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3,
+ // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
+ tmp2 = _mm256_hadd_ps(tmp0, tmp2);
+ // tmp0 = (a0+a1+a2+a3, b0+b1+b2+b3, c0+c1+c2+c3, d0+d1+d2+d3,
+ // e4+e5+e6+e7, f4+f5+f6+f7, g4+g5+g6+g7, h4+h5+h6+h7)
+ tmp0 = _mm256_blend_ps(tmp1, tmp2, 0b11110000);
+ // tmp1 = (a4+a5+a6+a7, b4+b5+b6+b7, c4+c5+c6+c7, d4+d5+d6+d7,
+ // e0+e1+e2+e3, f0+f1+f2+f3, g0+g1+g2+g3, h0+h1+h2+h3)
+ tmp1 = _mm256_permute2f128_ps(tmp1, tmp2, 0x21);
+ return _mm256_add_ps(tmp0, tmp1);
+ }
+ template <class A>
+ inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<avx>) noexcept
+ {
+ // row = (a,b,c,d)
+ // tmp0 = (a0+a1, b0+b1, a2+a3, b2+b3)
+ __m256d tmp0 = _mm256_hadd_pd(row[0], row[1]);
+ // tmp1 = (c0+c1, d0+d1, c2+c3, d2+d3)
+ __m256d tmp1 = _mm256_hadd_pd(row[2], row[3]);
+ // tmp2 = (a0+a1, b0+b1, c2+c3, d2+d3)
+ __m256d tmp2 = _mm256_blend_pd(tmp0, tmp1, 0b1100);
+ // tmp1 = (a2+a3, b2+b3, c2+c3, d2+d3)
+ tmp1 = _mm256_permute2f128_pd(tmp0, tmp1, 0x21);
+ return _mm256_add_pd(tmp1, tmp2);
+ }
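+
+ // Illustrative note (not part of the upstream sources): haddp reduces each
+ // input row to its horizontal sum and packs the results into one batch,
+ // so haddp(rows)[i] == reduce_add(rows[i]). A hypothetical caller, going
+ // through the public xsimd::haddp entry point:
+ //
+ //     batch<double, avx> rows[4] = { r0, r1, r2, r3 }; // r0..r3 assumed defined
+ //     batch<double, avx> sums = xsimd::haddp(rows);
+ //     // sums == { sum(r0), sum(r1), sum(r2), sum(r3) }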
+
+ // incr_if
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> incr_if(batch<T, A> const& self, batch_bool<T, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return self - batch<T, A>(mask.data);
+ }
+
+ // insert
+ template <class A, class T, size_t I, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I> pos, requires_arch<avx>) noexcept
+ {
+#if !defined(_MSC_VER) || _MSC_VER > 1900
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ return _mm256_insert_epi8(self, val, I);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 2)
+ {
+ return _mm256_insert_epi16(self, val, I);
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_insert_epi32(self, val, I);
+ }
+ else
+ {
+ return insert(self, val, pos, generic {});
+ }
+#endif
+ return insert(self, val, pos, generic {});
+ }
+
+ // isnan
+ template <class A>
+ inline batch_bool<float, A> isnan(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, self, _CMP_UNORD_Q);
+ }
+ template <class A>
+ inline batch_bool<double, A> isnan(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, self, _CMP_UNORD_Q);
+ }
+
+ // le
+ template <class A>
+ inline batch_bool<float, A> le(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_LE_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> le(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_LE_OQ);
+ }
+
+ // load_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_aligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_si256((__m256i const*)mem);
+ }
+ template <class A>
+ inline batch<float, A> load_aligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_ps(mem);
+ }
+ template <class A>
+ inline batch<double, A> load_aligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ {
+ return _mm256_load_pd(mem);
+ }
+
+ namespace detail
+ {
+ // load_complex
+ template <class A>
+ inline batch<std::complex<float>, A> load_complex(batch<float, A> const& hi, batch<float, A> const& lo, requires_arch<avx>) noexcept
+ {
+ using batch_type = batch<float, A>;
+ __m128 tmp0 = _mm256_extractf128_ps(hi, 0);
+ __m128 tmp1 = _mm256_extractf128_ps(hi, 1);
+ __m128 tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
+ __m128 tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
+ batch_type real = _mm256_castps128_ps256(tmp_real);
+ batch_type imag = _mm256_castps128_ps256(tmp_imag);
+
+ tmp0 = _mm256_extractf128_ps(lo, 0);
+ tmp1 = _mm256_extractf128_ps(lo, 1);
+ tmp_real = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(2, 0, 2, 0));
+ tmp_imag = _mm_shuffle_ps(tmp0, tmp1, _MM_SHUFFLE(3, 1, 3, 1));
+ real = _mm256_insertf128_ps(real, tmp_real, 1);
+ imag = _mm256_insertf128_ps(imag, tmp_imag, 1);
+ return { real, imag };
+ }
+ template <class A>
+ inline batch<std::complex<double>, A> load_complex(batch<double, A> const& hi, batch<double, A> const& lo, requires_arch<avx>) noexcept
+ {
+ using batch_type = batch<double, A>;
+ __m128d tmp0 = _mm256_extractf128_pd(hi, 0);
+ __m128d tmp1 = _mm256_extractf128_pd(hi, 1);
+ batch_type real = _mm256_castpd128_pd256(_mm_unpacklo_pd(tmp0, tmp1));
+ batch_type imag = _mm256_castpd128_pd256(_mm_unpackhi_pd(tmp0, tmp1));
+
+ tmp0 = _mm256_extractf128_pd(lo, 0);
+ tmp1 = _mm256_extractf128_pd(lo, 1);
+ __m256d re_tmp1 = _mm256_insertf128_pd(real, _mm_unpacklo_pd(tmp0, tmp1), 1);
+ __m256d im_tmp1 = _mm256_insertf128_pd(imag, _mm_unpackhi_pd(tmp0, tmp1), 1);
+ real = _mm256_blend_pd(real, re_tmp1, 12);
+ imag = _mm256_blend_pd(imag, im_tmp1, 12);
+ return { real, imag };
+ }
+ }
+
+ // load_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_si256((__m256i const*)mem);
+ }
+ template <class A>
+ inline batch<float, A> load_unaligned(float const* mem, convert<float>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_ps(mem);
+ }
+ template <class A>
+ inline batch<double, A> load_unaligned(double const* mem, convert<double>, requires_arch<avx>) noexcept
+ {
+ return _mm256_loadu_pd(mem);
+ }
+
+ // lt
+ template <class A>
+ inline batch_bool<float, A> lt(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_LT_OQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> lt(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_LT_OQ);
+ }
+
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> lt(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return lt(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+
+ // mask
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline uint64_t mask(batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
+ __m128i self_low, self_high;
+ detail::split_avx(self, self_low, self_high);
+ return mask(batch_bool<T, sse4_2>(self_low), sse4_2 {}) | (mask(batch_bool<T, sse4_2>(self_high), sse4_2 {}) << (128 / (8 * sizeof(T))));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ return _mm256_movemask_ps(_mm256_castsi256_ps(self));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ return _mm256_movemask_pd(_mm256_castsi256_pd(self));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline uint64_t mask(batch_bool<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_movemask_ps(self);
+ }
+
+ template <class A>
+ inline uint64_t mask(batch_bool<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_movemask_pd(self);
+ }
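+
+ // Illustrative note (not part of the upstream sources): mask() packs one
+ // bit per lane, bit i being set when lane i is true. E.g. for an 8-lane
+ // float batch_bool holding { true, false, false, true, false, false,
+ // false, false }, _mm256_movemask_ps returns 0b00001001.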
+
+ // max
+ template <class A>
+ inline batch<float, A> max(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_max_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> max(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_max_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> max(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return select(self > other, self, other);
+ }
+
+ // min
+ template <class A>
+ inline batch<float, A> min(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_min_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> min(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_min_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> min(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return select(self <= other, self, other);
+ }
+
+ // mul
+ template <class A>
+ inline batch<float, A> mul(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_mul_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> mul(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_mul_pd(self, other);
+ }
+
+ // nearbyint
+ template <class A>
+ inline batch<float, A> nearbyint(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_ps(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+ template <class A>
+ inline batch<double, A> nearbyint(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_pd(self, _MM_FROUND_TO_NEAREST_INT);
+ }
+
+ // nearbyint_as_int
+ template <class A>
+ inline batch<int32_t, A> nearbyint_as_int(batch<float, A> const& self,
+ requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtps_epi32(self);
+ }
+
+ // neg
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> neg(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return 0 - self;
+ }
+ template <class A>
+ inline batch<float, A> neg(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, _mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
+ }
+ template <class A>
+ inline batch<double, A> neg(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, _mm256_castsi256_pd(_mm256_set1_epi64x(0x8000000000000000)));
+ }
+
+ // neq
+ template <class A>
+ inline batch_bool<float, A> neq(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_ps(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A>
+ inline batch_bool<double, A> neq(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_cmp_pd(self, other, _CMP_NEQ_UQ);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return ~(self == other);
+ }
+
+ template <class A>
+ inline batch_bool<float, A> neq(batch_bool<float, A> const& self, batch_bool<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_ps(self, other);
+ }
+ template <class A>
+ inline batch_bool<double, A> neq(batch_bool<double, A> const& self, batch_bool<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_xor_pd(self, other);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> neq(batch_bool<T, A> const& self, batch_bool<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_castps_si256(_mm256_xor_ps(_mm256_castsi256_ps(self.data), _mm256_castsi256_ps(other.data)));
+ }
+
+ // reciprocal
+ template <class A>
+ inline batch<float, A> reciprocal(batch<float, A> const& self,
+ kernel::requires_arch<avx>) noexcept
+ {
+ return _mm256_rcp_ps(self);
+ }
+
+ // reduce_add
+ template <class A>
+ inline float reduce_add(batch<float, A> const& rhs, requires_arch<avx>) noexcept
+ {
+ // Warning about _mm256_hadd_ps:
+ // _mm256_hadd_ps(a,b) gives
+ // (a0+a1,a2+a3,b0+b1,b2+b3,a4+a5,a6+a7,b4+b5,b6+b7). Hence we can't
+ // rely on a naive use of this method
+ // rhs = (x0, x1, x2, x3, x4, x5, x6, x7)
+ // tmp = (x4, x5, x6, x7, x0, x1, x2, x3)
+ __m256 tmp = _mm256_permute2f128_ps(rhs, rhs, 1);
+ // tmp = (x4+x0, x5+x1, x6+x2, x7+x3, x0+x4, x1+x5, x2+x6, x3+x7)
+ tmp = _mm256_add_ps(rhs, tmp);
+ // tmp = (x4+x0+x5+x1, x6+x2+x7+x3, -, -, -, -, -, -)
+ tmp = _mm256_hadd_ps(tmp, tmp);
+ // tmp = (x4+x0+x5+x1+x6+x2+x7+x3, -, -, -, -, -, -, -)
+ tmp = _mm256_hadd_ps(tmp, tmp);
+ return _mm_cvtss_f32(_mm256_extractf128_ps(tmp, 0));
+ }
+ template <class A>
+ inline double reduce_add(batch<double, A> const& rhs, requires_arch<avx>) noexcept
+ {
+ // rhs = (x0, x1, x2, x3)
+ // tmp = (x2, x3, x0, x1)
+ __m256d tmp = _mm256_permute2f128_pd(rhs, rhs, 1);
+ // tmp = (x2+x0, x3+x1, -, -)
+ tmp = _mm256_add_pd(rhs, tmp);
+ // tmp = (x2+x0+x3+x1, -, -, -)
+ tmp = _mm256_hadd_pd(tmp, tmp);
+ return _mm_cvtsd_f64(_mm256_extractf128_pd(tmp, 0));
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline T reduce_add(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ __m128i low, high;
+ detail::split_avx(self, low, high);
+ batch<T, sse4_2> blow(low), bhigh(high);
+ return reduce_add(blow) + reduce_add(bhigh);
+ }
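+
+ // Illustrative note (not part of the upstream sources): every reduce_add
+ // overload above is equivalent to the scalar loop
+ //
+ //     alignas(A::alignment()) T buf[batch<T, A>::size];
+ //     self.store_aligned(buf);
+ //     T acc = T(0);
+ //     for (T v : buf) acc += v;
+ //
+ // the AVX versions simply keep the accumulation inside vector registers.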
+
+ // reduce_max
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ inline T reduce_max(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(1, 0);
+ batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
+ batch<T, A> acc = max(self, step);
+ __m128i low = _mm256_castsi256_si128(acc);
+ return reduce_max(batch<T, sse4_2>(low));
+ }
+
+ // reduce_min
+ template <class A, class T, class _ = typename std::enable_if<(sizeof(T) <= 2), void>::type>
+ inline T reduce_min(batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = detail::shuffle(1, 0);
+ batch<T, A> step = _mm256_permute2f128_si256(self, self, mask);
+ batch<T, A> acc = min(self, step);
+ __m128i low = _mm256_castsi256_si128(acc);
+ return reduce_min(batch<T, sse4_2>(low));
+ }
+
+ // rsqrt
+ template <class A>
+ inline batch<float, A> rsqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_rsqrt_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> rsqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_cvtps_pd(_mm_rsqrt_ps(_mm256_cvtpd_ps(val)));
+ }
+
+ // sadd
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sadd(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ auto mask = (other >> (8 * sizeof(T) - 1));
+ auto self_pos_branch = min(std::numeric_limits<T>::max() - other, self);
+ auto self_neg_branch = max(std::numeric_limits<T>::min() - other, self);
+ return other + select(batch_bool<T, A>(mask.data), self_neg_branch, self_pos_branch);
+ }
+ else
+ {
+ const auto diffmax = std::numeric_limits<T>::max() - self;
+ const auto mindiff = min(diffmax, other);
+ return self + mindiff;
+ }
+ }
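+
+ // Illustrative note (not part of the upstream sources): sadd saturates
+ // instead of wrapping. In the signed branch, `mask` replicates the sign
+ // bit of `other` across each lane and selects the pre-clamped operand:
+ // for other >= 0 the result is other + min(max_T - other, self), which
+ // caps at max_T; for other < 0 it is other + max(min_T - other, self),
+ // which caps at min_T. Worked example for int8_t: sadd(100, 50) computes
+ // 50 + min(127 - 50, 100) = 50 + 77 = 127. The unsigned branch computes
+ // self + min(other, max_T - self), capping at max_T.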
+
+ // select
+ template <class A>
+ inline batch<float, A> select(batch_bool<float, A> const& cond, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return _mm256_blendv_ps(false_br, true_br, cond);
+ }
+ template <class A>
+ inline batch<double, A> select(batch_bool<double, A> const& cond, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return _mm256_blendv_pd(false_br, true_br, cond);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool<T, A> const& cond, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ __m128i cond_low, cond_hi;
+ detail::split_avx(cond, cond_low, cond_hi);
+
+ __m128i true_low, true_hi;
+ detail::split_avx(true_br, true_low, true_hi);
+
+ __m128i false_low, false_hi;
+ detail::split_avx(false_br, false_low, false_hi);
+
+ __m128i res_low = select(batch_bool<T, sse4_2>(cond_low), batch<T, sse4_2>(true_low), batch<T, sse4_2>(false_low), sse4_2 {});
+ __m128i res_hi = select(batch_bool<T, sse4_2>(cond_hi), batch<T, sse4_2>(true_hi), batch<T, sse4_2>(false_hi), sse4_2 {});
+ return detail::merge_sse(res_low, res_hi);
+ }
+ template <class A, class T, bool... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> select(batch_bool_constant<batch<T, A>, Values...> const&, batch<T, A> const& true_br, batch<T, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ return select(batch_bool<T, A> { Values... }, true_br, false_br, avx2 {});
+ }
+
+ template <class A, bool... Values>
+ inline batch<float, A> select(batch_bool_constant<batch<float, A>, Values...> const&, batch<float, A> const& true_br, batch<float, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = batch_bool_constant<batch<float, A>, Values...>::mask();
+ return _mm256_blend_ps(false_br, true_br, mask);
+ }
+
+ template <class A, bool... Values>
+ inline batch<double, A> select(batch_bool_constant<batch<double, A>, Values...> const&, batch<double, A> const& true_br, batch<double, A> const& false_br, requires_arch<avx>) noexcept
+ {
+ constexpr auto mask = batch_bool_constant<batch<double, A>, Values...>::mask();
+ return _mm256_blend_pd(false_br, true_br, mask);
+ }
+
+ // set
+ template <class A, class... Values>
+ inline batch<float, A> set(batch<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<float, A>::size, "consistent init");
+ return _mm256_setr_ps(values...);
+ }
+
+ template <class A, class... Values>
+ inline batch<double, A> set(batch<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch<double, A>::size, "consistent init");
+ return _mm256_setr_pd(values...);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3) noexcept
+ {
+ return _mm256_set_epi64x(v3, v2, v1, v0);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7) noexcept
+ {
+ return _mm256_setr_epi32(v0, v1, v2, v3, v4, v5, v6, v7);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15) noexcept
+ {
+ return _mm256_setr_epi16(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> set(batch<T, A> const&, requires_arch<avx>, T v0, T v1, T v2, T v3, T v4, T v5, T v6, T v7, T v8, T v9, T v10, T v11, T v12, T v13, T v14, T v15,
+ T v16, T v17, T v18, T v19, T v20, T v21, T v22, T v23, T v24, T v25, T v26, T v27, T v28, T v29, T v30, T v31) noexcept
+ {
+ return _mm256_setr_epi8(v0, v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31);
+ }
+
+ template <class A, class T, class... Values, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch_bool<T, A> set(batch_bool<T, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ return set(batch<T, A>(), A {}, static_cast<T>(values ? -1LL : 0LL)...).data;
+ }
+
+ template <class A, class... Values>
+ inline batch_bool<float, A> set(batch_bool<float, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<float, A>::size, "consistent init");
+ return _mm256_castsi256_ps(set(batch<int32_t, A>(), A {}, static_cast<int32_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ template <class A, class... Values>
+ inline batch_bool<double, A> set(batch_bool<double, A> const&, requires_arch<avx>, Values... values) noexcept
+ {
+ static_assert(sizeof...(Values) == batch_bool<double, A>::size, "consistent init");
+ return _mm256_castsi256_pd(set(batch<int64_t, A>(), A {}, static_cast<int64_t>(values ? -1LL : 0LL)...).data);
+ }
+
+ // shuffle
+ template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3, ITy I4, ITy I5, ITy I6, ITy I7>
+ inline batch<float, A> shuffle(batch<float, A> const& x, batch<float, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3, I4, I5, I6, I7> mask, requires_arch<avx>) noexcept
+ {
+ constexpr uint32_t smask = detail::mod_shuffle(I0, I1, I2, I3);
+ // shuffle within lane
+ if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I0 < 4 && I1 < 4 && I2 >= 8 && I2 < 12 && I3 >= 8 && I3 < 12)
+ return _mm256_shuffle_ps(x, y, smask);
+
+ // shuffle within opposite lane
+ if (I4 == (I0 + 4) && I5 == (I1 + 4) && I6 == (I2 + 4) && I7 == (I3 + 4) && I2 < 4 && I3 < 4 && I0 >= 8 && I0 < 12 && I1 >= 8 && I1 < 12)
+ return _mm256_shuffle_ps(y, x, smask);
+
+ return shuffle(x, y, mask, generic {});
+ }
+
+ template <class A, class ITy, ITy I0, ITy I1, ITy I2, ITy I3>
+ inline batch<double, A> shuffle(batch<double, A> const& x, batch<double, A> const& y, batch_constant<batch<ITy, A>, I0, I1, I2, I3> mask, requires_arch<avx>) noexcept
+ {
+ constexpr uint32_t smask = (I0 & 0x1) | ((I1 & 0x1) << 1) | ((I2 & 0x1) << 2) | ((I3 & 0x1) << 3);
+ // shuffle within lane
+ if (I0 < 2 && I1 >= 4 && I1 < 6 && I2 >= 2 && I2 < 4 && I3 >= 6)
+ return _mm256_shuffle_pd(x, y, smask);
+
+ // shuffle within opposite lane
+ if (I1 < 2 && I0 >= 4 && I0 < 6 && I3 >= 2 && I3 < 4 && I2 >= 6)
+ return _mm256_shuffle_pd(y, x, smask);
+
+ return shuffle(x, y, mask, generic {});
+ }
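+
+ // Illustrative note (not part of the upstream sources): the shuffle
+ // intrinsics used above operate per 128-bit lane; _mm256_shuffle_ps takes
+ // its first two outputs per lane from the first operand and its last two
+ // from the second. The conditions above accept only masks of that shape
+ // and fall back to the generic implementation otherwise. E.g. the float
+ // mask {0, 1, 8, 9, 4, 5, 12, 13} matches the first branch and compiles
+ // to a single vshufps.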
+
+ // slide_left
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_left(batch<T, A> const& x, requires_arch<avx>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ __m128i low = _mm256_castsi256_si128(x);
+ auto y = _mm_slli_si128(low, M);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, y, 1);
+ }
+ if (BitCount == 128)
+ {
+ __m128i low = _mm256_castsi256_si128(x);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, low, 1);
+ }
+ // shifting by [0, 128[ bits
+ constexpr unsigned M = BitCount / 8;
+
+ __m128i low = _mm256_castsi256_si128(x);
+ auto ylow = _mm_slli_si128(low, M);
+ auto zlow = _mm_srli_si128(low, 16 - M);
+
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ auto yhigh = _mm_slli_si128(high, M);
+
+ __m256i res = _mm256_castsi128_si256(ylow);
+ return _mm256_insertf128_si256(res, _mm_or_si128(yhigh, zlow), 1);
+ }
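+
+ // Illustrative note (not part of the upstream sources): for shifts below
+ // 128 bits both halves are shifted independently and the bytes that fall
+ // off the top of the low half are OR-ed into the bottom of the high half.
+ // E.g. sliding left by 4 bytes (N == 4, so M == 4):
+ //
+ //     low'  = _mm_slli_si128(low, 4)          // low  << 32 bits
+ //     carry = _mm_srli_si128(low, 12)         // top 4 bytes of low
+ //     high' = _mm_slli_si128(high, 4) | carry // high << 32 bits, plus carry
+ //
+ // slide_right below mirrors the same scheme in the other direction.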
+
+ // slide_right
+ template <size_t N, class A, class T>
+ inline batch<T, A> slide_right(batch<T, A> const& x, requires_arch<avx>) noexcept
+ {
+ constexpr unsigned BitCount = N * 8;
+ if (BitCount == 0)
+ {
+ return x;
+ }
+ if (BitCount >= 256)
+ {
+ return batch<T, A>(T(0));
+ }
+ if (BitCount > 128)
+ {
+ constexpr unsigned M = (BitCount - 128) / 8;
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ __m128i y = _mm_srli_si128(high, M);
+ __m256i zero = _mm256_setzero_si256();
+ return _mm256_insertf128_si256(zero, y, 0);
+ }
+ if (BitCount == 128)
+ {
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ return _mm256_castsi128_si256(high);
+ }
+ // shifting by [0, 128[ bits
+ constexpr unsigned M = BitCount / 8;
+
+ __m128i low = _mm256_castsi256_si128(x);
+ auto ylow = _mm_srli_si128(low, M);
+
+ __m128i high = _mm256_extractf128_si256(x, 1);
+ auto yhigh = _mm_srli_si128(high, M);
+ auto zhigh = _mm_slli_si128(high, 16 - M);
+
+ __m256i res = _mm256_castsi128_si256(_mm_or_si128(ylow, zhigh));
+ return _mm256_insertf128_si256(res, yhigh, 1);
+ }
+
+ // sqrt
+ template <class A>
+ inline batch<float, A> sqrt(batch<float, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_sqrt_ps(val);
+ }
+ template <class A>
+ inline batch<double, A> sqrt(batch<double, A> const& val, requires_arch<avx>) noexcept
+ {
+ return _mm256_sqrt_pd(val);
+ }
+
+ // ssub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> ssub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ if (std::is_signed<T>::value)
+ {
+ return sadd(self, -other);
+ }
+ else
+ {
+ const auto diff = min(self, other);
+ return self - diff;
+ }
+ }
+
+ // store_aligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_si256((__m256i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_aligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_si256((__m256i*)mem, self);
+ }
+ template <class A>
+ inline void store_aligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_ps(mem, self);
+ }
+ template <class A>
+ inline void store_aligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_store_pd(mem, self);
+ }
+
+ // store_unaligned
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_si256((__m256i*)mem, self);
+ }
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline void store_unaligned(T* mem, batch_bool<T, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_si256((__m256i*)mem, self);
+ }
+ template <class A>
+ inline void store_unaligned(float* mem, batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_ps(mem, self);
+ }
+ template <class A>
+ inline void store_unaligned(double* mem, batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_storeu_pd(mem, self);
+ }
+
+ // sub
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> sub(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ return detail::fwd_to_sse([](__m128i s, __m128i o) noexcept
+ { return sub(batch<T, sse4_2>(s), batch<T, sse4_2>(o)); },
+ self, other);
+ }
+ template <class A>
+ inline batch<float, A> sub(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_sub_ps(self, other);
+ }
+ template <class A>
+ inline batch<double, A> sub(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ return _mm256_sub_pd(self, other);
+ }
+
+ // swizzle (dynamic mask)
+ template <class A>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch<uint32_t, A> mask, requires_arch<avx>) noexcept
+ {
+ // duplicate low and high part of input
+ __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
+ __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
+
+ __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
+ __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+
+ // normalize mask
+ batch<uint32_t, A> half_mask = mask % 4;
+
+ // permute within each lane
+ __m256 r0 = _mm256_permutevar_ps(low_low, half_mask);
+ __m256 r1 = _mm256_permutevar_ps(hi_hi, half_mask);
+
+ // mask to choose the right lane
+ batch_bool<uint32_t, A> blend_mask = mask >= 4;
+
+ // blend the two permutes
+ return _mm256_blendv_ps(r0, r1, batch_bool_cast<float>(blend_mask));
+ }
+
+ template <class A>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch<uint64_t, A> mask, requires_arch<avx>) noexcept
+ {
+ // duplicate low and high part of input
+ __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
+ __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
+
+ __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
+ __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
+
+ // normalize mask
+ batch<uint64_t, A> half_mask = -(mask & 1);
+
+ // permute within each lane
+ __m256d r0 = _mm256_permutevar_pd(low_low, half_mask);
+ __m256d r1 = _mm256_permutevar_pd(hi_hi, half_mask);
+
+ // mask to choose the right lane
+ batch_bool<uint64_t, A> blend_mask = mask >= 2;
+
+ // blend the two permutes
+ return _mm256_blendv_pd(r0, r1, batch_bool_cast<double>(blend_mask));
+ }
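+
+ // Illustrative note (not part of the upstream sources): AVX1 has no
+ // cross-lane variable permute, so the swizzles above (and the
+ // constant-mask versions below) duplicate the input as (low, low) and
+ // (high, high), permute each copy within its 128-bit lanes with
+ // vpermilps/vpermilpd, then blend lane-by-lane depending on whether the
+ // requested index falls in the low or the high half. E.g. for floats,
+ // mask = {7, 0, 3, 5, 1, 6, 2, 4} uses half_mask = mask % 4 for the
+ // in-lane permutes and (mask >= 4) to pick between the two results.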
+
+ template <class A, typename T, detail::enable_sized_integral_t<T, 4> = 0>
+ inline batch<T, A> swizzle(batch<T, A> const& self, batch<uint32_t, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<float>(self), mask));
+ }
+
+ template <class A, typename T, detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch<T, A>
+ swizzle(batch<T, A> const& self, batch<uint64_t, A> const& mask, requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<double>(self), mask));
+ }
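+
+ // Illustrative sketch only (assumes the runtime-index swizzle entry point
+ // exposed by xsimd): reversing the eight floats of a batch `v`
+ //   batch<uint32_t, A> idx(7, 6, 5, 4, 3, 2, 1, 0);
+ //   swizzle(v, idx); // -> { v[7], v[6], ..., v[0] }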
+
+ // swizzle (constant mask)
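+ // Same duplicate / permute / blend scheme as the dynamic version above,
+ // but the indices are known at compile time, so the lane selection folds
+ // into the immediate operand of _mm256_blend_ps / _mm256_blend_pd.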
+ template <class A, uint32_t V0, uint32_t V1, uint32_t V2, uint32_t V3, uint32_t V4, uint32_t V5, uint32_t V6, uint32_t V7>
+ inline batch<float, A> swizzle(batch<float, A> const& self, batch_constant<batch<uint32_t, A>, V0, V1, V2, V3, V4, V5, V6, V7>, requires_arch<avx>) noexcept
+ {
+ // broadcast the high 128-bit lane (hi_hi) and the low 128-bit lane (low_low) of the input
+ __m256 hi = _mm256_castps128_ps256(_mm256_extractf128_ps(self, 1));
+ __m256 hi_hi = _mm256_insertf128_ps(self, _mm256_castps256_ps128(hi), 0);
+
+ __m256 low = _mm256_castps128_ps256(_mm256_castps256_ps128(self));
+ __m256 low_low = _mm256_insertf128_ps(self, _mm256_castps256_ps128(low), 1);
+
+ // normalize mask
+ batch_constant<batch<uint32_t, A>, (V0 % 4), (V1 % 4), (V2 % 4), (V3 % 4), (V4 % 4), (V5 % 4), (V6 % 4), (V7 % 4)> half_mask;
+
+ // permute within each lane
+ __m256 r0 = _mm256_permutevar_ps(low_low, (batch<uint32_t, A>)half_mask);
+ __m256 r1 = _mm256_permutevar_ps(hi_hi, (batch<uint32_t, A>)half_mask);
+
+ // mask to choose the right lane
+ batch_bool_constant<batch<uint32_t, A>, (V0 >= 4), (V1 >= 4), (V2 >= 4), (V3 >= 4), (V4 >= 4), (V5 >= 4), (V6 >= 4), (V7 >= 4)> blend_mask;
+
+ // blend the two permutes
+ constexpr auto mask = blend_mask.mask();
+ return _mm256_blend_ps(r0, r1, mask);
+ }
+
+ template <class A, uint64_t V0, uint64_t V1, uint64_t V2, uint64_t V3>
+ inline batch<double, A> swizzle(batch<double, A> const& self, batch_constant<batch<uint64_t, A>, V0, V1, V2, V3>, requires_arch<avx>) noexcept
+ {
+ // broadcast the high 128-bit lane (hi_hi) and the low 128-bit lane (low_low) of the input
+ __m256d hi = _mm256_castpd128_pd256(_mm256_extractf128_pd(self, 1));
+ __m256d hi_hi = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(hi), 0);
+
+ __m256d low = _mm256_castpd128_pd256(_mm256_castpd256_pd128(self));
+ __m256d low_low = _mm256_insertf128_pd(self, _mm256_castpd256_pd128(low), 1);
+
+ // normalize mask
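+ // (V % 2) * -1 is the compile-time analogue of -(mask & 1): 0 stays 0 and
+ // 1 wraps to all-ones, so _mm256_permutevar_pd sees bit 1 set only when
+ // the in-lane index is 1.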
+ batch_constant<batch<uint64_t, A>, (V0 % 2) * -1, (V1 % 2) * -1, (V2 % 2) * -1, (V3 % 2) * -1> half_mask;
+
+ // permute within each lane
+ __m256d r0 = _mm256_permutevar_pd(low_low, (batch<uint64_t, A>)half_mask);
+ __m256d r1 = _mm256_permutevar_pd(hi_hi, (batch<uint64_t, A>)half_mask);
+
+ // mask to choose the right lane
+ batch_bool_constant<batch<uint64_t, A>, (V0 >= 2), (V1 >= 2), (V2 >= 2), (V3 >= 2)> blend_mask;
+
+ // blend the two permutes
+ constexpr auto mask = blend_mask.mask();
+ return _mm256_blend_pd(r0, r1, mask);
+ }
+ template <class A,
+ typename T,
+ uint32_t V0,
+ uint32_t V1,
+ uint32_t V2,
+ uint32_t V3,
+ uint32_t V4,
+ uint32_t V5,
+ uint32_t V6,
+ uint32_t V7,
+ detail::enable_sized_integral_t<T, 4> = 0>
+ inline batch<T, A> swizzle(batch<T, A> const& self,
+ batch_constant<batch<uint32_t, A>,
+ V0,
+ V1,
+ V2,
+ V3,
+ V4,
+ V5,
+ V6,
+ V7> const& mask,
+ requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<float>(self), mask));
+ }
+
+ template <class A,
+ typename T,
+ uint64_t V0,
+ uint64_t V1,
+ uint64_t V2,
+ uint64_t V3,
+ detail::enable_sized_integral_t<T, 8> = 0>
+ inline batch<T, A>
+ swizzle(batch<T, A> const& self,
+ batch_constant<batch<uint64_t, A>, V0, V1, V2, V3> const& mask,
+ requires_arch<avx>) noexcept
+ {
+ return bitwise_cast<T>(
+ swizzle(bitwise_cast<double>(self), mask));
+ }
+
+ // trunc
+ template <class A>
+ inline batch<float, A> trunc(batch<float, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_ps(self, _MM_FROUND_TO_ZERO);
+ }
+ template <class A>
+ inline batch<double, A> trunc(batch<double, A> const& self, requires_arch<avx>) noexcept
+ {
+ return _mm256_round_pd(self, _MM_FROUND_TO_ZERO);
+ }
+
+ // zip_hi
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_hi(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
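+ // AVX1 has no 256-bit integer unpack, so for 8- and 16-bit elements
+ // interleave the high 128-bit halves with SSE unpacks and reassemble.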
+ // extract the high 128-bit halves
+ __m128i self_hi = _mm256_extractf128_si256(self, 1);
+ __m128i other_hi = _mm256_extractf128_si256(other, 1);
+
+ // interleave
+ __m128i res_lo, res_hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ res_lo = _mm_unpacklo_epi8(self_hi, other_hi);
+ res_hi = _mm_unpackhi_epi8(self_hi, other_hi);
+ }
+ else
+ {
+ res_lo = _mm_unpacklo_epi16(self_hi, other_hi);
+ res_hi = _mm_unpackhi_epi16(self_hi, other_hi);
+ }
+
+ // fuse the two 128-bit results back into one 256-bit register
+ return _mm256_castps_si256(
+ _mm256_insertf128_ps(
+ _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
+ _mm_castsi128_ps(res_hi),
+ 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ return _mm256_castps_si256(_mm256_permute2f128_ps(lo, hi, 0x31));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ return _mm256_castpd_si256(_mm256_permute2f128_pd(lo, hi, 0x31));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+ template <class A>
+ inline batch<float, A> zip_hi(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
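+ // the unpacks interleave within each 128-bit lane; 0x31 then gathers the
+ // upper lane of both intermediates, i.e. the interleave of the high halves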
+ auto lo = _mm256_unpacklo_ps(self, other);
+ auto hi = _mm256_unpackhi_ps(self, other);
+ return _mm256_permute2f128_ps(lo, hi, 0x31);
+ }
+ template <class A>
+ inline batch<double, A> zip_hi(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_pd(self, other);
+ auto hi = _mm256_unpackhi_pd(self, other);
+ return _mm256_permute2f128_pd(lo, hi, 0x31);
+ }
+
+ // zip_lo
+ template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+ inline batch<T, A> zip_lo(batch<T, A> const& self, batch<T, A> const& other, requires_arch<avx>) noexcept
+ {
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1 || sizeof(T) == 2)
+ {
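+ // same approach as zip_hi, applied to the low 128-bit halves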
+ // extract the low 128-bit halves
+ __m128i self_lo = _mm256_extractf128_si256(self, 0);
+ __m128i other_lo = _mm256_extractf128_si256(other, 0);
+
+ // interleave
+ __m128i res_lo, res_hi;
+ XSIMD_IF_CONSTEXPR(sizeof(T) == 1)
+ {
+ res_lo = _mm_unpacklo_epi8(self_lo, other_lo);
+ res_hi = _mm_unpackhi_epi8(self_lo, other_lo);
+ }
+ else
+ {
+ res_lo = _mm_unpacklo_epi16(self_lo, other_lo);
+ res_hi = _mm_unpackhi_epi16(self_lo, other_lo);
+ }
+
+ // fuse the two 128-bit results back into one 256-bit register
+ return _mm256_castps_si256(
+ _mm256_insertf128_ps(
+ _mm256_castsi256_ps(_mm256_castsi128_si256(res_lo)),
+ _mm_castsi128_ps(res_hi),
+ 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 4)
+ {
+ auto lo = _mm256_unpacklo_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ auto hi = _mm256_unpackhi_ps(_mm256_castsi256_ps(self), _mm256_castsi256_ps(other));
+ return _mm256_castps_si256(_mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1));
+ }
+ else XSIMD_IF_CONSTEXPR(sizeof(T) == 8)
+ {
+ auto lo = _mm256_unpacklo_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ auto hi = _mm256_unpackhi_pd(_mm256_castsi256_pd(self), _mm256_castsi256_pd(other));
+ return _mm256_castpd_si256(_mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1));
+ }
+ else
+ {
+ assert(false && "unsupported arch/op combination");
+ return {};
+ }
+ }
+
+ template <class A>
+ inline batch<float, A> zip_lo(batch<float, A> const& self, batch<float, A> const& other, requires_arch<avx>) noexcept
+ {
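+ // the unpacks interleave within each 128-bit lane; keeping lo's lower lane
+ // and inserting hi's lower lane yields the interleave of the low halves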
+ auto lo = _mm256_unpacklo_ps(self, other);
+ auto hi = _mm256_unpackhi_ps(self, other);
+ return _mm256_insertf128_ps(lo, _mm256_castps256_ps128(hi), 1);
+ }
+ template <class A>
+ inline batch<double, A> zip_lo(batch<double, A> const& self, batch<double, A> const& other, requires_arch<avx>) noexcept
+ {
+ auto lo = _mm256_unpacklo_pd(self, other);
+ auto hi = _mm256_unpackhi_pd(self, other);
+ return _mm256_insertf128_pd(lo, _mm256_castpd256_pd128(hi), 1);
+ }
+ }
+}
+
+#endif