Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp')
-rw-r--r--  third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp  631
1 file changed, 631 insertions, 0 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
new file mode 100644
index 0000000000..e9e9065832
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_memory.hpp
@@ -0,0 +1,631 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_MEMORY_HPP
+#define XSIMD_GENERIC_MEMORY_HPP
+
+#include <algorithm>
+#include <complex>
+#include <stdexcept>
+
+#include "../../types/xsimd_batch_constant.hpp"
+#include "./xsimd_generic_details.hpp"
+
+namespace xsimd
+{
+ template <class batch_type, typename batch_type::value_type... Values>
+ struct batch_constant;
+
+ template <class batch_type, bool... Values>
+ struct batch_bool_constant;
+
+ namespace kernel
+ {
+
+ using namespace types;
+
+ // compress
+ namespace detail
+ {
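+ // Build a permutation mask that moves the indices of the set bits in
+ // `bitmask` to the front, in increasing order; e.g. for a 4-lane batch
+ // and bitmask 0b0101 the resulting mask is { 0, 2, 1, 3 }.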
+ template <class IT, class A, class I, size_t... Is>
+ inline batch<IT, A> create_compress_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
+ {
+ batch<IT, A> swizzle_mask(IT(0));
+ alignas(A::alignment()) IT mask_buffer[batch<IT, A>::size] = { Is... };
+ size_t inserted = 0;
+ for (size_t i = 0; i < sizeof...(Is); ++i)
+ if ((bitmask >> i) & 1u)
+ std::swap(mask_buffer[inserted++], mask_buffer[i]);
+ return batch<IT, A>::load_aligned(&mask_buffer[0]);
+ }
+ }
+
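+ // compress: pack the lanes selected by `mask` into the low lanes of the
+ // result and zero the rest, e.g. compress({a, b, c, d}, {1, 0, 1, 0})
+ // yields {a, c, 0, 0}. Implemented by zeroing the unselected lanes and
+ // swizzling the selected ones to the front.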
+ template <typename A, typename T>
+ inline batch<T, A>
+ compress(batch<T, A> const& x, batch_bool<T, A> const& mask,
+ kernel::requires_arch<generic>) noexcept
+ {
+ using IT = as_unsigned_integer_t<T>;
+ constexpr std::size_t size = batch_bool<T, A>::size;
+ auto bitmask = mask.mask();
+ auto z = select(mask, x, batch<T, A>((T)0));
+ auto compress_mask = detail::create_compress_swizzle_mask<IT, A>(bitmask, ::xsimd::detail::make_index_sequence<size>());
+ return swizzle(z, compress_mask);
+ }
+
+ // expand
+ namespace detail
+ {
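+ // For each output lane Is, record the input lane it should read: the
+ // number of selected lanes below Is. Unselected output lanes get a
+ // don't-care index that expand() zeroes out afterwards.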
+ template <class IT, class A, class I, size_t... Is>
+ inline batch<IT, A> create_expand_swizzle_mask(I bitmask, ::xsimd::detail::index_sequence<Is...>)
+ {
+ batch<IT, A> swizzle_mask(IT(0));
+ IT j = 0;
+ (void)std::initializer_list<bool> { ((swizzle_mask = insert(swizzle_mask, j, index<Is>())), (j += ((bitmask >> Is) & 1u)), true)... };
+ return swizzle_mask;
+ }
+ }
+
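+ // expand: distribute the low lanes of `x` into the positions selected by
+ // `mask` and zero the rest, e.g. expand({a, b, c, d}, {1, 0, 1, 0})
+ // yields {a, 0, b, 0}.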
+ template <typename A, typename T>
+ inline batch<T, A>
+ expand(batch<T, A> const& x, batch_bool<T, A> const& mask,
+ kernel::requires_arch<generic>) noexcept
+ {
+ constexpr std::size_t size = batch_bool<T, A>::size;
+ auto bitmask = mask.mask();
+ auto swizzle_mask = detail::create_expand_swizzle_mask<as_unsigned_integer_t<T>, A>(bitmask, ::xsimd::detail::make_index_sequence<size>());
+ auto z = swizzle(x, swizzle_mask);
+ return select(mask, z, batch<T, A>(T(0)));
+ }
+
+ // extract_pair
+ template <class A, class T>
+ inline batch<T, A> extract_pair(batch<T, A> const& self, batch<T, A> const& other, std::size_t i, requires_arch<generic>) noexcept
+ {
+ constexpr std::size_t size = batch<T, A>::size;
+ assert(i < size && "index in bounds");
+
+ alignas(A::alignment()) T self_buffer[size];
+ self.store_aligned(self_buffer);
+
+ alignas(A::alignment()) T other_buffer[size];
+ other.store_aligned(other_buffer);
+
+ alignas(A::alignment()) T concat_buffer[size];
+
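+ // result = { other[i], ..., other[size - 1], self[0], ..., self[i - 1] }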
+ for (std::size_t j = 0; j < size; ++j)
+ {
+ if (j < (size - i))
+ {
+ concat_buffer[j] = other_buffer[i + j];
+ }
+ if (j < i)
+ {
+ concat_buffer[size - 1 - j] = self_buffer[i - 1 - j];
+ }
+ }
+ return batch<T, A>::load_aligned(concat_buffer);
+ }
+
+ // gather
+ namespace detail
+ {
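+ // Recursive helper: lane 0 is the base case; the general case first
+ // gathers lanes 0..N-1, then inserts src[index[N]] into lane N.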
+ template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
+ inline batch<T, A> gather(U const* src, batch<V, A> const& index,
+ ::xsimd::index<N> I) noexcept
+ {
+ return insert(batch<T, A> {}, static_cast<T>(src[index.get(I)]), I);
+ }
+
+ template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
+ inline batch<T, A>
+ gather(U const* src, batch<V, A> const& index, ::xsimd::index<N> I) noexcept
+ {
+ static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");
+
+ const auto test = gather<N - 1, T, A>(src, index, {});
+ return insert(test, static_cast<T>(src[index.get(I)]), I);
+ }
+ } // namespace detail
+
+ template <typename T, typename A, typename V>
+ inline batch<T, A>
+ gather(batch<T, A> const&, T const* src, batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Index and destination sizes must match");
+
+ return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
+ }
+
+ // Gather with runtime indexes and mismatched strides.
+ template <typename T, typename A, typename U, typename V>
+ inline detail::sizes_mismatch_t<T, U, batch<T, A>>
+ gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Index and destination sizes must match");
+
+ return detail::gather<batch<V, A>::size - 1, T, A>(src, index, {});
+ }
+
+ // Gather with runtime indexes and matching strides.
+ template <typename T, typename A, typename U, typename V>
+ inline detail::stride_match_t<T, U, batch<T, A>>
+ gather(batch<T, A> const&, U const* src, batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Index and destination sizes must match");
+
+ return batch_cast<T>(kernel::gather(batch<U, A> {}, src, index, A {}));
+ }
+
+ // insert
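+ // Build a compile-time mask that is true everywhere except lane I, then
+ // blend: lane I is taken from a broadcast of `val`, every other lane
+ // from `self`.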
+ template <class A, class T, size_t I>
+ inline batch<T, A> insert(batch<T, A> const& self, T val, index<I>, requires_arch<generic>) noexcept
+ {
+ struct index_mask
+ {
+ static constexpr bool get(size_t index, size_t /* size*/)
+ {
+ return index != I;
+ }
+ };
+ batch<T, A> tmp(val);
+ return select(make_batch_bool_constant<batch<T, A>, index_mask>(), self, tmp);
+ }
+
+ // get
+ template <class A, size_t I, class T>
+ inline T get(batch<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) T buffer[batch<T, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[I];
+ }
+
+ template <class A, size_t I, class T>
+ inline T get(batch_bool<T, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) T buffer[batch_bool<T, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[I];
+ }
+
+ template <class A, size_t I, class T>
+ inline auto get(batch<std::complex<T>, A> const& self, ::xsimd::index<I>, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
+ {
+ alignas(A::alignment()) T buffer[batch<std::complex<T>, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[I];
+ }
+
+ template <class A, class T>
+ inline T get(batch<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) T buffer[batch<T, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[i];
+ }
+
+ template <class A, class T>
+ inline T get(batch_bool<T, A> const& self, std::size_t i, requires_arch<generic>) noexcept
+ {
+ alignas(A::alignment()) bool buffer[batch_bool<T, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[i];
+ }
+
+ template <class A, class T>
+ inline auto get(batch<std::complex<T>, A> const& self, std::size_t i, requires_arch<generic>) noexcept -> typename batch<std::complex<T>, A>::value_type
+ {
+ using T2 = typename batch<std::complex<T>, A>::value_type;
+ alignas(A::alignment()) T2 buffer[batch<std::complex<T>, A>::size];
+ self.store_aligned(&buffer[0]);
+ return buffer[i];
+ }
+
+ // load_aligned
+ namespace detail
+ {
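+ // Two conversion paths: with_fast_conversion loads a batch of T_in and
+ // converts it with the architecture's fast_cast; with_slow_conversion
+ // converts element by element through an aligned T_out buffer.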
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
+ {
+ using batch_type_in = batch<T_in, A>;
+ using batch_type_out = batch<T_out, A>;
+ return fast_cast(batch_type_in::load_aligned(mem), batch_type_out(), A {});
+ }
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_slow_conversion) noexcept
+ {
+ static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
+ using batch_type_out = batch<T_out, A>;
+ alignas(A::alignment()) T_out buffer[batch_type_out::size];
+ std::copy(mem, mem + batch_type_out::size, std::begin(buffer));
+ return batch_type_out::load_aligned(buffer);
+ }
+ }
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_aligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
+ {
+ return detail::load_aligned<A>(mem, cvt, A {}, detail::conversion_type<A, T_in, T_out> {});
+ }
+
+ // load_unaligned
+ namespace detail
+ {
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out>, requires_arch<generic>, with_fast_conversion) noexcept
+ {
+ using batch_type_in = batch<T_in, A>;
+ using batch_type_out = batch<T_out, A>;
+ return fast_cast(batch_type_in::load_unaligned(mem), batch_type_out(), A {});
+ }
+
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>, with_slow_conversion) noexcept
+ {
+ static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct load for this type combination");
+ return load_aligned<A>(mem, cvt, generic {}, with_slow_conversion {});
+ }
+ }
+ template <class A, class T_in, class T_out>
+ inline batch<T_out, A> load_unaligned(T_in const* mem, convert<T_out> cvt, requires_arch<generic>) noexcept
+ {
+ return detail::load_unaligned<A>(mem, cvt, generic {}, detail::conversion_type<A, T_in, T_out> {});
+ }
+
+ // rotate_left
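+ // Implemented as a constant swizzle: output lane i reads input lane
+ // (i - N) % size.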
+ template <size_t N, class A, class T>
+ inline batch<T, A> rotate_left(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ struct rotate_generator
+ {
+ static constexpr size_t get(size_t index, size_t size)
+ {
+ return (index - N) % size;
+ }
+ };
+
+ return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+ }
+
+ template <size_t N, class A, class T>
+ inline batch<std::complex<T>, A> rotate_left(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ {
+ return { rotate_left<N>(self.real()), rotate_left<N>(self.imag()) };
+ }
+
+ // rotate_right
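+ // Implemented as a constant swizzle: output lane i reads input lane
+ // (i + N) % size.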
+ template <size_t N, class A, class T>
+ inline batch<T, A> rotate_right(batch<T, A> const& self, requires_arch<generic>) noexcept
+ {
+ struct rotate_generator
+ {
+ static constexpr size_t get(size_t index, size_t size)
+ {
+ return (index + N) % size;
+ }
+ };
+
+ return swizzle(self, make_batch_constant<batch<as_unsigned_integer_t<T>, A>, rotate_generator>(), A {});
+ }
+
+ template <size_t N, class A, class T>
+ inline batch<std::complex<T>, A> rotate_right(batch<std::complex<T>, A> const& self, requires_arch<generic>) noexcept
+ {
+ return { rotate_right<N>(self.real()), rotate_right<N>(self.imag()) };
+ }
+
+ // Scatter with runtime indexes.
+ namespace detail
+ {
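+ // Recursive helper, mirroring the gather one above: scatter lanes
+ // 0..N-1 first, then write src[N] to dst[index[N]].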
+ template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N == 0, int>::type = 0>
+ inline void scatter(batch<T, A> const& src, U* dst,
+ batch<V, A> const& index,
+ ::xsimd::index<N> I) noexcept
+ {
+ dst[index.get(I)] = static_cast<U>(src.get(I));
+ }
+
+ template <size_t N, typename T, typename A, typename U, typename V, typename std::enable_if<N != 0, int>::type = 0>
+ inline void
+ scatter(batch<T, A> const& src, U* dst, batch<V, A> const& index,
+ ::xsimd::index<N> I) noexcept
+ {
+ static_assert(N <= batch<V, A>::size, "Incorrect value in recursion!");
+
+ kernel::detail::scatter<N - 1, T, A, U, V>(
+ src, dst, index, {});
+ dst[index.get(I)] = static_cast<U>(src.get(I));
+ }
+ } // namespace detail
+
+ template <typename A, typename T, typename V>
+ inline void
+ scatter(batch<T, A> const& src, T* dst,
+ batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Source and index sizes must match");
+ kernel::detail::scatter<batch<V, A>::size - 1, T, A, T, V>(
+ src, dst, index, {});
+ }
+
+ template <typename A, typename T, typename U, typename V>
+ inline detail::sizes_mismatch_t<T, U, void>
+ scatter(batch<T, A> const& src, U* dst,
+ batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Source and index sizes must match");
+ kernel::detail::scatter<batch<V, A>::size - 1, T, A, U, V>(
+ src, dst, index, {});
+ }
+
+ template <typename A, typename T, typename U, typename V>
+ inline detail::stride_match_t<T, U, void>
+ scatter(batch<T, A> const& src, U* dst,
+ batch<V, A> const& index,
+ kernel::requires_arch<generic>) noexcept
+ {
+ static_assert(batch<T, A>::size == batch<V, A>::size,
+ "Source and index sizes must match");
+ const auto tmp = batch_cast<U>(src);
+ kernel::scatter<A>(tmp, dst, index, A {});
+ }
+
+ // shuffle
+ namespace detail
+ {
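+ // Compile-time predicates that classify the index pack so shuffle()
+ // below can dispatch to cheaper primitives (a single-operand swizzle,
+ // zip_lo/zip_hi, or a lane-wise select) instead of the generic
+ // two-swizzle-plus-select fallback.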
+ constexpr bool is_swizzle_fst(size_t)
+ {
+ return true;
+ }
+ template <typename ITy, typename... ITys>
+ constexpr bool is_swizzle_fst(size_t bsize, ITy index, ITys... indices)
+ {
+ return index < bsize && is_swizzle_fst(bsize, indices...);
+ }
+ constexpr bool is_swizzle_snd(size_t)
+ {
+ return true;
+ }
+ template <typename ITy, typename... ITys>
+ constexpr bool is_swizzle_snd(size_t bsize, ITy index, ITys... indices)
+ {
+ return index >= bsize && is_swizzle_snd(bsize, indices...);
+ }
+
+ constexpr bool is_zip_lo(size_t)
+ {
+ return true;
+ }
+
+ template <typename ITy0, typename ITy1, typename... ITys>
+ constexpr bool is_zip_lo(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
+ {
+ return index0 == (bsize - (sizeof...(indices) + 2)) && index1 == (2 * bsize - (sizeof...(indices) + 2)) && is_zip_lo(bsize, indices...);
+ }
+
+ constexpr bool is_zip_hi(size_t)
+ {
+ return true;
+ }
+
+ template <typename ITy0, typename ITy1, typename... ITys>
+ constexpr bool is_zip_hi(size_t bsize, ITy0 index0, ITy1 index1, ITys... indices)
+ {
+ return index0 == (bsize / 2 + bsize - (sizeof...(indices) + 2)) && index1 == (bsize / 2 + 2 * bsize - (sizeof...(indices) + 2)) && is_zip_hi(bsize, indices...);
+ }
+
+ constexpr bool is_select(size_t)
+ {
+ return true;
+ }
+
+ template <typename ITy, typename... ITys>
+ constexpr bool is_select(size_t bsize, ITy index, ITys... indices)
+ {
+ return (index < bsize ? index : index - bsize) == (bsize - sizeof...(ITys)) && is_select(bsize, indices...);
+ }
+
+ }
+
+ template <class A, typename T, typename ITy, ITy... Indices>
+ inline batch<T, A> shuffle(batch<T, A> const& x, batch<T, A> const& y, batch_constant<batch<ITy, A>, Indices...>, requires_arch<generic>) noexcept
+ {
+ constexpr size_t bsize = sizeof...(Indices);
+
+ // Detect common patterns
+ XSIMD_IF_CONSTEXPR(detail::is_swizzle_fst(bsize, Indices...))
+ {
+ return swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? 0 /* never happens */ : Indices)...>());
+ }
+
+ XSIMD_IF_CONSTEXPR(detail::is_swizzle_snd(bsize, Indices...))
+ {
+ return swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : 0 /* never happens */)...>());
+ }
+
+ XSIMD_IF_CONSTEXPR(detail::is_zip_lo(bsize, Indices...))
+ {
+ return zip_lo(x, y);
+ }
+
+ XSIMD_IF_CONSTEXPR(detail::is_zip_hi(bsize, Indices...))
+ {
+ return zip_hi(x, y);
+ }
+
+ XSIMD_IF_CONSTEXPR(detail::is_select(bsize, Indices...))
+ {
+ return select(batch_bool_constant<batch<T, A>, (Indices < bsize)...>(), x, y);
+ }
+
+#if defined(__has_builtin)
+#if __has_builtin(__builtin_shufflevector)
+#define builtin_shuffle __builtin_shufflevector
+#endif
+#endif
+
+#if defined(builtin_shuffle)
+ return builtin_shuffle(x.data, y.data, Indices...);
+
+// FIXME: my experiments show that GCC only correctly optimizes this builtin
+// starting at GCC 13, where it already has __builtin_shufflevector
+//
+// #elif __has_builtin(__builtin_shuffle) || GCC >= 6
+// typedef ITy integer_vector_type __attribute__((vector_size(sizeof(batch<ITy, A>))));
+// return __builtin_shuffle(x.data, y.data, integer_vector_type{Indices...});
+#else
+ // Use a generic pattern. It is suboptimal but clang optimizes this
+ // pretty well.
+ batch<T, A> x_lane = swizzle(x, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+ batch<T, A> y_lane = swizzle(y, batch_constant<batch<ITy, A>, ((Indices >= bsize) ? (Indices - bsize) : Indices)...>());
+ batch_bool_constant<batch<T, A>, (Indices < bsize)...> select_x_lane;
+ return select(select_x_lane, x_lane, y_lane);
+#endif
+ }
+
+ // store
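+ // Store a batch_bool as an array of bool: widen it to a batch<T>, spill
+ // that to an aligned buffer, then narrow each lane to bool.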
+ template <class T, class A>
+ inline void store(batch_bool<T, A> const& self, bool* mem, requires_arch<generic>) noexcept
+ {
+ using batch_type = batch<T, A>;
+ constexpr auto size = batch_bool<T, A>::size;
+ alignas(A::alignment()) T buffer[size];
+ kernel::store_aligned<A>(&buffer[0], batch_type(self), A {});
+ for (std::size_t i = 0; i < size; ++i)
+ mem[i] = bool(buffer[i]);
+ }
+
+ // store_aligned
+ template <class A, class T_in, class T_out>
+ inline void store_aligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
+ {
+ static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
+ alignas(A::alignment()) T_in buffer[batch<T_in, A>::size];
+ store_aligned(&buffer[0], self);
+ std::copy(std::begin(buffer), std::end(buffer), mem);
+ }
+
+ // store_unaligned
+ template <class A, class T_in, class T_out>
+ inline void store_unaligned(T_out* mem, batch<T_in, A> const& self, requires_arch<generic>) noexcept
+ {
+ static_assert(!std::is_same<T_in, T_out>::value, "there should be a direct store for this type combination");
+ return store_aligned<A>(mem, self, generic {});
+ }
+
+ // swizzle
+ template <class A, class T, class ITy, ITy... Vs>
+ inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch_constant<batch<ITy, A>, Vs...> mask, requires_arch<generic>) noexcept
+ {
+ return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
+ }
+
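+ // Dynamic-index swizzle: scalar fallback that spills the value and index
+ // batches to aligned buffers and picks self_buffer[mask_buffer[i]] for
+ // each output lane.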
+ template <class A, class T, class ITy>
+ inline batch<T, A> swizzle(batch<T, A> const& self, batch<ITy, A> mask, requires_arch<generic>) noexcept
+ {
+ constexpr size_t size = batch<T, A>::size;
+ alignas(A::alignment()) T self_buffer[size];
+ store_aligned(&self_buffer[0], self);
+
+ alignas(A::alignment()) ITy mask_buffer[size];
+ store_aligned(&mask_buffer[0], mask);
+
+ alignas(A::alignment()) T out_buffer[size];
+ for (size_t i = 0; i < size; ++i)
+ out_buffer[i] = self_buffer[mask_buffer[i]];
+ return batch<T, A>::load_aligned(out_buffer);
+ }
+
+ template <class A, class T, class ITy>
+ inline batch<std::complex<T>, A> swizzle(batch<std::complex<T>, A> const& self, batch<ITy, A> mask, requires_arch<generic>) noexcept
+ {
+ return { swizzle(self.real(), mask), swizzle(self.imag(), mask) };
+ }
+
+ // load_complex_aligned
+ namespace detail
+ {
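+ // Fallback stubs: the static_assert is only evaluated on instantiation,
+ // so these produce a compile-time error when the target architecture
+ // does not provide its own load_complex / complex_high / complex_low.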
+ template <class A, class T>
+ inline batch<std::complex<T>, A> load_complex(batch<T, A> const& /*hi*/, batch<T, A> const& /*lo*/, requires_arch<generic>) noexcept
+ {
+ static_assert(std::is_same<T, void>::value, "load_complex not implemented for the required architecture");
+ }
+
+ template <class A, class T>
+ inline batch<T, A> complex_high(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
+ {
+ static_assert(std::is_same<T, void>::value, "complex_high not implemented for the required architecture");
+ }
+
+ template <class A, class T>
+ inline batch<T, A> complex_low(batch<std::complex<T>, A> const& /*src*/, requires_arch<generic>) noexcept
+ {
+ static_assert(std::is_same<T, void>::value, "complex_low not implemented for the required architecture");
+ }
+ }
+
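+ // Load 2 * size scalars of interleaved (real, imag) pairs as two real
+ // batches (`hi` = first half, `lo` = second half) and let the
+ // architecture-specific load_complex deinterleave them.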
+ template <class A, class T_out, class T_in>
+ inline batch<std::complex<T_out>, A> load_complex_aligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
+ {
+ using real_batch = batch<T_out, A>;
+ T_in const* buffer = reinterpret_cast<T_in const*>(mem);
+ real_batch hi = real_batch::load_aligned(buffer),
+ lo = real_batch::load_aligned(buffer + real_batch::size);
+ return detail::load_complex(hi, lo, A {});
+ }
+
+ // load_complex_unaligned
+ template <class A, class T_out, class T_in>
+ inline batch<std::complex<T_out>, A> load_complex_unaligned(std::complex<T_in> const* mem, convert<std::complex<T_out>>, requires_arch<generic>) noexcept
+ {
+ using real_batch = batch<T_out, A>;
+ T_in const* buffer = reinterpret_cast<T_in const*>(mem);
+ real_batch hi = real_batch::load_unaligned(buffer),
+ lo = real_batch::load_unaligned(buffer + real_batch::size);
+ return detail::load_complex(hi, lo, A {});
+ }
+
+ // store_complex_aligned
+ template <class A, class T_out, class T_in>
+ inline void store_complex_aligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
+ {
+ using real_batch = batch<T_in, A>;
+ real_batch hi = detail::complex_high(src, A {});
+ real_batch lo = detail::complex_low(src, A {});
+ T_out* buffer = reinterpret_cast<T_out*>(dst);
+ lo.store_aligned(buffer);
+ hi.store_aligned(buffer + real_batch::size);
+ }
+
+ // store_complex_unaligned
+ template <class A, class T_out, class T_in>
+ inline void store_complex_unaligned(std::complex<T_out>* dst, batch<std::complex<T_in>, A> const& src, requires_arch<generic>) noexcept
+ {
+ using real_batch = batch<T_in, A>;
+ real_batch hi = detail::complex_high(src, A {});
+ real_batch lo = detail::complex_low(src, A {});
+ T_out* buffer = reinterpret_cast<T_out*>(dst);
+ lo.store_unaligned(buffer);
+ hi.store_unaligned(buffer + real_batch::size);
+ }
+
+ }
+
+}
+
+#endif