summaryrefslogtreecommitdiffstats
path: root/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp
diff options
context:
space:
mode:
Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp')
-rw-r--r--third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp316
1 files changed, 316 insertions, 0 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp
new file mode 100644
index 0000000000..14c62a0892
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_details.hpp
@@ -0,0 +1,316 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and *
+ * Martin Renou *
+ * Copyright (c) QuantStack *
+ * Copyright (c) Serge Guelton *
+ * *
+ * Distributed under the terms of the BSD 3-Clause License. *
+ * *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_DETAILS_HPP
+#define XSIMD_GENERIC_DETAILS_HPP
+
+#include <complex>
+
+#include "../../math/xsimd_rem_pio2.hpp"
+#include "../../types/xsimd_generic_arch.hpp"
+#include "../../types/xsimd_utils.hpp"
+#include "../xsimd_constants.hpp"
+
+namespace xsimd
+{
+ // Forward declarations of batch-level functions, so that code in this
+ // header (for instance the fast_cast and horner helpers below, which call
+ // batch_cast, bitwise_cast, select and fma) can name them before their
+ // definitions have been seen.
+ // Should we put them in a separate file?
+ template <class T, class A>
+ inline batch<T, A> abs(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> abs(batch<std::complex<T>, A> const& self) noexcept;
+ template <class T, class A>
+ inline bool any(batch_bool<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other) noexcept;
+ template <class A, class T_out, class T_in>
+ inline batch<T_out, A> batch_cast(batch<T_in, A> const&, batch<T_out, A> const& out) noexcept;
+ template <class T, class A>
+ inline batch<T, A> bitofsign(batch<T, A> const& self) noexcept;
+ template <class T_out, class T_in, class A>
+ inline batch<T_out, A> bitwise_cast(batch<T_in, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> cos(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> cosh(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> exp(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> fma(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+ template <class T, class A>
+ inline batch<T, A> fms(batch<T, A> const& x, batch<T, A> const& y, batch<T, A> const& z) noexcept;
+ template <class T, class A>
+ inline batch<T, A> frexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
+ template <class T, class A, uint64_t... Coefs>
+ inline batch<T, A> horner(const batch<T, A>& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> hypot(const batch<T, A>& self) noexcept;
+ template <class T, class A>
+ inline batch_bool<T, A> is_even(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch_bool<T, A> is_flint(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch_bool<T, A> is_odd(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline typename batch<T, A>::batch_bool_type isinf(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline typename batch<T, A>::batch_bool_type isfinite(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline typename batch<T, A>::batch_bool_type isnan(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> ldexp(const batch<T, A>& x, const batch<as_integer_t<T>, A>& e) noexcept;
+ template <class T, class A>
+ inline batch<T, A> log(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> nearbyint(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<as_integer_t<T>, A> nearbyint_as_int(const batch<T, A>& x) noexcept;
+ template <class T, class A>
+ inline T reduce_add(batch<T, A> const&) noexcept;
+ template <class T, class A>
+ inline batch<T, A> select(batch_bool<T, A> const&, batch<T, A> const&, batch<T, A> const&) noexcept;
+ template <class T, class A>
+ inline batch<std::complex<T>, A> select(batch_bool<T, A> const&, batch<std::complex<T>, A> const&, batch<std::complex<T>, A> const&) noexcept;
+ template <class T, class A>
+ inline batch<T, A> sign(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> signnz(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> sin(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> sinh(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> sqrt(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> tan(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<as_float_t<T>, A> to_float(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<as_integer_t<T>, A> to_int(batch<T, A> const& self) noexcept;
+ template <class T, class A>
+ inline batch<T, A> trunc(batch<T, A> const& self) noexcept;
+
+ namespace kernel
+ {
+
+ namespace detail
+ {
+ // Applies the binary scalar callable `func` lane by lane.
+ //
+ // Both batches are stored to aligned scratch arrays, `func(a, b)` is
+ // evaluated for each pair of corresponding elements, and the results are
+ // loaded back into a batch<T, A>.
+ //
+ // func:  callable invoked as func(T, T); its result is assigned to a T
+ //        element, so it must be convertible to T.
+ // self:  first operand; its scratch copy also receives the results.
+ // other: second operand.
+ template <class F, class A, class T>
+ inline batch<T, A> apply(F&& func, batch<T, A> const& self, batch<T, A> const& other) noexcept
+ {
+     constexpr std::size_t size = batch<T, A>::size;
+     alignas(A::alignment()) T self_buffer[size];
+     alignas(A::alignment()) T other_buffer[size];
+     self.store_aligned(&self_buffer[0]);
+     other.store_aligned(&other_buffer[0]);
+     for (std::size_t i = 0; i < size; ++i)
+     {
+         // The result overwrites the first operand's element in place.
+         self_buffer[i] = func(self_buffer[i], other_buffer[i]);
+     }
+     return batch<T, A>::load_aligned(self_buffer);
+ }
+
+ // Element-wise conversion helper: stores `self` to an aligned scratch
+ // array, calls the unary callable `func` on every element, writes each
+ // result into a second aligned array of U, and loads that array as a
+ // batch<U, A>. Source and destination batches must have the same size,
+ // which is checked at compile time.
+ template <class U, class F, class A, class T>
+ inline batch<U, A> apply_transform(F&& func, batch<T, A> const& self) noexcept
+ {
+     using source_batch = batch<T, A>;
+     using result_batch = batch<U, A>;
+     static_assert(source_batch::size == result_batch::size,
+                   "Source and destination sizes must match");
+     constexpr std::size_t lanes_in = source_batch::size;
+     constexpr std::size_t lanes_out = result_batch::size;
+     alignas(A::alignment()) T input[lanes_in];
+     alignas(A::alignment()) U output[lanes_out];
+     self.store_aligned(&input[0]);
+     for (std::size_t lane = 0; lane < lanes_in; ++lane)
+     {
+         output[lane] = func(input[lane]);
+     }
+     return result_batch::load_aligned(output);
+ }
+ }
+
+ // some generic fast_cast conversion
+ namespace detail
+ {
+ // Signed -> unsigned integer fast_cast overloads (8, 16, 32 and 64 bits).
+ // Each one returns bitwise_cast<unsigned type>(self), i.e. the conversion is
+ // expressed as a reinterpretation of the source batch rather than as an
+ // element-wise computation. The second (unnamed) parameter is not read: only
+ // its type is used, to select the destination type of the overload. The
+ // trailing requires_arch<generic> parameter is likewise unnamed and unused
+ // in the body.
+ template <class A>
+ inline batch<uint8_t, A> fast_cast(batch<int8_t, A> const& self, batch<uint8_t, A> const&, requires_arch<generic>) noexcept
+ {
+ return bitwise_cast<uint8_t>(self);
+ }
+ template <class A>
+ inline batch<uint16_t, A> fast_cast(batch<int16_t, A> const& self, batch<uint16_t, A> const&, requires_arch<generic>) noexcept
+ {
+ return bitwise_cast<uint16_t>(self);
+ }
+ template <class A>
+ inline batch<uint32_t, A> fast_cast(batch<int32_t, A> const& self, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
+ {
+ return bitwise_cast<uint32_t>(self);
+ }
+ template <class A>
+ inline batch<uint64_t, A> fast_cast(batch<int64_t, A> const& self, batch<uint64_t, A> const&, requires_arch<generic>) noexcept
+ {
+ return bitwise_cast<uint64_t>(self);
+ }
+ // Unsigned -> signed integer fast_cast overloads (8, 16, 32 and 64 bits).
+ // Each one returns bitwise_cast<signed type>(self). The second (unnamed)
+ // parameter is not read: only its type is used, to select the destination
+ // type of the overload. The trailing requires_arch<generic> parameter is
+ // likewise unnamed and unused in the body.
+ template <class A>
+ inline batch<int8_t, A> fast_cast(batch<uint8_t, A> const& self, batch<int8_t, A> const&, requires_arch<generic>) noexcept
+ {
+ return bitwise_cast<int8_t>(self);
+ }
+ template <class A>
+ inline batch<int16_t, A> fast_cast(batch<uint16_t, A> const& self, batch<int16_t, A> const&, requires_arch<generic>) noexcept
+ {
+ return bitwise_cast<int16_t>(self);
+ }
+ template <class A>
+ inline batch<int32_t, A> fast_cast(batch<uint32_t, A> const& self, batch<int32_t, A> const&, requires_arch<generic>) noexcept
+ {
+ return bitwise_cast<int32_t>(self);
+ }
+ template <class A>
+ inline batch<int64_t, A> fast_cast(batch<uint64_t, A> const& self, batch<int64_t, A> const&, requires_arch<generic>) noexcept
+ {
+ return bitwise_cast<int64_t>(self);
+ }
+
+ // Generic uint32_t -> float conversion. The defaulted template argument
+ // makes this overload available only when the expression
+ // fast_cast(batch<int32_t, A>, batch<float, A>, A {}) is well-formed, i.e.
+ // when an int32_t -> float fast_cast can be found for architecture A.
+ //
+ // The input is split into two 16-bit halves. Each half fits in a
+ // non-negative int32_t and is exactly representable as a float, so the two
+ // per-half conversions and the scaling by 2^16 do not round; only the final
+ // addition may.
+ // see https://stackoverflow.com/questions/34066228/how-to-perform-uint32-float-conversion-with-sse
+ template <class A, class _ = decltype(fast_cast(std::declval<batch<int32_t, A> const&>(), std::declval<batch<float, A> const&>(), A {}))>
+ inline batch<float, A> fast_cast(batch<uint32_t, A> const& v, batch<float, A> const&, requires_arch<generic>) noexcept
+ {
+     const batch<uint32_t, A> low_half_mask(0xFFFF);
+     const batch<float, A> two_pow_16(65536.0f);
+
+     // 16 least significant bits of v, and 16 most significant bits of v.
+     const auto low_bits = batch_cast<int32_t>(v & low_half_mask);
+     const auto high_bits = batch_cast<int32_t>(v >> 16);
+
+     // Exact conversions; the high half is then scaled back to its weight.
+     const auto low_as_float = batch_cast<float>(low_bits);
+     const auto high_as_float = two_pow_16 * batch_cast<float>(high_bits);
+
+     // Rounding may occur here; mul and add may fuse to fma for haswell and newer.
+     return high_as_float + low_as_float;
+ }
+
+ // Generic float -> uint32_t conversion. The defaulted template argument
+ // makes this overload available only when the expression
+ // fast_cast(batch<float, A>, batch<int32_t, A>, A {}) is well-formed, i.e.
+ // when a float -> int32_t fast_cast can be found for architecture A.
+ //
+ // Elements below 2^31 go through the signed conversion directly. Elements
+ // at or above 2^31 are shifted down by 2^31 before the signed conversion,
+ // and the top bit is then restored with an xor.
+ template <class A, class _ = decltype(fast_cast(std::declval<batch<float, A> const&>(), std::declval<batch<int32_t, A> const&>(), A {}))>
+ inline batch<uint32_t, A> fast_cast(batch<float, A> const& v, batch<uint32_t, A> const&, requires_arch<generic>) noexcept
+ {
+     const batch<float, A> two_pow_31(1u << 31);
+     const batch<int32_t, A> top_bit(1u << 31);
+
+     const auto needs_offset = v >= two_pow_31;
+     // Both candidates are carried as float bit patterns so that a single
+     // float select can choose between them.
+     const auto in_signed_range = bitwise_cast<float>(batch_cast<int32_t>(v));
+     const auto above_signed_range = bitwise_cast<float>(
+         batch_cast<int32_t>(v - two_pow_31) ^ top_bit);
+     return bitwise_cast<uint32_t>(select(needs_offset, above_signed_range, in_signed_range));
+ }
+ }
+
+ namespace detail
+ {
+ // Generic conversion handling machinery. Each architecture must define
+ // a conversion function when such a conversion exists in the form of an
+ // intrinsic. Then we use that information to automatically decide whether
+ // to use scalar or vector conversion when doing load / store / batch_cast.
+ //
+ // The two empty structs below are the tag types produced by that decision.
+ struct with_fast_conversion
+ {
+ };
+ struct with_slow_conversion
+ {
+ };
+
+ // Primary template: when nothing more specific matches, a From -> To
+ // conversion on architecture A is classified as slow.
+ template <class A, class From, class To, class = void>
+ struct conversion_type_impl
+ {
+ using type = with_slow_conversion;
+ };
+
+ using xsimd::detail::void_t;
+
+ // Partial specialization selected only when the expression
+ // fast_cast(batch<From, A> const&, batch<To, A> const&, A const&) is
+ // well-formed; otherwise substitution fails inside void_t and the primary
+ // template above is used instead.
+ template <class A, class From, class To>
+ struct conversion_type_impl<A, From, To,
+ void_t<decltype(fast_cast(std::declval<const batch<From, A>&>(),
+ std::declval<const batch<To, A>&>(),
+ std::declval<const A&>()))>>
+ {
+ using type = with_fast_conversion;
+ };
+
+ // Convenience alias: with_fast_conversion or with_slow_conversion for the
+ // given architecture and source / destination element types.
+ template <class A, class From, class To>
+ using conversion_type = typename conversion_type_impl<A, From, To>::type;
+ }
+
+ namespace detail
+ {
+ /* origin: boost/simdfunction/horn.hpp*/
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ // Turns the integer template argument `c` into a polynomial coefficient:
+ // `c` is first converted to the unsigned integer type associated with
+ // B::value_type, that integer is bit_cast to B::value_type, and a B is
+ // constructed from the resulting scalar (presumably broadcasting it to
+ // every element -- see the batch constructors).
+ template <class B, uint64_t c>
+ inline B coef() noexcept
+ {
+     using scalar_type = typename B::value_type;
+     using bits_type = as_unsigned_integer_t<scalar_type>;
+     const bits_type raw_bits = bits_type(c);
+     return B(bit_cast<scalar_type>(raw_bits));
+ }
+ // Polynomial evaluation by Horner's scheme. The coefficients are passed as
+ // integer template arguments, lowest degree first, and decoded with
+ // coef<B, c>(). Assuming fma(x, y, z) computes x * y + z, the result is
+ //   c0 + self * (c1 + self * (c2 + ...)).
+
+ // No coefficient at all: the result is a B built from value_type(0.).
+ template <class B>
+ inline B horner(const B&) noexcept
+ {
+     using scalar_type = typename B::value_type;
+     return B(scalar_type(0.));
+ }
+
+ // A single coefficient: the polynomial is the constant c0.
+ template <class B, uint64_t c0>
+ inline B horner(const B&) noexcept
+ {
+     return coef<B, c0>();
+ }
+
+ // Two or more coefficients: peel off c0 and recurse on the remaining ones.
+ template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
+ inline B horner(const B& self) noexcept
+ {
+     const B higher_terms = horner<B, c1, args...>(self);
+     const B constant_term = coef<B, c0>();
+     return fma(self, higher_terms, constant_term);
+ }
+
+ /* origin: boost/simdfunction/horn1.hpp*/
+ /*
+ * ====================================================
+ * copyright 2016 NumScale SAS
+ *
+ * Distributed under the Boost Software License, Version 1.0.
+ * (See copy at http://boost.org/LICENSE_1_0.txt)
+ * ====================================================
+ */
+ // Horner evaluation of a polynomial whose leading coefficient is implicitly
+ // one. The coefficients are passed as integer template arguments, lowest
+ // degree first, and decoded with detail::coef<B, c>(). Assuming
+ // fma(x, y, z) computes x * y + z, the result is
+ //   c0 + x * (c1 + x * (... + x * (c_last + x))).
+
+ // No coefficient at all: only the implicit leading one remains.
+ template <class B>
+ inline B horner1(const B&) noexcept
+ {
+     return B(1.);
+ }
+
+ // A single coefficient: x + c0.
+ template <class B, uint64_t c0>
+ inline B horner1(const B& x) noexcept
+ {
+     const B constant_term = detail::coef<B, c0>();
+     return x + constant_term;
+ }
+
+ // Two or more coefficients: peel off c0 and recurse on the remaining ones.
+ template <class B, uint64_t c0, uint64_t c1, uint64_t... args>
+ inline B horner1(const B& x) noexcept
+ {
+     const B higher_terms = horner1<B, c1, args...>(x);
+     const B constant_term = detail::coef<B, c0>();
+     return fma(x, higher_terms, constant_term);
+ }
+ }
+
+ }
+
+}
+
+#endif