1 files changed, 969 insertions, 0 deletions
diff --git a/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp
new file mode 100644
index 0000000000..2568a7253f
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/generic/xsimd_generic_trigo.hpp
@@ -0,0 +1,969 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and         *
+ * Martin Renou                                                             *
+ * Copyright (c) QuantStack                                                 *
+ * Copyright (c) Serge Guelton                                              *
+ *                                                                          *
+ * Distributed under the terms of the BSD 3-Clause License.                 *
+ *                                                                          *
+ * The full license is in the file LICENSE, distributed with this software. *
+ ****************************************************************************/
+
+#ifndef XSIMD_GENERIC_TRIGO_HPP
+#define XSIMD_GENERIC_TRIGO_HPP
+
+#include "./xsimd_generic_details.hpp"
+
+#include <array>
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        /* origin: boost/simd/arch/common/detail/simd/trig_base.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+
+        using namespace types;
+
+        // acos
+        template <class A, class T>
+        inline batch<T, A> acos(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            batch_type x = abs(self);
+            auto x_larger_05 = x > batch_type(0.5);
+            x = select(x_larger_05, sqrt(fma(batch_type(-0.5), x, batch_type(0.5))), self);
+            x = asin(x);
+            x = select(x_larger_05, x + x, x);
+            x = select(self < batch_type(-0.5), constants::pi<batch_type>() - x, x);
+            return select(x_larger_05, x, constants::pio2<batch_type>() - x);
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> acos(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            batch_type tmp = asin(z);
+            return { constants::pio2<real_batch>() - tmp.real(), -tmp.imag() };
+        }
+
+        // acosh
+        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> acosh(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            batch_type x = self - batch_type(1.);
+            auto test = x > constants::oneotwoeps<batch_type>();
+            batch_type z = select(test, self, x + sqrt(x + x + x * x));
+            batch_type l1pz = log1p(z);
+            return select(test, l1pz + constants::log_2<batch_type>(), l1pz);
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> acosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            batch_type w = acos(z);
+            w = batch_type(-w.imag(), w.real());
+            return w;
+        }
+
+        // asin
+        template <class A>
+        inline batch<float, A> asin(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            batch_type x = abs(self);
+            batch_type sign = bitofsign(self);
+            auto x_larger_05 = x > batch_type(0.5);
+            batch_type z = select(x_larger_05, batch_type(0.5) * (batch_type(1.) - x), x * x);
+            x = select(x_larger_05, sqrt(z), x);
+            batch_type z1 = detail::horner<batch_type,
+                                           0x3e2aaae4,
+                                           0x3d9980f6,
+                                           0x3d3a3ec7,
+                                           0x3cc617e3,
+                                           0x3d2cb352>(z);
+            z1 = fma(z1, z * x, x);
+            z = select(x_larger_05, constants::pio2<batch_type>() - (z1 + z1), z1);
+            return z ^ sign;
+        }
+        template <class A>
+        inline batch<double, A> asin(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            batch_type x = abs(self);
+            auto small_cond = x < constants::sqrteps<batch_type>();
+            batch_type ct1 = batch_type(bit_cast<double>(int64_t(0x3fe4000000000000)));
+            batch_type zz1 = batch_type(1.) - x;
+            batch_type vp = zz1 * detail::horner<batch_type, 0x403c896240f3081dull, 0xc03991aaac01ab68ull, 0x401bdff5baf33e6aull, 0xbfe2079259f9290full, 0x3f684fc3988e9f08ull>(zz1) / detail::horner1<batch_type, 0x40756709b0b644beull, 0xc077fe08959063eeull, 0x40626219af6a7f42ull, 0xc035f2a2b6bf5d8cull>(zz1);
+            zz1 = sqrt(zz1 + zz1);
+            batch_type z = constants::pio4<batch_type>() - zz1;
+            zz1 = fms(zz1, vp, constants::pio_2lo<batch_type>());
+            z = z - zz1;
+            zz1 = z + constants::pio4<batch_type>();
+            batch_type zz2 = self * self;
+            z = zz2 * detail::horner<batch_type, 0xc020656c06ceafd5ull, 0x40339007da779259ull, 0xc0304331de27907bull, 0x4015c74b178a2dd9ull, 0xbfe34341333e5c16ull, 0x3f716b9b0bd48ad3ull>(zz2) / detail::horner1<batch_type, 0xc04898220a3607acull, 0x4061705684ffbf9dull, 0xc06265bb6d3576d7ull, 0x40519fc025fe9054ull, 0xc02d7b590b5e0eabull>(zz2);
+            zz2 = fma(x, z, x);
+            return select(x > batch_type(1.), constants::nan<batch_type>(),
+                          select(small_cond, x,
+                                 select(x > ct1, zz1, zz2))
+                              ^ bitofsign(self));
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> asin(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            real_batch x = z.real();
+            real_batch y = z.imag();
+
+            batch_type ct(-y, x);
+            batch_type zz(real_batch(1.) - (x - y) * (x + y), -2 * x * y);
+            zz = log(ct + sqrt(zz));
+            batch_type resg(zz.imag(), -zz.real());
+
+            return select(y == real_batch(0.),
+                          select(fabs(x) > real_batch(1.),
+                                 batch_type(constants::pio2<real_batch>(), real_batch(0.)),
+                                 batch_type(asin(x), real_batch(0.))),
+                          resg);
+        }
+
+        // asinh
+        /* origin: boost/simd/arch/common/simd/function/asinh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        namespace detail
+        {
+            template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+            inline batch<T, A>
+            average(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
+            {
+                return (x1 & x2) + ((x1 ^ x2) >> 1);
+            }
+
+            template <class A, class T>
+            inline batch<T, A>
+            averagef(const batch<T, A>& x1, const batch<T, A>& x2) noexcept
+            {
+                using batch_type = batch<T, A>;
+                return fma(x1, batch_type(0.5), x2 * batch_type(0.5));
+            }
+            template <class A>
+            inline batch<float, A> average(batch<float, A> const& x1, batch<float, A> const& x2) noexcept
+            {
+                return averagef(x1, x2);
+            }
+            template <class A>
+            inline batch<double, A> average(batch<double, A> const& x1, batch<double, A> const& x2) noexcept
+            {
+                return averagef(x1, x2);
+            }
+        }
+        template <class A>
+        inline batch<float, A> asinh(batch<float, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<float, A>;
+            batch_type x = abs(self);
+            auto lthalf = x < batch_type(0.5);
+            batch_type x2 = x * x;
+            batch_type bts = bitofsign(self);
+            batch_type z(0.);
+            if (any(lthalf))
+            {
+                z = detail::horner<batch_type,
+                                   0x3f800000,
+                                   0xbe2aa9ad,
+                                   0x3d9949b1,
+                                   0xbd2ee581,
+                                   0x3ca4d6e6>(x2)
+                    * x;
+                if (all(lthalf))
+                    return z ^ bts;
+            }
+            batch_type tmp = select(x > constants::oneosqrteps<batch_type>(), x, detail::average(x, hypot(batch_type(1.), x)));
+#ifndef XSIMD_NO_NANS
+            return select(isnan(self), constants::nan<batch_type>(), select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts);
+#else
+            return select(lthalf, z, log(tmp) + constants::log_2<batch_type>()) ^ bts;
+#endif
+        }
+        template <class A>
+        inline batch<double, A> asinh(batch<double, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<double, A>;
+            batch_type x = abs(self);
+            auto test = x > constants::oneosqrteps<batch_type>();
+            batch_type z = select(test, x - batch_type(1.), x + x * x / (batch_type(1.) + hypot(batch_type(1.), x)));
+#ifndef XSIMD_NO_INFINITIES
+            z = select(x == constants::infinity<batch_type>(), x, z);
+#endif
+            batch_type l1pz = log1p(z);
+            z = select(test, l1pz + constants::log_2<batch_type>(), l1pz);
+            return bitofsign(self) ^ z;
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> asinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            batch_type w = asin(batch_type(-z.imag(), z.real()));
+            w = batch_type(w.imag(), -w.real());
+            return w;
+        }
+
+        // atan
+        namespace detail
+        {
+            template <class A>
+            static inline batch<float, A> kernel_atan(const batch<float, A>& x, const batch<float, A>& recx) noexcept
+            {
+                using batch_type = batch<float, A>;
+                const auto flag1 = x < constants::tan3pio8<batch_type>();
+                const auto flag2 = (x >= batch_type(bit_cast<float>((uint32_t)0x3ed413cd))) && flag1;
+                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
+                yy = select(flag2, constants::pio4<batch_type>(), yy);
+                batch_type xx = select(flag1, x, -recx);
+                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
+                const batch_type z = xx * xx;
+                batch_type z1 = detail::horner<batch_type,
+                                               0xbeaaaa2aul,
+                                               0x3e4c925ful,
+                                               0xbe0e1b85ul,
+                                               0x3da4f0d1ul>(z);
+                z1 = fma(xx, z1 * z, xx);
+                z1 = select(flag2, z1 + constants::pio_4lo<batch_type>(), z1);
+                z1 = select(!flag1, z1 + constants::pio_2lo<batch_type>(), z1);
+                return yy + z1;
+            }
+            template <class A>
+            static inline batch<double, A> kernel_atan(const batch<double, A>& x, const batch<double, A>& recx) noexcept
+            {
+                using batch_type = batch<double, A>;
+                const auto flag1 = x < constants::tan3pio8<batch_type>();
+                const auto flag2 = (x >= constants::tanpio8<batch_type>()) && flag1;
+                batch_type yy = select(flag1, batch_type(0.), constants::pio2<batch_type>());
+                yy = select(flag2, constants::pio4<batch_type>(), yy);
+                batch_type xx = select(flag1, x, -recx);
+                xx = select(flag2, (x - batch_type(1.)) / (x + batch_type(1.)), xx);
+                batch_type z = xx * xx;
+                z *= detail::horner<batch_type,
+                                    0xc0503669fd28ec8eull,
+                                    0xc05eb8bf2d05ba25ull,
+                                    0xc052c08c36880273ull,
+                                    0xc03028545b6b807aull,
+                                    0xbfec007fa1f72594ull>(z)
+                    / detail::horner1<batch_type,
+                                      0x4068519efbbd62ecull,
+                                      0x407e563f13b049eaull,
+                                      0x407b0e18d2e2be3bull,
+                                      0x4064a0dd43b8fa25ull,
+                                      0x4038dbc45b14603cull>(z);
+                z = fma(xx, z, xx);
+                z = select(flag2, z + constants::pio_4lo<batch_type>(), z);
+                z = z + select(flag1, batch_type(0.), constants::pio_2lo<batch_type>());
+                return yy + z;
+            }
+        }
+        template <class A, class T>
+        inline batch<T, A> atan(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type absa = abs(self);
+            const batch_type x = detail::kernel_atan(absa, batch_type(1.) / absa);
+            return x ^ bitofsign(self);
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> atan(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            real_batch x = z.real();
+            real_batch y = z.imag();
+            real_batch x2 = x * x;
+            real_batch one(1.);
+            real_batch a = one - x2 - (y * y);
+            real_batch w = 0.5 * atan2(2. * x, a);
+            real_batch num = y + one;
+            num = x2 + num * num;
+            real_batch den = y - one;
+            den = x2 + den * den;
+            batch_type res = select((x == real_batch(0.)) && (y == real_batch(1.)),
+                                    batch_type(real_batch(0.), constants::infinity<real_batch>()),
+                                    batch_type(w, 0.25 * log(num / den)));
+            return res;
+        }
+
+        // atanh
+        /* origin: boost/simd/arch/common/simd/function/acosh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> atanh(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            batch_type x = abs(self);
+            batch_type t = x + x;
+            batch_type z = batch_type(1.) - x;
+            auto test = x < batch_type(0.5);
+            batch_type tmp = select(test, x, t) / z;
+            return bitofsign(self) ^ (batch_type(0.5) * log1p(select(test, fma(t, tmp, t), tmp)));
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> atanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            batch_type w = atan(batch_type(-z.imag(), z.real()));
+            w = batch_type(w.imag(), -w.real());
+            return w;
+        }
+
+        // atan2
+        template <class A, class T>
+        inline batch<T, A> atan2(batch<T, A> const& self, batch<T, A> const& other, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type q = abs(self / other);
+            const batch_type z = detail::kernel_atan(q, batch_type(1.) / q);
+            return select(other > batch_type(0.), z, constants::pi<batch_type>() - z) * signnz(self);
+        }
+
+        // cos
+        namespace detail
+        {
+            template <class T, class A>
+            inline batch<T, A> quadrant(const batch<T, A>& x) noexcept
+            {
+                return x & batch<T, A>(3);
+            }
+
+            template <class A>
+            inline batch<float, A> quadrant(const batch<float, A>& x) noexcept
+            {
+                return to_float(quadrant(to_int(x)));
+            }
+
+            template <class A>
+            inline batch<double, A> quadrant(const batch<double, A>& x) noexcept
+            {
+                using batch_type = batch<double, A>;
+                batch_type a = x * batch_type(0.25);
+                return (a - floor(a)) * batch_type(4.);
+            }
+            /* origin: boost/simd/arch/common/detail/simd/f_trig_evaluation.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+
+            template <class A>
+            inline batch<float, A> cos_eval(const batch<float, A>& z) noexcept
+            {
+                using batch_type = batch<float, A>;
+                batch_type y = detail::horner<batch_type,
+                                              0x3d2aaaa5,
+                                              0xbab60619,
+                                              0x37ccf5ce>(z);
+                return batch_type(1.) + fma(z, batch_type(-0.5), y * z * z);
+            }
+
+            template <class A>
+            inline batch<float, A> sin_eval(const batch<float, A>& z, const batch<float, A>& x) noexcept
+            {
+                using batch_type = batch<float, A>;
+                batch_type y = detail::horner<batch_type,
+                                              0xbe2aaaa2,
+                                              0x3c08839d,
+                                              0xb94ca1f9>(z);
+                return fma(y * z, x, x);
+            }
+
+            template <class A>
+            static inline batch<float, A> base_tancot_eval(const batch<float, A>& z) noexcept
+            {
+                using batch_type = batch<float, A>;
+                batch_type zz = z * z;
+                batch_type y = detail::horner<batch_type,
+                                              0x3eaaaa6f,
+                                              0x3e0896dd,
+                                              0x3d5ac5c9,
+                                              0x3cc821b5,
+                                              0x3b4c779c,
+                                              0x3c19c53b>(zz);
+                return fma(y, zz * z, z);
+            }
+
+            template <class A, class BB>
+            static inline batch<float, A> tan_eval(const batch<float, A>& z, const BB& test) noexcept
+            {
+                using batch_type = batch<float, A>;
+                batch_type y = base_tancot_eval(z);
+                return select(test, y, -batch_type(1.) / y);
+            }
+
+            template <class A, class BB>
+            static inline batch<float, A> cot_eval(const batch<float, A>& z, const BB& test) noexcept
+            {
+                using batch_type = batch<float, A>;
+                batch_type y = base_tancot_eval(z);
+                return select(test, batch_type(1.) / y, -y);
+            }
+
+            /* origin: boost/simd/arch/common/detail/simd/d_trig_evaluation.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class A>
+            static inline batch<double, A> cos_eval(const batch<double, A>& z) noexcept
+            {
+                using batch_type = batch<double, A>;
+                batch_type y = detail::horner<batch_type,
+                                              0x3fe0000000000000ull,
+                                              0xbfa5555555555551ull,
+                                              0x3f56c16c16c15d47ull,
+                                              0xbefa01a019ddbcd9ull,
+                                              0x3e927e4f8e06d9a5ull,
+                                              0xbe21eea7c1e514d4ull,
+                                              0x3da8ff831ad9b219ull>(z);
+                return batch_type(1.) - y * z;
+            }
+
+            template <class A>
+            static inline batch<double, A> sin_eval(const batch<double, A>& z, const batch<double, A>& x) noexcept
+            {
+                using batch_type = batch<double, A>;
+                batch_type y = detail::horner<batch_type,
+                                              0xbfc5555555555548ull,
+                                              0x3f8111111110f7d0ull,
+                                              0xbf2a01a019bfdf03ull,
+                                              0x3ec71de3567d4896ull,
+                                              0xbe5ae5e5a9291691ull,
+                                              0x3de5d8fd1fcf0ec1ull>(z);
+                return fma(y * z, x, x);
+            }
+
+            template <class A>
+            static inline batch<double, A> base_tancot_eval(const batch<double, A>& z) noexcept
+            {
+                using batch_type = batch<double, A>;
+                batch_type zz = z * z;
+                batch_type num = detail::horner<batch_type,
+                                                0xc1711fead3299176ull,
+                                                0x413199eca5fc9dddull,
+                                                0xc0c992d8d24f3f38ull>(zz);
+                batch_type den = detail::horner1<batch_type,
+                                                 0xc189afe03cbe5a31ull,
+                                                 0x4177d98fc2ead8efull,
+                                                 0xc13427bc582abc96ull,
+                                                 0x40cab8a5eeb36572ull>(zz);
+                return fma(z, (zz * (num / den)), z);
+            }
+
+            template <class A, class BB>
+            static inline batch<double, A> tan_eval(const batch<double, A>& z, const BB& test) noexcept
+            {
+                using batch_type = batch<double, A>;
+                batch_type y = base_tancot_eval(z);
+                return select(test, y, -batch_type(1.) / y);
+            }
+
+            template <class A, class BB>
+            static inline batch<double, A> cot_eval(const batch<double, A>& z, const BB& test) noexcept
+            {
+                using batch_type = batch<double, A>;
+                batch_type y = base_tancot_eval(z);
+                return select(test, batch_type(1.) / y, -y);
+            }
+            /* origin: boost/simd/arch/common/detail/simd/trig_reduction.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+
+            struct trigo_radian_tag
+            {
+            };
+            struct trigo_pi_tag
+            {
+            };
+
+            template <class B, class Tag = trigo_radian_tag>
+            struct trigo_reducer
+            {
+                static inline B reduce(const B& x, B& xr) noexcept
+                {
+                    if (all(x <= constants::pio4<B>()))
+                    {
+                        xr = x;
+                        return B(0.);
+                    }
+                    else if (all(x <= constants::pio2<B>()))
+                    {
+                        auto test = x > constants::pio4<B>();
+                        xr = x - constants::pio2_1<B>();
+                        xr -= constants::pio2_2<B>();
+                        xr -= constants::pio2_3<B>();
+                        xr = select(test, xr, x);
+                        return select(test, B(1.), B(0.));
+                    }
+                    else if (all(x <= constants::twentypi<B>()))
+                    {
+                        B xi = nearbyint(x * constants::twoopi<B>());
+                        xr = fnma(xi, constants::pio2_1<B>(), x);
+                        xr -= xi * constants::pio2_2<B>();
+                        xr -= xi * constants::pio2_3<B>();
+                        return quadrant(xi);
+                    }
+                    else if (all(x <= constants::mediumpi<B>()))
+                    {
+                        B fn = nearbyint(x * constants::twoopi<B>());
+                        B r = x - fn * constants::pio2_1<B>();
+                        B w = fn * constants::pio2_1t<B>();
+                        B t = r;
+                        w = fn * constants::pio2_2<B>();
+                        r = t - w;
+                        w = fn * constants::pio2_2t<B>() - ((t - r) - w);
+                        t = r;
+                        w = fn * constants::pio2_3<B>();
+                        r = t - w;
+                        w = fn * constants::pio2_3t<B>() - ((t - r) - w);
+                        xr = r - w;
+                        return quadrant(fn);
+                    }
+                    else
+                    {
+                        static constexpr std::size_t size = B::size;
+                        using value_type = typename B::value_type;
+                        alignas(B) std::array<value_type, size> tmp;
+                        alignas(B) std::array<value_type, size> txr;
+                        alignas(B) std::array<value_type, size> args;
+                        x.store_aligned(args.data());
+
+                        for (std::size_t i = 0; i < size; ++i)
+                        {
+                            double arg = args[i];
+                            if (arg == std::numeric_limits<value_type>::infinity())
+                            {
+                                tmp[i] = 0.;
+                                txr[i] = std::numeric_limits<value_type>::quiet_NaN();
+                            }
+                            else
+                            {
+                                double y[2];
+                                std::int32_t n = ::xsimd::detail::__ieee754_rem_pio2(arg, y);
+                                tmp[i] = value_type(n & 3);
+                                txr[i] = value_type(y[0]);
+                            }
+                        }
+                        xr = B::load_aligned(&txr[0]);
+                        B res = B::load_aligned(&tmp[0]);
+                        return res;
+                    }
+                }
+            };
+
+            template <class B>
+            struct trigo_reducer<B, trigo_pi_tag>
+            {
+                static inline B reduce(const B& x, B& xr) noexcept
+                {
+                    B xi = nearbyint(x * B(2.));
+                    B x2 = x - xi * B(0.5);
+                    xr = x2 * constants::pi<B>();
+                    return quadrant(xi);
+                }
+            };
+
+        }
+        template <class A, class T>
+        inline batch<T, A> cos(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type x = abs(self);
+            batch_type xr = constants::nan<batch_type>();
+            const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
+            auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
+            auto swap_bit = fma(batch_type(-2.), tmp, n);
+            auto sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+            const batch_type z = xr * xr;
+            const batch_type se = detail::sin_eval(z, xr);
+            const batch_type ce = detail::cos_eval(z);
+            const batch_type z1 = select(swap_bit != batch_type(0.), se, ce);
+            return z1 ^ sign_bit;
+        }
+
+        template <class A, class T>
+        inline batch<std::complex<T>, A> cos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            return { cos(z.real()) * cosh(z.imag()), -sin(z.real()) * sinh(z.imag()) };
+        }
+
+        // cosh
+
+        /* origin: boost/simd/arch/common/simd/function/cosh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+
+        template <class A, class T>
+        inline batch<T, A> cosh(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            batch_type x = abs(self);
+            auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
+            batch_type fac = select(test1, batch_type(0.5), batch_type(1.));
+            batch_type tmp = exp(x * fac);
+            batch_type tmp1 = batch_type(0.5) * tmp;
+            return select(test1, tmp1 * tmp, detail::average(tmp, batch_type(1.) / tmp));
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> cosh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            auto x = z.real();
+            auto y = z.imag();
+            return { cosh(x) * cos(y), sinh(x) * sin(y) };
+        }
+
+        // sin
+        namespace detail
+        {
+            template <class A, class T, class Tag = trigo_radian_tag>
+            inline batch<T, A> sin(batch<T, A> const& self, Tag = Tag()) noexcept
+            {
+                using batch_type = batch<T, A>;
+                const batch_type x = abs(self);
+                batch_type xr = constants::nan<batch_type>();
+                const batch_type n = detail::trigo_reducer<batch_type, Tag>::reduce(x, xr);
+                auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
+                auto swap_bit = fma(batch_type(-2.), tmp, n);
+                auto sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+                const batch_type z = xr * xr;
+                const batch_type se = detail::sin_eval(z, xr);
+                const batch_type ce = detail::cos_eval(z);
+                const batch_type z1 = select(swap_bit == batch_type(0.), se, ce);
+                return z1 ^ sign_bit;
+            }
+        }
+
+        template <class A, class T>
+        inline batch<T, A> sin(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            return detail::sin(self);
+        }
+
+        template <class A, class T>
+        inline batch<std::complex<T>, A> sin(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            return { sin(z.real()) * cosh(z.imag()), cos(z.real()) * sinh(z.imag()) };
+        }
+
+        // sincos
+        template <class A, class T>
+        inline std::pair<batch<T, A>, batch<T, A>> sincos(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type x = abs(self);
+            batch_type xr = constants::nan<batch_type>();
+            const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
+            auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
+            auto swap_bit = fma(batch_type(-2.), tmp, n);
+            const batch_type z = xr * xr;
+            const batch_type se = detail::sin_eval(z, xr);
+            const batch_type ce = detail::cos_eval(z);
+            auto sin_sign_bit = bitofsign(self) ^ select(tmp != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+            const batch_type sin_z1 = select(swap_bit == batch_type(0.), se, ce);
+            auto cos_sign_bit = select((swap_bit ^ tmp) != batch_type(0.), constants::signmask<batch_type>(), batch_type(0.));
+            const batch_type cos_z1 = select(swap_bit != batch_type(0.), se, ce);
+            return std::make_pair(sin_z1 ^ sin_sign_bit, cos_z1 ^ cos_sign_bit);
+        }
+
+        template <class A, class T>
+        inline std::pair<batch<std::complex<T>, A>, batch<std::complex<T>, A>>
+        sincos(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            real_batch rcos = cos(z.real());
+            real_batch rsin = sin(z.real());
+            real_batch icosh = cosh(z.imag());
+            real_batch isinh = sinh(z.imag());
+            return std::make_pair(batch_type(rsin * icosh, rcos * isinh), batch_type(rcos * icosh, -rsin * isinh));
+        }
+
+        // sinh
+        namespace detail
+        {
+            /* origin: boost/simd/arch/common/detail/generic/sinh_kernel.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class A>
+            inline batch<float, A> sinh_kernel(batch<float, A> const& self) noexcept
+            {
+                using batch_type = batch<float, A>;
+                batch_type sqr_self = self * self;
+                return detail::horner<batch_type,
+                                      0x3f800000, // 1.0f
+                                      0x3e2aaacc, // 1.66667160211E-1f
+                                      0x3c087bbe, // 8.33028376239E-3f
+                                      0x39559e2f // 2.03721912945E-4f
+                                      >(sqr_self)
+                    * self;
+            }
+
+            template <class A>
+            inline batch<double, A> sinh_kernel(batch<double, A> const& self) noexcept
+            {
+                using batch_type = batch<double, A>;
+                batch_type sqrself = self * self;
+                return fma(self, (detail::horner<batch_type,
+                                                 0xc115782bdbf6ab05ull, //  -3.51754964808151394800E5
+                                                 0xc0c694b8c71d6182ull, //  -1.15614435765005216044E4,
+                                                 0xc064773a398ff4feull, //  -1.63725857525983828727E2,
+                                                 0xbfe9435fe8bb3cd6ull //  -7.89474443963537015605E-1
+                                                 >(sqrself)
+                                  / detail::horner1<batch_type,
+                                                    0xc1401a20e4f90044ull, //  -2.11052978884890840399E6
+                                                    0x40e1a7ba7ed72245ull, //   3.61578279834431989373E4,
+                                                    0xc0715b6096e96484ull //  -2.77711081420602794433E2,
+                                                    >(sqrself))
+                               * sqrself,
+                           self);
+            }
+        }
+        /* origin: boost/simd/arch/common/simd/function/sinh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> sinh(batch<T, A> const& a, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            batch_type half(0.5);
+            batch_type x = abs(a);
+            auto lt1 = x < batch_type(1.);
+            batch_type bts = bitofsign(a);
+            batch_type z(0.);
+            if (any(lt1))
+            {
+                z = detail::sinh_kernel(x);
+                if (all(lt1))
+                    return z ^ bts;
+            }
+            auto test1 = x > (constants::maxlog<batch_type>() - constants::log_2<batch_type>());
+            batch_type fac = select(test1, half, batch_type(1.));
+            batch_type tmp = exp(x * fac);
+            batch_type tmp1 = half * tmp;
+            batch_type r = select(test1, tmp1 * tmp, tmp1 - half / tmp);
+            return select(lt1, z, r) ^ bts;
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> sinh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            auto x = z.real();
+            auto y = z.imag();
+            return { sinh(x) * cos(y), cosh(x) * sin(y) };
+        }
+
+        // tan
+        template <class A, class T>
+        inline batch<T, A> tan(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            const batch_type x = abs(self);
+            batch_type xr = constants::nan<batch_type>();
+            const batch_type n = detail::trigo_reducer<batch_type>::reduce(x, xr);
+            auto tmp = select(n >= batch_type(2.), batch_type(1.), batch_type(0.));
+            auto swap_bit = fma(batch_type(-2.), tmp, n);
+            auto test = (swap_bit == batch_type(0.));
+            const batch_type y = detail::tan_eval(xr, test);
+            return y ^ bitofsign(self);
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> tan(batch<std::complex<T>, A> const& z, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<std::complex<T>, A>;
+            using real_batch = typename batch_type::real_batch;
+            real_batch d = cos(2 * z.real()) + cosh(2 * z.imag());
+            batch_type winf(constants::infinity<real_batch>(), constants::infinity<real_batch>());
+            real_batch wreal = sin(2 * z.real()) / d;
+            real_batch wimag = sinh(2 * z.imag());
+            batch_type wres = select(isinf(wimag), batch_type(wreal, real_batch(1.)), batch_type(wreal, wimag / d));
+            return select(d == real_batch(0.), winf, wres);
+        }
+
+        // tanh
+        namespace detail
+        {
+            /* origin: boost/simd/arch/common/detail/generic/tanh_kernel.hpp */
+            /*
+             * ====================================================
+             * copyright 2016 NumScale SAS
+             *
+             * Distributed under the Boost Software License, Version 1.0.
+             * (See copy at http://boost.org/LICENSE_1_0.txt)
+             * ====================================================
+             */
+            template <class B>
+            struct tanh_kernel;
+
+            template <class A>
+            struct tanh_kernel<batch<float, A>>
+            {
+                using batch_type = batch<float, A>;
+                static inline batch_type tanh(const batch_type& x) noexcept
+                {
+                    batch_type sqrx = x * x;
+                    return fma(detail::horner<batch_type,
+                                              0xbeaaaa99, //    -3.33332819422E-1F
+                                              0x3e088393, //    +1.33314422036E-1F
+                                              0xbd5c1e2d, //    -5.37397155531E-2F
+                                              0x3ca9134e, //    +2.06390887954E-2F
+                                              0xbbbaf0ea //    -5.70498872745E-3F
+                                              >(sqrx)
+                                   * sqrx,
+                               x, x);
+                }
+
+                static inline batch_type cotanh(const batch_type& x) noexcept
+                {
+                    return batch_type(1.) / tanh(x);
+                }
+            };
+
+            template <class A>
+            struct tanh_kernel<batch<double, A>>
+            {
+                using batch_type = batch<double, A>;
+                static inline batch_type tanh(const batch_type& x) noexcept
+                {
+                    batch_type sqrx = x * x;
+                    return fma(sqrx * p(sqrx) / q(sqrx), x, x);
+                }
+
+                static inline batch_type cotanh(const batch_type& x) noexcept
+                {
+                    batch_type sqrx = x * x;
+                    batch_type qval = q(sqrx);
+                    return qval / (x * fma(p(sqrx), sqrx, qval));
+                }
+
+                static inline batch_type p(const batch_type& x) noexcept
+                {
+                    return detail::horner<batch_type,
+                                          0xc0993ac030580563, // -1.61468768441708447952E3
+                                          0xc058d26a0e26682d, // -9.92877231001918586564E1,
+                                          0xbfeedc5baafd6f4b // -9.64399179425052238628E-1
+                                          >(x);
+                }
+
+                static inline batch_type q(const batch_type& x) noexcept
+                {
+                    return detail::horner1<batch_type,
+                                           0x40b2ec102442040c, //  4.84406305325125486048E3
+                                           0x40a176fa0e5535fa, //  2.23548839060100448583E3,
+                                           0x405c33f28a581B86 //  1.12811678491632931402E2,
+                                           >(x);
+                }
+            };
+
+        }
+        /* origin: boost/simd/arch/common/simd/function/tanh.hpp */
+        /*
+         * ====================================================
+         * copyright 2016 NumScale SAS
+         *
+         * Distributed under the Boost Software License, Version 1.0.
+         * (See copy at http://boost.org/LICENSE_1_0.txt)
+         * ====================================================
+         */
+        template <class A, class T>
+        inline batch<T, A> tanh(batch<T, A> const& self, requires_arch<generic>) noexcept
+        {
+            using batch_type = batch<T, A>;
+            batch_type one(1.);
+            batch_type x = abs(self);
+            auto test = x < (batch_type(5.) / batch_type(8.));
+            batch_type bts = bitofsign(self);
+            batch_type z = one;
+            if (any(test))
+            {
+                z = detail::tanh_kernel<batch_type>::tanh(x);
+                if (all(test))
+                    return z ^ bts;
+            }
+            batch_type r = fma(batch_type(-2.), one / (one + exp(x + x)), one);
+            return select(test, z, r) ^ bts;
+        }
+        template <class A, class T>
+        inline batch<std::complex<T>, A> tanh(const batch<std::complex<T>, A>& z, requires_arch<generic>) noexcept
+        {
+            using real_batch = typename batch<std::complex<T>, A>::real_batch;
+            auto x = z.real();
+            auto y = z.imag();
+            real_batch two(2);
+            auto d = cosh(two * x) + cos(two * y);
+            return { sinh(two * x) / d, sin(two * y) / d };
+        }
+
+    }
+
+}
+
+#endif