Diffstat (limited to 'third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp')
-rw-r--r--  third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp  64
1 file changed, 64 insertions(+), 0 deletions(-)
diff --git a/third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp b/third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp
new file mode 100644
index 0000000000..ccc049795c
--- /dev/null
+++ b/third_party/xsimd/include/xsimd/arch/xsimd_sse3.hpp
@@ -0,0 +1,64 @@
+/***************************************************************************
+ * Copyright (c) Johan Mabille, Sylvain Corlay, Wolf Vollprecht and        *
+ * Martin Renou                                                            *
+ * Copyright (c) QuantStack                                                *
+ * Copyright (c) Serge Guelton                                             *
+ *                                                                        *
+ * Distributed under the terms of the BSD 3-Clause License.               *
+ *                                                                        *
+ * The full license is in the file LICENSE, distributed with this software.*
+ ***************************************************************************/
+
+#ifndef XSIMD_SSE3_HPP
+#define XSIMD_SSE3_HPP
+
+#include "../types/xsimd_sse3_register.hpp"
+#include <type_traits>
+
+namespace xsimd
+{
+
+    namespace kernel
+    {
+        using namespace types;
+
+        // haddp
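+        // lane i of the result holds the horizontal sum of row[i];
+        // callers pass four rows for float, two for double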
+        template <class A>
+        inline batch<float, A> haddp(batch<float, A> const* row, requires_arch<sse3>) noexcept
+        {
+            return _mm_hadd_ps(_mm_hadd_ps(row[0], row[1]),
+                               _mm_hadd_ps(row[2], row[3]));
+        }
+        template <class A>
+        inline batch<double, A> haddp(batch<double, A> const* row, requires_arch<sse3>) noexcept
+        {
+            return _mm_hadd_pd(row[0], row[1]);
+        }
+
+        // load_unaligned
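+        // _mm_lddqu_si128 is the SSE3 unaligned load; on some older
+        // microarchitectures it outperforms _mm_loadu_si128 when the
+        // load straddles a cache-line boundary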
+        template <class A, class T, class = typename std::enable_if<std::is_integral<T>::value, void>::type>
+        inline batch<T, A> load_unaligned(T const* mem, convert<T>, requires_arch<sse3>) noexcept
+        {
+            return _mm_lddqu_si128((__m128i const*)mem);
+        }
+
+        // reduce_add
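+        // successive horizontal adds fold all lanes into one scalar:
+        // two _mm_hadd_ps passes for the four float lanes, a single
+        // _mm_hadd_pd pass for the two double lanes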
+        template <class A>
+        inline float reduce_add(batch<float, A> const& self, requires_arch<sse3>) noexcept
+        {
+            __m128 tmp0 = _mm_hadd_ps(self, self);
+            __m128 tmp1 = _mm_hadd_ps(tmp0, tmp0);
+            return _mm_cvtss_f32(tmp1);
+        }
+        template <class A>
+        inline double reduce_add(batch<double, A> const& self, requires_arch<sse3>) noexcept
+        {
+            __m128d tmp0 = _mm_hadd_pd(self, self);
+            return _mm_cvtsd_f64(tmp0);
+        }
+
+    }
+
+}
+
+#endif
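
For reference, a minimal usage sketch (not part of the patch): these kernels are not called directly but are reached through xsimd's public batch API. The sketch assumes xsimd is on the include path and the translation unit targets SSE3 (e.g. compiled with -msse3); values and variable names are illustrative only.

#include <xsimd/xsimd.hpp>

#include <cstdint>
#include <iostream>

int main()
{
    // for integral element types, this unaligned load dispatches to the
    // SSE3 kernel above and lowers to _mm_lddqu_si128
    int32_t idata[4] = { 1, 2, 3, 4 };
    auto vi = xsimd::batch<int32_t, xsimd::sse3>::load_unaligned(idata);

    // reduce_add on a float batch dispatches to the SSE3 kernel above:
    // two _mm_hadd_ps passes collapse the four lanes into one scalar
    float fdata[4] = { 1.0f, 2.0f, 3.0f, 4.0f };
    auto vf = xsimd::batch<float, xsimd::sse3>::load_unaligned(fdata);

    std::cout << xsimd::reduce_add(vf) << '\n'; // 10
    std::cout << xsimd::reduce_add(vi) << '\n'; // 10
}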