summaryrefslogtreecommitdiffstats
path: root/sc/source/core/tool/arraysumSSE2.cxx
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:06:44 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-07 09:06:44 +0000
commited5640d8b587fbcfed7dd7967f3de04b37a76f26 (patch)
tree7a5f7c6c9d02226d7471cb3cc8fbbf631b415303 /sc/source/core/tool/arraysumSSE2.cxx
parentInitial commit. (diff)
downloadlibreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.tar.xz
libreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.zip
Adding upstream version 4:7.4.7.upstream/4%7.4.7upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sc/source/core/tool/arraysumSSE2.cxx')
-rw-r--r--sc/source/core/tool/arraysumSSE2.cxx122
1 files changed, 122 insertions, 0 deletions
diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx
new file mode 100644
index 000000000..5d962baf9
--- /dev/null
+++ b/sc/source/core/tool/arraysumSSE2.cxx
@@ -0,0 +1,122 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ *
+ */
+
+#include "arraysum.hxx"
+
+#include <arraysumfunctor.hxx>
+
+#include <tools/simdsupport.hxx>
+
+#include <stdlib.h>
+
+#if SC_USE_SSE2
+
+namespace sc::op
+{
+/** Kahan sum with SSE2.
+ */
+static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value)
+{
+ const __m128d ANNULATE_SIGN_BIT = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF));
+ // Temporal parameter
+ __m128d t = _mm_add_pd(sum, value);
+ // Absolute value of the total sum
+ __m128d asum = _mm_and_pd(sum, ANNULATE_SIGN_BIT);
+ // Absolute value of the value to add
+ __m128d avalue = _mm_and_pd(value, ANNULATE_SIGN_BIT);
+ // Compare the absolute values sum >= value
+ __m128d mask = _mm_cmpge_pd(asum, avalue);
+ // The following code has this form ( a - t + b)
+ // Case 1: a = sum b = value
+ // Case 2: a = value b = sum
+ __m128d a = _mm_add_pd(_mm_and_pd(mask, sum), _mm_andnot_pd(mask, value));
+ __m128d b = _mm_add_pd(_mm_and_pd(mask, value), _mm_andnot_pd(mask, sum));
+ err = _mm_add_pd(err, _mm_add_pd(_mm_sub_pd(a, t), b));
+ // Store result
+ sum = t;
+}
+
+/** Execute Kahan sum with SSE2.
+ */
+KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent)
+{
+ // Make sure we don't fall out of bounds.
+ // This works by sums of 8 terms.
+ // So the 8'th term is i+7
+ // If we iterate until nSize won't fall out of bounds
+ if (nSize > i + 7)
+ {
+ // Setup sums and errors as 0
+ __m128d sum1 = _mm_setzero_pd();
+ __m128d err1 = _mm_setzero_pd();
+ __m128d sum2 = _mm_setzero_pd();
+ __m128d err2 = _mm_setzero_pd();
+ __m128d sum3 = _mm_setzero_pd();
+ __m128d err3 = _mm_setzero_pd();
+ __m128d sum4 = _mm_setzero_pd();
+ __m128d err4 = _mm_setzero_pd();
+
+ for (; i + 7 < nSize; i += 8)
+ {
+ // Kahan sum 1
+ __m128d load1 = _mm_loadu_pd(pCurrent);
+ sumSSE2(sum1, err1, load1);
+ pCurrent += 2;
+
+ // Kahan sum 2
+ __m128d load2 = _mm_loadu_pd(pCurrent);
+ sumSSE2(sum2, err2, load2);
+ pCurrent += 2;
+
+ // Kahan sum 3
+ __m128d load3 = _mm_loadu_pd(pCurrent);
+ sumSSE2(sum3, err3, load3);
+ pCurrent += 2;
+
+ // Kahan sum 4
+ __m128d load4 = _mm_loadu_pd(pCurrent);
+ sumSSE2(sum4, err4, load4);
+ pCurrent += 2;
+ }
+
+ // Now we combine pairwise summation with Kahan summation
+
+ // 1+2 3+4 -> 1, 3
+ sumSSE2(sum1, err1, sum2);
+ sumSSE2(sum1, err1, err2);
+ sumSSE2(sum3, err3, sum4);
+ sumSSE2(sum3, err3, err4);
+
+ // 1+3 -> 1
+ sumSSE2(sum1, err1, sum3);
+ sumSSE2(sum1, err1, err3);
+
+ // Store results
+ double sums[2];
+ double errs[2];
+ _mm_storeu_pd(&sums[0], sum1);
+ _mm_storeu_pd(&errs[0], err1);
+
+ // First Kahan & pairwise summation
+ // 0+1 -> 0
+ sumNeumanierNormal(sums[0], errs[0], sums[1]);
+ sumNeumanierNormal(sums[0], errs[0], errs[1]);
+
+ // Store result
+ return { sums[0], errs[0] };
+ }
+ return { 0.0, 0.0 };
+}
+
+} // namespace
+
+#endif
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */