diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:06:44 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-07 09:06:44 +0000 |
commit | ed5640d8b587fbcfed7dd7967f3de04b37a76f26 (patch) | |
tree | 7a5f7c6c9d02226d7471cb3cc8fbbf631b415303 /sc/source/core/tool/arraysumSSE2.cxx | |
parent | Initial commit. (diff) | |
download | libreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.tar.xz libreoffice-ed5640d8b587fbcfed7dd7967f3de04b37a76f26.zip |
Adding upstream version 4:7.4.7.upstream/4%7.4.7upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'sc/source/core/tool/arraysumSSE2.cxx')
-rw-r--r-- | sc/source/core/tool/arraysumSSE2.cxx | 122 |
1 files changed, 122 insertions, 0 deletions
diff --git a/sc/source/core/tool/arraysumSSE2.cxx b/sc/source/core/tool/arraysumSSE2.cxx new file mode 100644 index 000000000..5d962baf9 --- /dev/null +++ b/sc/source/core/tool/arraysumSSE2.cxx @@ -0,0 +1,122 @@ +/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */ +/* + * This file is part of the LibreOffice project. + * + * This Source Code Form is subject to the terms of the Mozilla Public + * License, v. 2.0. If a copy of the MPL was not distributed with this + * file, You can obtain one at http://mozilla.org/MPL/2.0/. + * + */ + +#include "arraysum.hxx" + +#include <arraysumfunctor.hxx> + +#include <tools/simdsupport.hxx> + +#include <stdlib.h> + +#if SC_USE_SSE2 + +namespace sc::op +{ +/** Kahan sum with SSE2. + */ +static inline void sumSSE2(__m128d& sum, __m128d& err, const __m128d& value) +{ + const __m128d ANNULATE_SIGN_BIT = _mm_castsi128_pd(_mm_set1_epi64x(0x7FFF'FFFF'FFFF'FFFF)); + // Temporal parameter + __m128d t = _mm_add_pd(sum, value); + // Absolute value of the total sum + __m128d asum = _mm_and_pd(sum, ANNULATE_SIGN_BIT); + // Absolute value of the value to add + __m128d avalue = _mm_and_pd(value, ANNULATE_SIGN_BIT); + // Compare the absolute values sum >= value + __m128d mask = _mm_cmpge_pd(asum, avalue); + // The following code has this form ( a - t + b) + // Case 1: a = sum b = value + // Case 2: a = value b = sum + __m128d a = _mm_add_pd(_mm_and_pd(mask, sum), _mm_andnot_pd(mask, value)); + __m128d b = _mm_add_pd(_mm_and_pd(mask, value), _mm_andnot_pd(mask, sum)); + err = _mm_add_pd(err, _mm_add_pd(_mm_sub_pd(a, t), b)); + // Store result + sum = t; +} + +/** Execute Kahan sum with SSE2. + */ +KahanSum executeSSE2(size_t& i, size_t nSize, const double* pCurrent) +{ + // Make sure we don't fall out of bounds. + // This works by sums of 8 terms. + // So the 8'th term is i+7 + // If we iterate until nSize won't fall out of bounds + if (nSize > i + 7) + { + // Setup sums and errors as 0 + __m128d sum1 = _mm_setzero_pd(); + __m128d err1 = _mm_setzero_pd(); + __m128d sum2 = _mm_setzero_pd(); + __m128d err2 = _mm_setzero_pd(); + __m128d sum3 = _mm_setzero_pd(); + __m128d err3 = _mm_setzero_pd(); + __m128d sum4 = _mm_setzero_pd(); + __m128d err4 = _mm_setzero_pd(); + + for (; i + 7 < nSize; i += 8) + { + // Kahan sum 1 + __m128d load1 = _mm_loadu_pd(pCurrent); + sumSSE2(sum1, err1, load1); + pCurrent += 2; + + // Kahan sum 2 + __m128d load2 = _mm_loadu_pd(pCurrent); + sumSSE2(sum2, err2, load2); + pCurrent += 2; + + // Kahan sum 3 + __m128d load3 = _mm_loadu_pd(pCurrent); + sumSSE2(sum3, err3, load3); + pCurrent += 2; + + // Kahan sum 4 + __m128d load4 = _mm_loadu_pd(pCurrent); + sumSSE2(sum4, err4, load4); + pCurrent += 2; + } + + // Now we combine pairwise summation with Kahan summation + + // 1+2 3+4 -> 1, 3 + sumSSE2(sum1, err1, sum2); + sumSSE2(sum1, err1, err2); + sumSSE2(sum3, err3, sum4); + sumSSE2(sum3, err3, err4); + + // 1+3 -> 1 + sumSSE2(sum1, err1, sum3); + sumSSE2(sum1, err1, err3); + + // Store results + double sums[2]; + double errs[2]; + _mm_storeu_pd(&sums[0], sum1); + _mm_storeu_pd(&errs[0], err1); + + // First Kahan & pairwise summation + // 0+1 -> 0 + sumNeumanierNormal(sums[0], errs[0], sums[1]); + sumNeumanierNormal(sums[0], errs[0], errs[1]); + + // Store result + return { sums[0], errs[0] }; + } + return { 0.0, 0.0 }; +} + +} // namespace + +#endif + +/* vim:set shiftwidth=4 softtabstop=4 expandtab: */ |