summaryrefslogtreecommitdiffstats
path: root/include/comphelper/parallelsort.hxx
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--include/comphelper/parallelsort.hxx373
1 files changed, 373 insertions, 0 deletions
diff --git a/include/comphelper/parallelsort.hxx b/include/comphelper/parallelsort.hxx
new file mode 100644
index 000000000..94d86e0d1
--- /dev/null
+++ b/include/comphelper/parallelsort.hxx
@@ -0,0 +1,373 @@
+/* -*- Mode: C++; tab-width: 4; indent-tabs-mode: nil; c-basic-offset: 4 -*- */
+/*
+ * This file is part of the LibreOffice project.
+ *
+ * This Source Code Form is subject to the terms of the Mozilla Public
+ * License, v. 2.0. If a copy of the MPL was not distributed with this
+ * file, You can obtain one at http://mozilla.org/MPL/2.0/.
+ */
+
+#ifndef INCLUDED_COMPHELPER_PARALLELSORT_HXX
+#define INCLUDED_COMPHELPER_PARALLELSORT_HXX
+
+#include <comphelper/threadpool.hxx>
+#include <tools/cpuid.hxx>
+
+#include <memory>
+#include <iterator>
+#include <thread>
+#include <algorithm>
+#include <cmath>
+#include <random>
+#include <functional>
+#include <iostream>
+#include <chrono>
+
+namespace comphelper
+{
+const size_t nThreadCountGlobal = std::thread::hardware_concurrency();
+const bool bHyperThreadingActive = cpuid::hasHyperThreading();
+static comphelper::ThreadPool& rTPool(comphelper::ThreadPool::getSharedOptimalPool());
+
+static thread_local std::mt19937 aGenerator{ std::random_device{}() };
+
+#define PARALLELSORT_ENABLEPZ 0
+
+namespace
+{
+class ProfileZone
+{
+public:
+#if PARALLELSORT_ENABLEPZ
+ ProfileZone(const char* pTag)
+ : maTag(pTag)
+ , maStart(std::chrono::steady_clock::now())
+ , mbFinished(false)
+ {
+ }
+
+ ~ProfileZone()
+ {
+ if (!mbFinished)
+ showTimeElapsed();
+ }
+
+ void stop()
+ {
+ showTimeElapsed();
+ mbFinished = true;
+ }
+#else
+ ProfileZone(const char* /*pTag*/)
+ : mbDummy(true)
+ {
+ }
+
+ void stop()
+ {
+ // Avoid loplugin:staticmethods, loplugin:staticaccess errors
+ (void)mbDummy;
+ }
+#endif
+
+private:
+#if PARALLELSORT_ENABLEPZ
+
+ void showTimeElapsed()
+ {
+ auto end = std::chrono::steady_clock::now();
+ size_t elapsed
+ = std::chrono::duration_cast<std::chrono::milliseconds>(end - maStart).count();
+ std::cout << maTag << " : " << elapsed << " ms" << std::endl << std::flush;
+ }
+
+ std::string maTag;
+ std::chrono::steady_clock::time_point maStart;
+ bool mbFinished;
+#else
+ bool mbDummy;
+
+#endif
+};
+
+class ParallelRunner
+{
+ class Executor final : public comphelper::ThreadTask
+ {
+ public:
+ Executor(const std::shared_ptr<comphelper::ThreadTaskTag>& rTag,
+ std::function<void()> aFunc)
+ : comphelper::ThreadTask(rTag)
+ , maFunc(std::move(aFunc))
+ {
+ }
+
+ virtual void doWork() override { maFunc(); }
+
+ private:
+ const std::function<void()> maFunc;
+ };
+
+public:
+ ParallelRunner() { maTag = comphelper::ThreadPool::createThreadTaskTag(); }
+
+ void enqueue(std::function<void()> aFunc)
+ {
+ rTPool.pushTask(std::make_unique<Executor>(maTag, aFunc));
+ }
+
+ void wait() { rTPool.waitUntilDone(maTag, false); }
+
+private:
+ std::shared_ptr<comphelper::ThreadTaskTag> maTag;
+};
+
+constexpr size_t nMaxTreeArraySize = 64;
+
+size_t lcl_round_down_pow2(size_t nNum)
+{
+ size_t nPow2;
+ for (nPow2 = 1; nPow2 <= nNum; nPow2 <<= 1)
+ ;
+ return std::min((nPow2 >> 1), nMaxTreeArraySize);
+}
+
+template <class RandItr> struct Sampler
+{
+ using ValueType = typename std::iterator_traits<RandItr>::value_type;
+
+ static void sample(RandItr aBegin, RandItr aEnd, ValueType* pSamples, size_t nSamples,
+ size_t /*nParallelism*/)
+ {
+ ProfileZone aZone("\tsample()");
+ assert(aBegin <= aEnd);
+ size_t nLen = static_cast<std::size_t>(aEnd - aBegin);
+ assert(std::mt19937::max() >= nLen);
+
+ for (size_t nIdx = 0; nIdx < nSamples; ++nIdx)
+ {
+ size_t nSel = aGenerator() % nLen--;
+ using namespace std;
+ swap(*(aBegin + nSel), *(aBegin + nLen));
+ pSamples[nIdx] = *(aBegin + nLen);
+ }
+ }
+};
+
+template <class RandItr, class Compare> class Binner
+{
+ using ValueType = typename std::iterator_traits<RandItr>::value_type;
+
+ const size_t mnTreeArraySize;
+ const size_t mnDividers;
+ constexpr static size_t mnMaxStaticSize = 1024 * 50;
+ uint8_t maLabels[mnMaxStaticSize];
+ ValueType maDividers[nMaxTreeArraySize];
+ std::unique_ptr<uint8_t[]> pLabels;
+ size_t maSepBinEnds[nMaxTreeArraySize * nMaxTreeArraySize];
+ bool mbThreaded;
+
+public:
+ size_t maBinEnds[nMaxTreeArraySize];
+
+ Binner(const ValueType* pSamples, size_t nSamples, size_t nBins, bool bThreaded)
+ : mnTreeArraySize(lcl_round_down_pow2(nBins))
+ , mnDividers(mnTreeArraySize - 1)
+ , mbThreaded(bThreaded)
+ {
+ assert((nSamples % mnTreeArraySize) == 0);
+ assert(mnTreeArraySize <= nMaxTreeArraySize);
+ std::fill(maBinEnds, maBinEnds + mnTreeArraySize, 0);
+ std::fill(maSepBinEnds, maSepBinEnds + mnTreeArraySize * mnTreeArraySize, 0);
+ fillTreeArray(1, pSamples, pSamples + nSamples);
+ }
+
+ void fillTreeArray(size_t nPos, const ValueType* pLow, const ValueType* pHigh)
+ {
+ assert(pLow <= pHigh);
+ const ValueType* pMid = pLow + (pHigh - pLow) / 2;
+ maDividers[nPos] = *pMid;
+
+ if (2 * nPos < mnDividers) // So that 2*nPos < mnTreeArraySize
+ {
+ fillTreeArray(2 * nPos, pLow, pMid);
+ fillTreeArray(2 * nPos + 1, pMid + 1, pHigh);
+ }
+ }
+
+ constexpr inline size_t findBin(const ValueType& rVal, Compare& aComp)
+ {
+ size_t nIdx = 1;
+ while (nIdx <= mnDividers)
+ nIdx = ((nIdx << 1) + aComp(maDividers[nIdx], rVal));
+ return (nIdx - mnTreeArraySize);
+ }
+
+ void label(const RandItr aBegin, const RandItr aEnd, Compare& aComp)
+ {
+ ProfileZone aZoneSetup("\tlabel():setup");
+ size_t nLen = static_cast<std::size_t>(aEnd - aBegin);
+ if (nLen > mnMaxStaticSize)
+ pLabels = std::make_unique<uint8_t[]>(nLen);
+ uint8_t* pLabelsRaw = (nLen > mnMaxStaticSize) ? pLabels.get() : maLabels;
+ aZoneSetup.stop();
+ ProfileZone aZoneFindBins("\tFindBins()");
+ if (mbThreaded)
+ {
+ ParallelRunner aPRunner;
+ const size_t nBins = mnTreeArraySize;
+ for (size_t nTIdx = 0; nTIdx < nBins; ++nTIdx)
+ {
+ aPRunner.enqueue([this, nTIdx, nBins, nLen, aBegin, pLabelsRaw, &aComp] {
+ ProfileZone aZoneIn("\t\tFindBinsThreaded()");
+ size_t nBinEndsStartIdx = nTIdx * mnTreeArraySize;
+ size_t* pBinEnds = maSepBinEnds + nBinEndsStartIdx;
+ size_t aBinEndsF[nMaxTreeArraySize] = { 0 };
+ for (size_t nIdx = nTIdx; nIdx < nLen; nIdx += nBins)
+ {
+ size_t nBinIdx = findBin(*(aBegin + nIdx), aComp);
+ pLabelsRaw[nIdx] = static_cast<uint8_t>(nBinIdx);
+ ++aBinEndsF[nBinIdx];
+ }
+
+ for (size_t nIdx = 0; nIdx < mnTreeArraySize; ++nIdx)
+ pBinEnds[nIdx] = aBinEndsF[nIdx];
+ });
+ }
+
+ aPRunner.wait();
+
+ // Populate maBinEnds from maSepBinEnds
+ for (size_t nTIdx = 0; nTIdx < mnTreeArraySize; ++nTIdx)
+ {
+ for (size_t nSepIdx = 0; nSepIdx < mnTreeArraySize; ++nSepIdx)
+ maBinEnds[nTIdx] += maSepBinEnds[nSepIdx * mnTreeArraySize + nTIdx];
+ }
+ }
+ else
+ {
+ uint8_t* pLabel = pLabelsRaw;
+ for (RandItr aItr = aBegin; aItr != aEnd; ++aItr)
+ {
+ size_t nBinIdx = findBin(*aItr, aComp);
+ *pLabel++ = nBinIdx;
+ ++maBinEnds[nBinIdx];
+ }
+ }
+
+ aZoneFindBins.stop();
+
+ size_t nSum = 0;
+ // Store each bin's starting position in maBinEnds array for now.
+ for (size_t nIdx = 0; nIdx < mnTreeArraySize; ++nIdx)
+ {
+ size_t nSize = maBinEnds[nIdx];
+ maBinEnds[nIdx] = nSum;
+ nSum += nSize;
+ }
+
+ // Now maBinEnds has end positions of each bin.
+ }
+
+ void bin(const RandItr aBegin, const RandItr aEnd, ValueType* pOut)
+ {
+ ProfileZone aZone("\tbin()");
+ const size_t nLen = static_cast<std::size_t>(aEnd - aBegin);
+ uint8_t* pLabelsRaw = (nLen > mnMaxStaticSize) ? pLabels.get() : maLabels;
+ size_t nIdx;
+ for (nIdx = 0; nIdx < nLen; ++nIdx)
+ {
+ pOut[maBinEnds[pLabelsRaw[nIdx]]++] = *(aBegin + nIdx);
+ }
+ }
+};
+
+template <class RandItr, class Compare = std::less<>>
+void s3sort(const RandItr aBegin, const RandItr aEnd, Compare aComp = Compare(),
+ bool bThreaded = true)
+{
+ static size_t nThreadCount = nThreadCountGlobal;
+
+ constexpr size_t nBaseCaseSize = 1024;
+ const std::size_t nLen = static_cast<std::size_t>(aEnd - aBegin);
+ if (nLen < nBaseCaseSize)
+ {
+ std::sort(aBegin, aEnd, aComp);
+ return;
+ }
+
+ using ValueType = typename std::iterator_traits<RandItr>::value_type;
+ auto pOut = std::make_unique<ValueType[]>(nLen);
+
+ const size_t nBins = lcl_round_down_pow2(nThreadCount);
+ const size_t nOverSamplingFactor = std::max(1.0, std::sqrt(static_cast<double>(nLen) / 64));
+ const size_t nSamples = nOverSamplingFactor * nBins;
+ auto aSamples = std::make_unique<ValueType[]>(nSamples);
+ ProfileZone aZoneSampleAnsSort("SampleAndSort");
+ // Select samples and sort them
+ Sampler<RandItr>::sample(aBegin, aEnd, aSamples.get(), nSamples, nBins);
+ std::sort(aSamples.get(), aSamples.get() + nSamples, aComp);
+ aZoneSampleAnsSort.stop();
+
+ if (!aComp(aSamples[0], aSamples[nSamples - 1]))
+ {
+ // All samples are equal, fallback to standard sort.
+ std::sort(aBegin, aEnd, aComp);
+ return;
+ }
+
+ ProfileZone aZoneBinner("Binner");
+ // Create and populate bins using pOut from input iterators.
+ Binner<RandItr, Compare> aBinner(aSamples.get(), nSamples, nBins, bThreaded);
+ aBinner.label(aBegin, aEnd, aComp);
+ aBinner.bin(aBegin, aEnd, pOut.get());
+ aZoneBinner.stop();
+
+ ProfileZone aZoneSortBins("SortBins");
+ ValueType* pOutRaw = pOut.get();
+ if (bThreaded)
+ {
+ ParallelRunner aPRunner;
+ // Sort the bins separately.
+ for (size_t nBinIdx = 0, nBinStart = 0; nBinIdx < nBins; ++nBinIdx)
+ {
+ size_t nBinEnd = aBinner.maBinEnds[nBinIdx];
+ aPRunner.enqueue([pOutRaw, nBinStart, nBinEnd, &aComp] {
+ std::sort(pOutRaw + nBinStart, pOutRaw + nBinEnd, aComp);
+ });
+
+ nBinStart = nBinEnd;
+ }
+
+ aPRunner.wait();
+ }
+ else
+ {
+ for (size_t nBinIdx = 0, nBinStart = 0; nBinIdx < nBins; ++nBinIdx)
+ {
+ auto nBinEnd = aBinner.maBinEnds[nBinIdx];
+ std::sort(pOutRaw + nBinStart, pOutRaw + nBinEnd, aComp);
+ nBinStart = nBinEnd;
+ }
+ }
+
+ aZoneSortBins.stop();
+
+ // Move the sorted array to the array specified by input iterators.
+ std::move(pOutRaw, pOutRaw + nLen, aBegin);
+}
+
+} // anonymous namespace
+
+template <class RandItr, class Compare = std::less<>>
+void parallelSort(const RandItr aBegin, const RandItr aEnd, Compare aComp = Compare())
+{
+ assert(aBegin <= aEnd);
+ s3sort(aBegin, aEnd, aComp);
+}
+
+} // namespace comphelper
+
+#endif // INCLUDED_COMPHELPER_PARALLELSORT_HXX
+
+/* vim:set shiftwidth=4 softtabstop=4 expandtab: */