diff options
Diffstat (limited to 'ml/kmeans')
-rw-r--r-- | ml/kmeans/KMeans.cc | 55 | ||||
-rw-r--r-- | ml/kmeans/KMeans.h | 34 | ||||
-rw-r--r-- | ml/kmeans/Makefile.am | 4 | ||||
-rw-r--r-- | ml/kmeans/SamplesBuffer.cc | 150 | ||||
-rw-r--r-- | ml/kmeans/SamplesBuffer.h | 146 | ||||
-rw-r--r-- | ml/kmeans/Tests.cc | 143 |
6 files changed, 0 insertions, 532 deletions
diff --git a/ml/kmeans/KMeans.cc b/ml/kmeans/KMeans.cc deleted file mode 100644 index e66c66c16..000000000 --- a/ml/kmeans/KMeans.cc +++ /dev/null @@ -1,55 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#include "KMeans.h" -#include <dlib/clustering.h> - -void KMeans::train(SamplesBuffer &SB, size_t MaxIterations) { - std::vector<DSample> Samples = SB.preprocess(); - - MinDist = std::numeric_limits<CalculatedNumber>::max(); - MaxDist = std::numeric_limits<CalculatedNumber>::min(); - - { - std::lock_guard<std::mutex> Lock(Mutex); - - ClusterCenters.clear(); - - dlib::pick_initial_centers(NumClusters, ClusterCenters, Samples); - dlib::find_clusters_using_kmeans(Samples, ClusterCenters, MaxIterations); - - for (const auto &S : Samples) { - CalculatedNumber MeanDist = 0.0; - - for (const auto &KMCenter : ClusterCenters) - MeanDist += dlib::length(KMCenter - S); - - MeanDist /= NumClusters; - - if (MeanDist < MinDist) - MinDist = MeanDist; - - if (MeanDist > MaxDist) - MaxDist = MeanDist; - } - } -} - -CalculatedNumber KMeans::anomalyScore(SamplesBuffer &SB) { - std::vector<DSample> DSamples = SB.preprocess(); - - std::unique_lock<std::mutex> Lock(Mutex, std::defer_lock); - if (!Lock.try_lock()) - return std::numeric_limits<CalculatedNumber>::quiet_NaN(); - - CalculatedNumber MeanDist = 0.0; - for (const auto &CC: ClusterCenters) - MeanDist += dlib::length(CC - DSamples.back()); - - MeanDist /= NumClusters; - - if (MaxDist == MinDist) - return 0.0; - - CalculatedNumber AnomalyScore = 100.0 * std::abs((MeanDist - MinDist) / (MaxDist - MinDist)); - return (AnomalyScore > 100.0) ? 100.0 : AnomalyScore; -} diff --git a/ml/kmeans/KMeans.h b/ml/kmeans/KMeans.h deleted file mode 100644 index 4ea3b6a89..000000000 --- a/ml/kmeans/KMeans.h +++ /dev/null @@ -1,34 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef KMEANS_H -#define KMEANS_H - -#include <atomic> -#include <vector> -#include <limits> -#include <mutex> - -#include "SamplesBuffer.h" - -class KMeans { -public: - KMeans(size_t NumClusters = 2) : NumClusters(NumClusters) { - MinDist = std::numeric_limits<CalculatedNumber>::max(); - MaxDist = std::numeric_limits<CalculatedNumber>::min(); - }; - - void train(SamplesBuffer &SB, size_t MaxIterations); - CalculatedNumber anomalyScore(SamplesBuffer &SB); - -private: - size_t NumClusters; - - std::vector<DSample> ClusterCenters; - - CalculatedNumber MinDist; - CalculatedNumber MaxDist; - - std::mutex Mutex; -}; - -#endif /* KMEANS_H */ diff --git a/ml/kmeans/Makefile.am b/ml/kmeans/Makefile.am deleted file mode 100644 index babdcf0df..000000000 --- a/ml/kmeans/Makefile.am +++ /dev/null @@ -1,4 +0,0 @@ -# SPDX-License-Identifier: GPL-3.0-or-later - -AUTOMAKE_OPTIONS = subdir-objects -MAINTAINERCLEANFILES = $(srcdir)/Makefile.in diff --git a/ml/kmeans/SamplesBuffer.cc b/ml/kmeans/SamplesBuffer.cc deleted file mode 100644 index d276c6e09..000000000 --- a/ml/kmeans/SamplesBuffer.cc +++ /dev/null @@ -1,150 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later -// -#include "SamplesBuffer.h" - -#include <fstream> -#include <sstream> -#include <string> - -void Sample::print(std::ostream &OS) const { - for (size_t Idx = 0; Idx != NumDims - 1; Idx++) - OS << CNs[Idx] << ", "; - - OS << CNs[NumDims - 1]; -} - -void SamplesBuffer::print(std::ostream &OS) const { - for (size_t Idx = Preprocessed ? (DiffN + (SmoothN - 1) + (LagN)) : 0; - Idx != NumSamples; Idx++) { - Sample S = Preprocessed ? getPreprocessedSample(Idx) : getSample(Idx); - OS << S << std::endl; - } -} - -std::vector<Sample> SamplesBuffer::getPreprocessedSamples() const { - std::vector<Sample> V; - - for (size_t Idx = Preprocessed ? (DiffN + (SmoothN - 1) + (LagN)) : 0; - Idx != NumSamples; Idx++) { - Sample S = Preprocessed ? getPreprocessedSample(Idx) : getSample(Idx); - V.push_back(S); - } - - return V; -} - -void SamplesBuffer::diffSamples() { - // Panda's DataFrame default behaviour is to subtract each element from - // itself. For us `DiffN = 0` means "disable diff-ing" when preprocessing - // the samples buffer. This deviation will make it easier for us to test - // the KMeans implementation. - if (DiffN == 0) - return; - - for (size_t Idx = 0; Idx != (NumSamples - DiffN); Idx++) { - size_t High = (NumSamples - 1) - Idx; - size_t Low = High - DiffN; - - Sample LHS = getSample(High); - Sample RHS = getSample(Low); - - LHS.diff(RHS); - } -} - -void SamplesBuffer::smoothSamples() { - // Holds the mean value of each window - CalculatedNumber *AccCNs = new CalculatedNumber[NumDimsPerSample](); - Sample Acc(AccCNs, NumDimsPerSample); - - // Used to avoid clobbering the accumulator when moving the window - CalculatedNumber *TmpCNs = new CalculatedNumber[NumDimsPerSample](); - Sample Tmp(TmpCNs, NumDimsPerSample); - - CalculatedNumber Factor = (CalculatedNumber) 1 / SmoothN; - - // Calculate the value of the 1st window - for (size_t Idx = 0; Idx != std::min(SmoothN, NumSamples); Idx++) { - Tmp.add(getSample(NumSamples - (Idx + 1))); - } - - Acc.add(Tmp); - Acc.scale(Factor); - - // Move the window and update the samples - for (size_t Idx = NumSamples; Idx != (DiffN + SmoothN - 1); Idx--) { - Sample S = getSample(Idx - 1); - - // Tmp <- Next window (if any) - if (Idx >= (SmoothN + 1)) { - Tmp.diff(S); - Tmp.add(getSample(Idx - (SmoothN + 1))); - } - - // S <- Acc - S.copy(Acc); - - // Acc <- Tmp - Acc.copy(Tmp); - Acc.scale(Factor); - } - - delete[] AccCNs; - delete[] TmpCNs; -} - -void SamplesBuffer::lagSamples() { - if (LagN == 0) - return; - - for (size_t Idx = NumSamples; Idx != LagN; Idx--) { - Sample PS = getPreprocessedSample(Idx - 1); - PS.lag(getSample(Idx - 1), LagN); - } -} - -std::vector<DSample> SamplesBuffer::preprocess() { - assert(Preprocessed == false); - - std::vector<DSample> DSamples; - size_t OutN = NumSamples; - - // Diff - if (DiffN >= OutN) - return DSamples; - OutN -= DiffN; - diffSamples(); - - // Smooth - if (SmoothN == 0 || SmoothN > OutN) - return DSamples; - OutN -= (SmoothN - 1); - smoothSamples(); - - // Lag - if (LagN >= OutN) - return DSamples; - OutN -= LagN; - lagSamples(); - - DSamples.reserve(OutN); - Preprocessed = true; - - uint32_t MaxMT = std::numeric_limits<uint32_t>::max(); - uint32_t CutOff = static_cast<double>(MaxMT) * SamplingRatio; - - for (size_t Idx = NumSamples - OutN; Idx != NumSamples; Idx++) { - if (RandNums[Idx] > CutOff) - continue; - - DSample DS; - DS.set_size(NumDimsPerSample * (LagN + 1)); - - const Sample PS = getPreprocessedSample(Idx); - PS.initDSample(DS); - - DSamples.push_back(DS); - } - - return DSamples; -} diff --git a/ml/kmeans/SamplesBuffer.h b/ml/kmeans/SamplesBuffer.h deleted file mode 100644 index 1c7215cca..000000000 --- a/ml/kmeans/SamplesBuffer.h +++ /dev/null @@ -1,146 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#ifndef SAMPLES_BUFFER_H -#define SAMPLES_BUFFER_H - -#include <iostream> -#include <vector> - -#include <cassert> -#include <cstdlib> -#include <cstring> - -#include <dlib/matrix.h> - -typedef double CalculatedNumber; -typedef dlib::matrix<CalculatedNumber, 0, 1> DSample; - -class Sample { -public: - Sample(CalculatedNumber *Buf, size_t N) : CNs(Buf), NumDims(N) {} - - void initDSample(DSample &DS) const { - for (size_t Idx = 0; Idx != NumDims; Idx++) { - DS(Idx) = std::abs(CNs[Idx]); - } - } - - void add(const Sample &RHS) const { - assert(NumDims == RHS.NumDims); - - for (size_t Idx = 0; Idx != NumDims; Idx++) - CNs[Idx] += RHS.CNs[Idx]; - }; - - void diff(const Sample &RHS) const { - assert(NumDims == RHS.NumDims); - - for (size_t Idx = 0; Idx != NumDims; Idx++) - CNs[Idx] -= RHS.CNs[Idx]; - }; - - void copy(const Sample &RHS) const { - assert(NumDims == RHS.NumDims); - - std::memcpy(CNs, RHS.CNs, NumDims * sizeof(CalculatedNumber)); - } - - void scale(CalculatedNumber Factor) { - for (size_t Idx = 0; Idx != NumDims; Idx++) - CNs[Idx] *= Factor; - } - - void lag(const Sample &S, size_t LagN) { - size_t N = S.NumDims; - - for (size_t Idx = 0; Idx != (LagN + 1); Idx++) { - Sample Src(S.CNs - (Idx * N), N); - Sample Dst(CNs + (Idx * N), N); - Dst.copy(Src); - } - } - - const CalculatedNumber *getCalculatedNumbers() const { - return CNs; - }; - - void print(std::ostream &OS) const; - -private: - CalculatedNumber *CNs; - size_t NumDims; -}; - -inline std::ostream& operator<<(std::ostream &OS, const Sample &S) { - S.print(OS); - return OS; -} - -class SamplesBuffer { -public: - SamplesBuffer(CalculatedNumber *CNs, - size_t NumSamples, size_t NumDimsPerSample, - size_t DiffN, size_t SmoothN, size_t LagN, - double SamplingRatio, std::vector<uint32_t> &RandNums) : - CNs(CNs), NumSamples(NumSamples), NumDimsPerSample(NumDimsPerSample), - DiffN(DiffN), SmoothN(SmoothN), LagN(LagN), - SamplingRatio(SamplingRatio), RandNums(RandNums), - BytesPerSample(NumDimsPerSample * sizeof(CalculatedNumber)), - Preprocessed(false) {}; - - std::vector<DSample> preprocess(); - std::vector<Sample> getPreprocessedSamples() const; - - size_t capacity() const { return NumSamples; } - void print(std::ostream &OS) const; - -private: - size_t getSampleOffset(size_t Index) const { - assert(Index < NumSamples); - return Index * NumDimsPerSample; - } - - size_t getPreprocessedSampleOffset(size_t Index) const { - assert(Index < NumSamples); - return getSampleOffset(Index) * (LagN + 1); - } - - void setSample(size_t Index, const Sample &S) const { - size_t Offset = getSampleOffset(Index); - std::memcpy(&CNs[Offset], S.getCalculatedNumbers(), BytesPerSample); - } - - const Sample getSample(size_t Index) const { - size_t Offset = getSampleOffset(Index); - return Sample(&CNs[Offset], NumDimsPerSample); - }; - - const Sample getPreprocessedSample(size_t Index) const { - size_t Offset = getPreprocessedSampleOffset(Index); - return Sample(&CNs[Offset], NumDimsPerSample * (LagN + 1)); - }; - - void diffSamples(); - void smoothSamples(); - void lagSamples(); - -private: - CalculatedNumber *CNs; - size_t NumSamples; - size_t NumDimsPerSample; - size_t DiffN; - size_t SmoothN; - size_t LagN; - double SamplingRatio; - std::vector<uint32_t> &RandNums; - - size_t BytesPerSample; - bool Preprocessed; -}; - -inline std::ostream& operator<<(std::ostream& OS, const SamplesBuffer &SB) { - SB.print(OS); - return OS; -} - -#endif /* SAMPLES_BUFFER_H */ diff --git a/ml/kmeans/Tests.cc b/ml/kmeans/Tests.cc deleted file mode 100644 index 0cb595945..000000000 --- a/ml/kmeans/Tests.cc +++ /dev/null @@ -1,143 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -#include "ml/ml-private.h" -#include <gtest/gtest.h> - -/* - * The SamplesBuffer class implements the functionality of the following python - * code: - * >> df = pd.DataFrame(data=samples) - * >> df = df.diff(diff_n).dropna() - * >> df = df.rolling(smooth_n).mean().dropna() - * >> df = pd.concat([df.shift(n) for n in range(lag_n + 1)], axis=1).dropna() - * - * Its correctness has been verified by automatically generating random - * data frames in Python and comparing them with the correspondent preprocessed - * SampleBuffers. - * - * The following tests are meant to catch unintended changes in the SamplesBuffer - * implementation. For development purposes, one should compare changes against - * the aforementioned python code. -*/ - -TEST(SamplesBufferTest, NS_8_NDPS_1_DN_1_SN_3_LN_1) { - size_t NumSamples = 8, NumDimsPerSample = 1; - size_t DiffN = 1, SmoothN = 3, LagN = 3; - - size_t N = NumSamples * NumDimsPerSample * (LagN + 1); - CalculatedNumber *CNs = new CalculatedNumber[N](); - - CNs[0] = 0.7568336679490107; - CNs[1] = 0.4814406581763254; - CNs[2] = 0.40073555156221874; - CNs[3] = 0.5973257298194408; - CNs[4] = 0.5334727814345868; - CNs[5] = 0.2632477193454843; - CNs[6] = 0.2684839023122384; - CNs[7] = 0.851332948637479; - - SamplesBuffer SB(CNs, NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN); - SB.preprocess(); - - std::vector<Sample> Samples = SB.getPreprocessedSamples(); - EXPECT_EQ(Samples.size(), 2); - - Sample S0 = Samples[0]; - const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers(); - Sample S1 = Samples[1]; - const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers(); - - EXPECT_NEAR(S0_CNs[0], -0.109614, 0.001); - EXPECT_NEAR(S0_CNs[1], -0.0458293, 0.001); - EXPECT_NEAR(S0_CNs[2], 0.017344, 0.001); - EXPECT_NEAR(S0_CNs[3], -0.0531693, 0.001); - - EXPECT_NEAR(S1_CNs[0], 0.105953, 0.001); - EXPECT_NEAR(S1_CNs[1], -0.109614, 0.001); - EXPECT_NEAR(S1_CNs[2], -0.0458293, 0.001); - EXPECT_NEAR(S1_CNs[3], 0.017344, 0.001); - - delete[] CNs; -} - -TEST(SamplesBufferTest, NS_8_NDPS_1_DN_2_SN_3_LN_2) { - size_t NumSamples = 8, NumDimsPerSample = 1; - size_t DiffN = 2, SmoothN = 3, LagN = 2; - - size_t N = NumSamples * NumDimsPerSample * (LagN + 1); - CalculatedNumber *CNs = new CalculatedNumber[N](); - - CNs[0] = 0.20511885291342846; - CNs[1] = 0.13151717360306558; - CNs[2] = 0.6017085062423134; - CNs[3] = 0.46256882933941545; - CNs[4] = 0.7887758447877941; - CNs[5] = 0.9237989080034406; - CNs[6] = 0.15552559051428083; - CNs[7] = 0.6309750314597955; - - SamplesBuffer SB(CNs, NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN); - SB.preprocess(); - - std::vector<Sample> Samples = SB.getPreprocessedSamples(); - EXPECT_EQ(Samples.size(), 2); - - Sample S0 = Samples[0]; - const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers(); - Sample S1 = Samples[1]; - const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers(); - - EXPECT_NEAR(S0_CNs[0], 0.005016, 0.001); - EXPECT_NEAR(S0_CNs[1], 0.326450, 0.001); - EXPECT_NEAR(S0_CNs[2], 0.304903, 0.001); - - EXPECT_NEAR(S1_CNs[0], -0.154948, 0.001); - EXPECT_NEAR(S1_CNs[1], 0.005016, 0.001); - EXPECT_NEAR(S1_CNs[2], 0.326450, 0.001); - - delete[] CNs; -} - -TEST(SamplesBufferTest, NS_8_NDPS_3_DN_2_SN_4_LN_1) { - size_t NumSamples = 8, NumDimsPerSample = 3; - size_t DiffN = 2, SmoothN = 4, LagN = 1; - - size_t N = NumSamples * NumDimsPerSample * (LagN + 1); - CalculatedNumber *CNs = new CalculatedNumber[N](); - - CNs[0] = 0.34310900399667765; CNs[1] = 0.14694315994488194; CNs[2] = 0.8246677800938796; - CNs[3] = 0.48249504592307835; CNs[4] = 0.23241087965531182; CNs[5] = 0.9595348555892567; - CNs[6] = 0.44281094035598334; CNs[7] = 0.5143142171362715; CNs[8] = 0.06391303014242555; - CNs[9] = 0.7460491027783901; CNs[10] = 0.43887217459032923; CNs[11] = 0.2814395025355999; - CNs[12] = 0.9231114281214198; CNs[13] = 0.326882401786898; CNs[14] = 0.26747939220376216; - CNs[15] = 0.7787571209969636; CNs[16] =0.5851700001235088; CNs[17] = 0.34410728945321567; - CNs[18] = 0.9394494507088997; CNs[19] =0.17567223681734334; CNs[20] = 0.42732886195446984; - CNs[21] = 0.9460522396152958; CNs[22] =0.23462747016780894; CNs[23] = 0.35983249900892145; - - SamplesBuffer SB(CNs, NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN); - SB.preprocess(); - - std::vector<Sample> Samples = SB.getPreprocessedSamples(); - EXPECT_EQ(Samples.size(), 2); - - Sample S0 = Samples[0]; - const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers(); - Sample S1 = Samples[1]; - const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers(); - - EXPECT_NEAR(S0_CNs[0], 0.198225, 0.001); - EXPECT_NEAR(S0_CNs[1], 0.003529, 0.001); - EXPECT_NEAR(S0_CNs[2], -0.063003, 0.001); - EXPECT_NEAR(S0_CNs[3], 0.219066, 0.001); - EXPECT_NEAR(S0_CNs[4], 0.133175, 0.001); - EXPECT_NEAR(S0_CNs[5], -0.293154, 0.001); - - EXPECT_NEAR(S1_CNs[0], 0.174160, 0.001); - EXPECT_NEAR(S1_CNs[1], -0.135722, 0.001); - EXPECT_NEAR(S1_CNs[2], 0.110452, 0.001); - EXPECT_NEAR(S1_CNs[3], 0.198225, 0.001); - EXPECT_NEAR(S1_CNs[4], 0.003529, 0.001); - EXPECT_NEAR(S1_CNs[5], -0.063003, 0.001); - - delete[] CNs; -} |