diff options
Diffstat (limited to 'ml/kmeans')
-rw-r--r-- | ml/kmeans/KMeans.cc | 55 | ||||
-rw-r--r-- | ml/kmeans/KMeans.h | 34 | ||||
-rw-r--r-- | ml/kmeans/Makefile.am | 4 | ||||
-rw-r--r-- | ml/kmeans/SamplesBuffer.cc | 144 | ||||
-rw-r--r-- | ml/kmeans/SamplesBuffer.h | 140 | ||||
-rw-r--r-- | ml/kmeans/Tests.cc | 143 |
6 files changed, 520 insertions, 0 deletions
diff --git a/ml/kmeans/KMeans.cc b/ml/kmeans/KMeans.cc new file mode 100644 index 00000000..e66c66c1 --- /dev/null +++ b/ml/kmeans/KMeans.cc @@ -0,0 +1,55 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "KMeans.h" +#include <dlib/clustering.h> + +void KMeans::train(SamplesBuffer &SB, size_t MaxIterations) { + std::vector<DSample> Samples = SB.preprocess(); + + MinDist = std::numeric_limits<CalculatedNumber>::max(); + MaxDist = std::numeric_limits<CalculatedNumber>::min(); + + { + std::lock_guard<std::mutex> Lock(Mutex); + + ClusterCenters.clear(); + + dlib::pick_initial_centers(NumClusters, ClusterCenters, Samples); + dlib::find_clusters_using_kmeans(Samples, ClusterCenters, MaxIterations); + + for (const auto &S : Samples) { + CalculatedNumber MeanDist = 0.0; + + for (const auto &KMCenter : ClusterCenters) + MeanDist += dlib::length(KMCenter - S); + + MeanDist /= NumClusters; + + if (MeanDist < MinDist) + MinDist = MeanDist; + + if (MeanDist > MaxDist) + MaxDist = MeanDist; + } + } +} + +CalculatedNumber KMeans::anomalyScore(SamplesBuffer &SB) { + std::vector<DSample> DSamples = SB.preprocess(); + + std::unique_lock<std::mutex> Lock(Mutex, std::defer_lock); + if (!Lock.try_lock()) + return std::numeric_limits<CalculatedNumber>::quiet_NaN(); + + CalculatedNumber MeanDist = 0.0; + for (const auto &CC: ClusterCenters) + MeanDist += dlib::length(CC - DSamples.back()); + + MeanDist /= NumClusters; + + if (MaxDist == MinDist) + return 0.0; + + CalculatedNumber AnomalyScore = 100.0 * std::abs((MeanDist - MinDist) / (MaxDist - MinDist)); + return (AnomalyScore > 100.0) ? 100.0 : AnomalyScore; +} diff --git a/ml/kmeans/KMeans.h b/ml/kmeans/KMeans.h new file mode 100644 index 00000000..4ea3b6a8 --- /dev/null +++ b/ml/kmeans/KMeans.h @@ -0,0 +1,34 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef KMEANS_H +#define KMEANS_H + +#include <atomic> +#include <vector> +#include <limits> +#include <mutex> + +#include "SamplesBuffer.h" + +class KMeans { +public: + KMeans(size_t NumClusters = 2) : NumClusters(NumClusters) { + MinDist = std::numeric_limits<CalculatedNumber>::max(); + MaxDist = std::numeric_limits<CalculatedNumber>::min(); + }; + + void train(SamplesBuffer &SB, size_t MaxIterations); + CalculatedNumber anomalyScore(SamplesBuffer &SB); + +private: + size_t NumClusters; + + std::vector<DSample> ClusterCenters; + + CalculatedNumber MinDist; + CalculatedNumber MaxDist; + + std::mutex Mutex; +}; + +#endif /* KMEANS_H */ diff --git a/ml/kmeans/Makefile.am b/ml/kmeans/Makefile.am new file mode 100644 index 00000000..babdcf0d --- /dev/null +++ b/ml/kmeans/Makefile.am @@ -0,0 +1,4 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +AUTOMAKE_OPTIONS = subdir-objects +MAINTAINERCLEANFILES = $(srcdir)/Makefile.in diff --git a/ml/kmeans/SamplesBuffer.cc b/ml/kmeans/SamplesBuffer.cc new file mode 100644 index 00000000..f8211fb5 --- /dev/null +++ b/ml/kmeans/SamplesBuffer.cc @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-3.0-or-later +// +#include "SamplesBuffer.h" + +#include <fstream> +#include <sstream> +#include <string> + +void Sample::print(std::ostream &OS) const { + for (size_t Idx = 0; Idx != NumDims - 1; Idx++) + OS << CNs[Idx] << ", "; + + OS << CNs[NumDims - 1]; +} + +void SamplesBuffer::print(std::ostream &OS) const { + for (size_t Idx = Preprocessed ? (DiffN + (SmoothN - 1) + (LagN)) : 0; + Idx != NumSamples; Idx++) { + Sample S = Preprocessed ? getPreprocessedSample(Idx) : getSample(Idx); + OS << S << std::endl; + } +} + +std::vector<Sample> SamplesBuffer::getPreprocessedSamples() const { + std::vector<Sample> V; + + for (size_t Idx = Preprocessed ? (DiffN + (SmoothN - 1) + (LagN)) : 0; + Idx != NumSamples; Idx++) { + Sample S = Preprocessed ? getPreprocessedSample(Idx) : getSample(Idx); + V.push_back(S); + } + + return V; +} + +void SamplesBuffer::diffSamples() { + // Panda's DataFrame default behaviour is to subtract each element from + // itself. For us `DiffN = 0` means "disable diff-ing" when preprocessing + // the samples buffer. This deviation will make it easier for us to test + // the KMeans implementation. + if (DiffN == 0) + return; + + for (size_t Idx = 0; Idx != (NumSamples - DiffN); Idx++) { + size_t High = (NumSamples - 1) - Idx; + size_t Low = High - DiffN; + + Sample LHS = getSample(High); + Sample RHS = getSample(Low); + + LHS.diff(RHS); + } +} + +void SamplesBuffer::smoothSamples() { + // Holds the mean value of each window + CalculatedNumber *AccCNs = new CalculatedNumber[NumDimsPerSample](); + Sample Acc(AccCNs, NumDimsPerSample); + + // Used to avoid clobbering the accumulator when moving the window + CalculatedNumber *TmpCNs = new CalculatedNumber[NumDimsPerSample](); + Sample Tmp(TmpCNs, NumDimsPerSample); + + CalculatedNumber Factor = (CalculatedNumber) 1 / SmoothN; + + // Calculate the value of the 1st window + for (size_t Idx = 0; Idx != std::min(SmoothN, NumSamples); Idx++) { + Tmp.add(getSample(NumSamples - (Idx + 1))); + } + + Acc.add(Tmp); + Acc.scale(Factor); + + // Move the window and update the samples + for (size_t Idx = NumSamples; Idx != (DiffN + SmoothN - 1); Idx--) { + Sample S = getSample(Idx - 1); + + // Tmp <- Next window (if any) + if (Idx >= (SmoothN + 1)) { + Tmp.diff(S); + Tmp.add(getSample(Idx - (SmoothN + 1))); + } + + // S <- Acc + S.copy(Acc); + + // Acc <- Tmp + Acc.copy(Tmp); + Acc.scale(Factor); + } + + delete[] AccCNs; + delete[] TmpCNs; +} + +void SamplesBuffer::lagSamples() { + if (LagN == 0) + return; + + for (size_t Idx = NumSamples; Idx != LagN; Idx--) { + Sample PS = getPreprocessedSample(Idx - 1); + PS.lag(getSample(Idx - 1), LagN); + } +} + +std::vector<DSample> SamplesBuffer::preprocess() { + assert(Preprocessed == false); + + std::vector<DSample> DSamples; + size_t OutN = NumSamples; + + // Diff + if (DiffN >= OutN) + return DSamples; + OutN -= DiffN; + diffSamples(); + + // Smooth + if (SmoothN == 0 || SmoothN > OutN) + return DSamples; + OutN -= (SmoothN - 1); + smoothSamples(); + + // Lag + if (LagN >= OutN) + return DSamples; + OutN -= LagN; + lagSamples(); + + DSamples.reserve(OutN); + Preprocessed = true; + + for (size_t Idx = NumSamples - OutN; Idx != NumSamples; Idx++) { + DSample DS; + DS.set_size(NumDimsPerSample * (LagN + 1)); + + const Sample PS = getPreprocessedSample(Idx); + PS.initDSample(DS); + + DSamples.push_back(DS); + } + + return DSamples; +} diff --git a/ml/kmeans/SamplesBuffer.h b/ml/kmeans/SamplesBuffer.h new file mode 100644 index 00000000..fccd216d --- /dev/null +++ b/ml/kmeans/SamplesBuffer.h @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#ifndef SAMPLES_BUFFER_H +#define SAMPLES_BUFFER_H + +#include <iostream> +#include <vector> + +#include <cassert> +#include <cstdlib> +#include <cstring> + +#include <dlib/matrix.h> + +typedef double CalculatedNumber; +typedef dlib::matrix<CalculatedNumber, 0, 1> DSample; + +class Sample { +public: + Sample(CalculatedNumber *Buf, size_t N) : CNs(Buf), NumDims(N) {} + + void initDSample(DSample &DS) const { + for (size_t Idx = 0; Idx != NumDims; Idx++) + DS(Idx) = CNs[Idx]; + } + + void add(const Sample &RHS) const { + assert(NumDims == RHS.NumDims); + + for (size_t Idx = 0; Idx != NumDims; Idx++) + CNs[Idx] += RHS.CNs[Idx]; + }; + + void diff(const Sample &RHS) const { + assert(NumDims == RHS.NumDims); + + for (size_t Idx = 0; Idx != NumDims; Idx++) + CNs[Idx] -= RHS.CNs[Idx]; + }; + + void copy(const Sample &RHS) const { + assert(NumDims == RHS.NumDims); + + std::memcpy(CNs, RHS.CNs, NumDims * sizeof(CalculatedNumber)); + } + + void scale(CalculatedNumber Factor) { + for (size_t Idx = 0; Idx != NumDims; Idx++) + CNs[Idx] *= Factor; + } + + void lag(const Sample &S, size_t LagN) { + size_t N = S.NumDims; + + for (size_t Idx = 0; Idx != (LagN + 1); Idx++) { + Sample Src(S.CNs - (Idx * N), N); + Sample Dst(CNs + (Idx * N), N); + Dst.copy(Src); + } + } + + const CalculatedNumber *getCalculatedNumbers() const { + return CNs; + }; + + void print(std::ostream &OS) const; + +private: + CalculatedNumber *CNs; + size_t NumDims; +}; + +inline std::ostream& operator<<(std::ostream &OS, const Sample &S) { + S.print(OS); + return OS; +} + +class SamplesBuffer { +public: + SamplesBuffer(CalculatedNumber *CNs, + size_t NumSamples, size_t NumDimsPerSample, + size_t DiffN = 1, size_t SmoothN = 3, size_t LagN = 3) : + CNs(CNs), NumSamples(NumSamples), NumDimsPerSample(NumDimsPerSample), + DiffN(DiffN), SmoothN(SmoothN), LagN(LagN), + BytesPerSample(NumDimsPerSample * sizeof(CalculatedNumber)), + Preprocessed(false) {}; + + std::vector<DSample> preprocess(); + std::vector<Sample> getPreprocessedSamples() const; + + size_t capacity() const { return NumSamples; } + void print(std::ostream &OS) const; + +private: + size_t getSampleOffset(size_t Index) const { + assert(Index < NumSamples); + return Index * NumDimsPerSample; + } + + size_t getPreprocessedSampleOffset(size_t Index) const { + assert(Index < NumSamples); + return getSampleOffset(Index) * (LagN + 1); + } + + void setSample(size_t Index, const Sample &S) const { + size_t Offset = getSampleOffset(Index); + std::memcpy(&CNs[Offset], S.getCalculatedNumbers(), BytesPerSample); + } + + const Sample getSample(size_t Index) const { + size_t Offset = getSampleOffset(Index); + return Sample(&CNs[Offset], NumDimsPerSample); + }; + + const Sample getPreprocessedSample(size_t Index) const { + size_t Offset = getPreprocessedSampleOffset(Index); + return Sample(&CNs[Offset], NumDimsPerSample * (LagN + 1)); + }; + + void diffSamples(); + void smoothSamples(); + void lagSamples(); + +private: + CalculatedNumber *CNs; + size_t NumSamples; + size_t NumDimsPerSample; + size_t DiffN; + size_t SmoothN; + size_t LagN; + size_t BytesPerSample; + bool Preprocessed; +}; + +inline std::ostream& operator<<(std::ostream& OS, const SamplesBuffer &SB) { + SB.print(OS); + return OS; +} + +#endif /* SAMPLES_BUFFER_H */ diff --git a/ml/kmeans/Tests.cc b/ml/kmeans/Tests.cc new file mode 100644 index 00000000..0cb59594 --- /dev/null +++ b/ml/kmeans/Tests.cc @@ -0,0 +1,143 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "ml/ml-private.h" +#include <gtest/gtest.h> + +/* + * The SamplesBuffer class implements the functionality of the following python + * code: + * >> df = pd.DataFrame(data=samples) + * >> df = df.diff(diff_n).dropna() + * >> df = df.rolling(smooth_n).mean().dropna() + * >> df = pd.concat([df.shift(n) for n in range(lag_n + 1)], axis=1).dropna() + * + * Its correctness has been verified by automatically generating random + * data frames in Python and comparing them with the correspondent preprocessed + * SampleBuffers. + * + * The following tests are meant to catch unintended changes in the SamplesBuffer + * implementation. For development purposes, one should compare changes against + * the aforementioned python code. +*/ + +TEST(SamplesBufferTest, NS_8_NDPS_1_DN_1_SN_3_LN_1) { + size_t NumSamples = 8, NumDimsPerSample = 1; + size_t DiffN = 1, SmoothN = 3, LagN = 3; + + size_t N = NumSamples * NumDimsPerSample * (LagN + 1); + CalculatedNumber *CNs = new CalculatedNumber[N](); + + CNs[0] = 0.7568336679490107; + CNs[1] = 0.4814406581763254; + CNs[2] = 0.40073555156221874; + CNs[3] = 0.5973257298194408; + CNs[4] = 0.5334727814345868; + CNs[5] = 0.2632477193454843; + CNs[6] = 0.2684839023122384; + CNs[7] = 0.851332948637479; + + SamplesBuffer SB(CNs, NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN); + SB.preprocess(); + + std::vector<Sample> Samples = SB.getPreprocessedSamples(); + EXPECT_EQ(Samples.size(), 2); + + Sample S0 = Samples[0]; + const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers(); + Sample S1 = Samples[1]; + const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers(); + + EXPECT_NEAR(S0_CNs[0], -0.109614, 0.001); + EXPECT_NEAR(S0_CNs[1], -0.0458293, 0.001); + EXPECT_NEAR(S0_CNs[2], 0.017344, 0.001); + EXPECT_NEAR(S0_CNs[3], -0.0531693, 0.001); + + EXPECT_NEAR(S1_CNs[0], 0.105953, 0.001); + EXPECT_NEAR(S1_CNs[1], -0.109614, 0.001); + EXPECT_NEAR(S1_CNs[2], -0.0458293, 0.001); + EXPECT_NEAR(S1_CNs[3], 0.017344, 0.001); + + delete[] CNs; +} + +TEST(SamplesBufferTest, NS_8_NDPS_1_DN_2_SN_3_LN_2) { + size_t NumSamples = 8, NumDimsPerSample = 1; + size_t DiffN = 2, SmoothN = 3, LagN = 2; + + size_t N = NumSamples * NumDimsPerSample * (LagN + 1); + CalculatedNumber *CNs = new CalculatedNumber[N](); + + CNs[0] = 0.20511885291342846; + CNs[1] = 0.13151717360306558; + CNs[2] = 0.6017085062423134; + CNs[3] = 0.46256882933941545; + CNs[4] = 0.7887758447877941; + CNs[5] = 0.9237989080034406; + CNs[6] = 0.15552559051428083; + CNs[7] = 0.6309750314597955; + + SamplesBuffer SB(CNs, NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN); + SB.preprocess(); + + std::vector<Sample> Samples = SB.getPreprocessedSamples(); + EXPECT_EQ(Samples.size(), 2); + + Sample S0 = Samples[0]; + const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers(); + Sample S1 = Samples[1]; + const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers(); + + EXPECT_NEAR(S0_CNs[0], 0.005016, 0.001); + EXPECT_NEAR(S0_CNs[1], 0.326450, 0.001); + EXPECT_NEAR(S0_CNs[2], 0.304903, 0.001); + + EXPECT_NEAR(S1_CNs[0], -0.154948, 0.001); + EXPECT_NEAR(S1_CNs[1], 0.005016, 0.001); + EXPECT_NEAR(S1_CNs[2], 0.326450, 0.001); + + delete[] CNs; +} + +TEST(SamplesBufferTest, NS_8_NDPS_3_DN_2_SN_4_LN_1) { + size_t NumSamples = 8, NumDimsPerSample = 3; + size_t DiffN = 2, SmoothN = 4, LagN = 1; + + size_t N = NumSamples * NumDimsPerSample * (LagN + 1); + CalculatedNumber *CNs = new CalculatedNumber[N](); + + CNs[0] = 0.34310900399667765; CNs[1] = 0.14694315994488194; CNs[2] = 0.8246677800938796; + CNs[3] = 0.48249504592307835; CNs[4] = 0.23241087965531182; CNs[5] = 0.9595348555892567; + CNs[6] = 0.44281094035598334; CNs[7] = 0.5143142171362715; CNs[8] = 0.06391303014242555; + CNs[9] = 0.7460491027783901; CNs[10] = 0.43887217459032923; CNs[11] = 0.2814395025355999; + CNs[12] = 0.9231114281214198; CNs[13] = 0.326882401786898; CNs[14] = 0.26747939220376216; + CNs[15] = 0.7787571209969636; CNs[16] =0.5851700001235088; CNs[17] = 0.34410728945321567; + CNs[18] = 0.9394494507088997; CNs[19] =0.17567223681734334; CNs[20] = 0.42732886195446984; + CNs[21] = 0.9460522396152958; CNs[22] =0.23462747016780894; CNs[23] = 0.35983249900892145; + + SamplesBuffer SB(CNs, NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN); + SB.preprocess(); + + std::vector<Sample> Samples = SB.getPreprocessedSamples(); + EXPECT_EQ(Samples.size(), 2); + + Sample S0 = Samples[0]; + const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers(); + Sample S1 = Samples[1]; + const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers(); + + EXPECT_NEAR(S0_CNs[0], 0.198225, 0.001); + EXPECT_NEAR(S0_CNs[1], 0.003529, 0.001); + EXPECT_NEAR(S0_CNs[2], -0.063003, 0.001); + EXPECT_NEAR(S0_CNs[3], 0.219066, 0.001); + EXPECT_NEAR(S0_CNs[4], 0.133175, 0.001); + EXPECT_NEAR(S0_CNs[5], -0.293154, 0.001); + + EXPECT_NEAR(S1_CNs[0], 0.174160, 0.001); + EXPECT_NEAR(S1_CNs[1], -0.135722, 0.001); + EXPECT_NEAR(S1_CNs[2], 0.110452, 0.001); + EXPECT_NEAR(S1_CNs[3], 0.198225, 0.001); + EXPECT_NEAR(S1_CNs[4], 0.003529, 0.001); + EXPECT_NEAR(S1_CNs[5], -0.063003, 0.001); + + delete[] CNs; +} |