summaryrefslogtreecommitdiffstats
path: root/ml/kmeans
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2021-12-01 06:15:11 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2021-12-01 06:15:11 +0000
commit483926a283e118590da3f9ecfa75a8a4d62143ce (patch)
treecb77052778df9a128a8cd3ff5bf7645322a13bc5 /ml/kmeans
parentReleasing debian version 1.31.0-4. (diff)
downloadnetdata-483926a283e118590da3f9ecfa75a8a4d62143ce.tar.xz
netdata-483926a283e118590da3f9ecfa75a8a4d62143ce.zip
Merging upstream version 1.32.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r--ml/kmeans/KMeans.cc55
-rw-r--r--ml/kmeans/KMeans.h34
-rw-r--r--ml/kmeans/Makefile.am4
-rw-r--r--ml/kmeans/SamplesBuffer.cc144
-rw-r--r--ml/kmeans/SamplesBuffer.h140
-rw-r--r--ml/kmeans/Tests.cc143
6 files changed, 520 insertions, 0 deletions
diff --git a/ml/kmeans/KMeans.cc b/ml/kmeans/KMeans.cc
new file mode 100644
index 000000000..e66c66c16
--- /dev/null
+++ b/ml/kmeans/KMeans.cc
@@ -0,0 +1,55 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "KMeans.h"
+#include <dlib/clustering.h>
+
+void KMeans::train(SamplesBuffer &SB, size_t MaxIterations) {
+ std::vector<DSample> Samples = SB.preprocess();
+
+ MinDist = std::numeric_limits<CalculatedNumber>::max();
+ MaxDist = std::numeric_limits<CalculatedNumber>::min();
+
+ {
+ std::lock_guard<std::mutex> Lock(Mutex);
+
+ ClusterCenters.clear();
+
+ dlib::pick_initial_centers(NumClusters, ClusterCenters, Samples);
+ dlib::find_clusters_using_kmeans(Samples, ClusterCenters, MaxIterations);
+
+ for (const auto &S : Samples) {
+ CalculatedNumber MeanDist = 0.0;
+
+ for (const auto &KMCenter : ClusterCenters)
+ MeanDist += dlib::length(KMCenter - S);
+
+ MeanDist /= NumClusters;
+
+ if (MeanDist < MinDist)
+ MinDist = MeanDist;
+
+ if (MeanDist > MaxDist)
+ MaxDist = MeanDist;
+ }
+ }
+}
+
+CalculatedNumber KMeans::anomalyScore(SamplesBuffer &SB) {
+ std::vector<DSample> DSamples = SB.preprocess();
+
+ std::unique_lock<std::mutex> Lock(Mutex, std::defer_lock);
+ if (!Lock.try_lock())
+ return std::numeric_limits<CalculatedNumber>::quiet_NaN();
+
+ CalculatedNumber MeanDist = 0.0;
+ for (const auto &CC: ClusterCenters)
+ MeanDist += dlib::length(CC - DSamples.back());
+
+ MeanDist /= NumClusters;
+
+ if (MaxDist == MinDist)
+ return 0.0;
+
+ CalculatedNumber AnomalyScore = 100.0 * std::abs((MeanDist - MinDist) / (MaxDist - MinDist));
+ return (AnomalyScore > 100.0) ? 100.0 : AnomalyScore;
+}
diff --git a/ml/kmeans/KMeans.h b/ml/kmeans/KMeans.h
new file mode 100644
index 000000000..4ea3b6a89
--- /dev/null
+++ b/ml/kmeans/KMeans.h
@@ -0,0 +1,34 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef KMEANS_H
+#define KMEANS_H
+
+#include <atomic>
+#include <vector>
+#include <limits>
+#include <mutex>
+
+#include "SamplesBuffer.h"
+
+class KMeans {
+public:
+ KMeans(size_t NumClusters = 2) : NumClusters(NumClusters) {
+ MinDist = std::numeric_limits<CalculatedNumber>::max();
+ MaxDist = std::numeric_limits<CalculatedNumber>::min();
+ };
+
+ void train(SamplesBuffer &SB, size_t MaxIterations);
+ CalculatedNumber anomalyScore(SamplesBuffer &SB);
+
+private:
+ size_t NumClusters;
+
+ std::vector<DSample> ClusterCenters;
+
+ CalculatedNumber MinDist;
+ CalculatedNumber MaxDist;
+
+ std::mutex Mutex;
+};
+
+#endif /* KMEANS_H */
diff --git a/ml/kmeans/Makefile.am b/ml/kmeans/Makefile.am
new file mode 100644
index 000000000..babdcf0df
--- /dev/null
+++ b/ml/kmeans/Makefile.am
@@ -0,0 +1,4 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+AUTOMAKE_OPTIONS = subdir-objects
+MAINTAINERCLEANFILES = $(srcdir)/Makefile.in
diff --git a/ml/kmeans/SamplesBuffer.cc b/ml/kmeans/SamplesBuffer.cc
new file mode 100644
index 000000000..f8211fb54
--- /dev/null
+++ b/ml/kmeans/SamplesBuffer.cc
@@ -0,0 +1,144 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+//
+#include "SamplesBuffer.h"
+
+#include <fstream>
+#include <sstream>
+#include <string>
+
+void Sample::print(std::ostream &OS) const {
+ for (size_t Idx = 0; Idx != NumDims - 1; Idx++)
+ OS << CNs[Idx] << ", ";
+
+ OS << CNs[NumDims - 1];
+}
+
+void SamplesBuffer::print(std::ostream &OS) const {
+ for (size_t Idx = Preprocessed ? (DiffN + (SmoothN - 1) + (LagN)) : 0;
+ Idx != NumSamples; Idx++) {
+ Sample S = Preprocessed ? getPreprocessedSample(Idx) : getSample(Idx);
+ OS << S << std::endl;
+ }
+}
+
+std::vector<Sample> SamplesBuffer::getPreprocessedSamples() const {
+ std::vector<Sample> V;
+
+ for (size_t Idx = Preprocessed ? (DiffN + (SmoothN - 1) + (LagN)) : 0;
+ Idx != NumSamples; Idx++) {
+ Sample S = Preprocessed ? getPreprocessedSample(Idx) : getSample(Idx);
+ V.push_back(S);
+ }
+
+ return V;
+}
+
+void SamplesBuffer::diffSamples() {
+ // Panda's DataFrame default behaviour is to subtract each element from
+ // itself. For us `DiffN = 0` means "disable diff-ing" when preprocessing
+ // the samples buffer. This deviation will make it easier for us to test
+ // the KMeans implementation.
+ if (DiffN == 0)
+ return;
+
+ for (size_t Idx = 0; Idx != (NumSamples - DiffN); Idx++) {
+ size_t High = (NumSamples - 1) - Idx;
+ size_t Low = High - DiffN;
+
+ Sample LHS = getSample(High);
+ Sample RHS = getSample(Low);
+
+ LHS.diff(RHS);
+ }
+}
+
+void SamplesBuffer::smoothSamples() {
+ // Holds the mean value of each window
+ CalculatedNumber *AccCNs = new CalculatedNumber[NumDimsPerSample]();
+ Sample Acc(AccCNs, NumDimsPerSample);
+
+ // Used to avoid clobbering the accumulator when moving the window
+ CalculatedNumber *TmpCNs = new CalculatedNumber[NumDimsPerSample]();
+ Sample Tmp(TmpCNs, NumDimsPerSample);
+
+ CalculatedNumber Factor = (CalculatedNumber) 1 / SmoothN;
+
+ // Calculate the value of the 1st window
+ for (size_t Idx = 0; Idx != std::min(SmoothN, NumSamples); Idx++) {
+ Tmp.add(getSample(NumSamples - (Idx + 1)));
+ }
+
+ Acc.add(Tmp);
+ Acc.scale(Factor);
+
+ // Move the window and update the samples
+ for (size_t Idx = NumSamples; Idx != (DiffN + SmoothN - 1); Idx--) {
+ Sample S = getSample(Idx - 1);
+
+ // Tmp <- Next window (if any)
+ if (Idx >= (SmoothN + 1)) {
+ Tmp.diff(S);
+ Tmp.add(getSample(Idx - (SmoothN + 1)));
+ }
+
+ // S <- Acc
+ S.copy(Acc);
+
+ // Acc <- Tmp
+ Acc.copy(Tmp);
+ Acc.scale(Factor);
+ }
+
+ delete[] AccCNs;
+ delete[] TmpCNs;
+}
+
+void SamplesBuffer::lagSamples() {
+ if (LagN == 0)
+ return;
+
+ for (size_t Idx = NumSamples; Idx != LagN; Idx--) {
+ Sample PS = getPreprocessedSample(Idx - 1);
+ PS.lag(getSample(Idx - 1), LagN);
+ }
+}
+
+std::vector<DSample> SamplesBuffer::preprocess() {
+ assert(Preprocessed == false);
+
+ std::vector<DSample> DSamples;
+ size_t OutN = NumSamples;
+
+ // Diff
+ if (DiffN >= OutN)
+ return DSamples;
+ OutN -= DiffN;
+ diffSamples();
+
+ // Smooth
+ if (SmoothN == 0 || SmoothN > OutN)
+ return DSamples;
+ OutN -= (SmoothN - 1);
+ smoothSamples();
+
+ // Lag
+ if (LagN >= OutN)
+ return DSamples;
+ OutN -= LagN;
+ lagSamples();
+
+ DSamples.reserve(OutN);
+ Preprocessed = true;
+
+ for (size_t Idx = NumSamples - OutN; Idx != NumSamples; Idx++) {
+ DSample DS;
+ DS.set_size(NumDimsPerSample * (LagN + 1));
+
+ const Sample PS = getPreprocessedSample(Idx);
+ PS.initDSample(DS);
+
+ DSamples.push_back(DS);
+ }
+
+ return DSamples;
+}
diff --git a/ml/kmeans/SamplesBuffer.h b/ml/kmeans/SamplesBuffer.h
new file mode 100644
index 000000000..fccd216d5
--- /dev/null
+++ b/ml/kmeans/SamplesBuffer.h
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef SAMPLES_BUFFER_H
+#define SAMPLES_BUFFER_H
+
+#include <iostream>
+#include <vector>
+
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+
+#include <dlib/matrix.h>
+
+typedef double CalculatedNumber;
+typedef dlib::matrix<CalculatedNumber, 0, 1> DSample;
+
+class Sample {
+public:
+ Sample(CalculatedNumber *Buf, size_t N) : CNs(Buf), NumDims(N) {}
+
+ void initDSample(DSample &DS) const {
+ for (size_t Idx = 0; Idx != NumDims; Idx++)
+ DS(Idx) = CNs[Idx];
+ }
+
+ void add(const Sample &RHS) const {
+ assert(NumDims == RHS.NumDims);
+
+ for (size_t Idx = 0; Idx != NumDims; Idx++)
+ CNs[Idx] += RHS.CNs[Idx];
+ };
+
+ void diff(const Sample &RHS) const {
+ assert(NumDims == RHS.NumDims);
+
+ for (size_t Idx = 0; Idx != NumDims; Idx++)
+ CNs[Idx] -= RHS.CNs[Idx];
+ };
+
+ void copy(const Sample &RHS) const {
+ assert(NumDims == RHS.NumDims);
+
+ std::memcpy(CNs, RHS.CNs, NumDims * sizeof(CalculatedNumber));
+ }
+
+ void scale(CalculatedNumber Factor) {
+ for (size_t Idx = 0; Idx != NumDims; Idx++)
+ CNs[Idx] *= Factor;
+ }
+
+ void lag(const Sample &S, size_t LagN) {
+ size_t N = S.NumDims;
+
+ for (size_t Idx = 0; Idx != (LagN + 1); Idx++) {
+ Sample Src(S.CNs - (Idx * N), N);
+ Sample Dst(CNs + (Idx * N), N);
+ Dst.copy(Src);
+ }
+ }
+
+ const CalculatedNumber *getCalculatedNumbers() const {
+ return CNs;
+ };
+
+ void print(std::ostream &OS) const;
+
+private:
+ CalculatedNumber *CNs;
+ size_t NumDims;
+};
+
+inline std::ostream& operator<<(std::ostream &OS, const Sample &S) {
+ S.print(OS);
+ return OS;
+}
+
+class SamplesBuffer {
+public:
+ SamplesBuffer(CalculatedNumber *CNs,
+ size_t NumSamples, size_t NumDimsPerSample,
+ size_t DiffN = 1, size_t SmoothN = 3, size_t LagN = 3) :
+ CNs(CNs), NumSamples(NumSamples), NumDimsPerSample(NumDimsPerSample),
+ DiffN(DiffN), SmoothN(SmoothN), LagN(LagN),
+ BytesPerSample(NumDimsPerSample * sizeof(CalculatedNumber)),
+ Preprocessed(false) {};
+
+ std::vector<DSample> preprocess();
+ std::vector<Sample> getPreprocessedSamples() const;
+
+ size_t capacity() const { return NumSamples; }
+ void print(std::ostream &OS) const;
+
+private:
+ size_t getSampleOffset(size_t Index) const {
+ assert(Index < NumSamples);
+ return Index * NumDimsPerSample;
+ }
+
+ size_t getPreprocessedSampleOffset(size_t Index) const {
+ assert(Index < NumSamples);
+ return getSampleOffset(Index) * (LagN + 1);
+ }
+
+ void setSample(size_t Index, const Sample &S) const {
+ size_t Offset = getSampleOffset(Index);
+ std::memcpy(&CNs[Offset], S.getCalculatedNumbers(), BytesPerSample);
+ }
+
+ const Sample getSample(size_t Index) const {
+ size_t Offset = getSampleOffset(Index);
+ return Sample(&CNs[Offset], NumDimsPerSample);
+ };
+
+ const Sample getPreprocessedSample(size_t Index) const {
+ size_t Offset = getPreprocessedSampleOffset(Index);
+ return Sample(&CNs[Offset], NumDimsPerSample * (LagN + 1));
+ };
+
+ void diffSamples();
+ void smoothSamples();
+ void lagSamples();
+
+private:
+ CalculatedNumber *CNs;
+ size_t NumSamples;
+ size_t NumDimsPerSample;
+ size_t DiffN;
+ size_t SmoothN;
+ size_t LagN;
+ size_t BytesPerSample;
+ bool Preprocessed;
+};
+
+inline std::ostream& operator<<(std::ostream& OS, const SamplesBuffer &SB) {
+ SB.print(OS);
+ return OS;
+}
+
+#endif /* SAMPLES_BUFFER_H */
diff --git a/ml/kmeans/Tests.cc b/ml/kmeans/Tests.cc
new file mode 100644
index 000000000..0cb595945
--- /dev/null
+++ b/ml/kmeans/Tests.cc
@@ -0,0 +1,143 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "ml/ml-private.h"
+#include <gtest/gtest.h>
+
+/*
+ * The SamplesBuffer class implements the functionality of the following python
+ * code:
+ * >> df = pd.DataFrame(data=samples)
+ * >> df = df.diff(diff_n).dropna()
+ * >> df = df.rolling(smooth_n).mean().dropna()
+ * >> df = pd.concat([df.shift(n) for n in range(lag_n + 1)], axis=1).dropna()
+ *
+ * Its correctness has been verified by automatically generating random
+ * data frames in Python and comparing them with the correspondent preprocessed
+ * SampleBuffers.
+ *
+ * The following tests are meant to catch unintended changes in the SamplesBuffer
+ * implementation. For development purposes, one should compare changes against
+ * the aforementioned python code.
+*/
+
+TEST(SamplesBufferTest, NS_8_NDPS_1_DN_1_SN_3_LN_1) {
+ size_t NumSamples = 8, NumDimsPerSample = 1;
+ size_t DiffN = 1, SmoothN = 3, LagN = 3;
+
+ size_t N = NumSamples * NumDimsPerSample * (LagN + 1);
+ CalculatedNumber *CNs = new CalculatedNumber[N]();
+
+ CNs[0] = 0.7568336679490107;
+ CNs[1] = 0.4814406581763254;
+ CNs[2] = 0.40073555156221874;
+ CNs[3] = 0.5973257298194408;
+ CNs[4] = 0.5334727814345868;
+ CNs[5] = 0.2632477193454843;
+ CNs[6] = 0.2684839023122384;
+ CNs[7] = 0.851332948637479;
+
+ SamplesBuffer SB(CNs, NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN);
+ SB.preprocess();
+
+ std::vector<Sample> Samples = SB.getPreprocessedSamples();
+ EXPECT_EQ(Samples.size(), 2);
+
+ Sample S0 = Samples[0];
+ const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers();
+ Sample S1 = Samples[1];
+ const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers();
+
+ EXPECT_NEAR(S0_CNs[0], -0.109614, 0.001);
+ EXPECT_NEAR(S0_CNs[1], -0.0458293, 0.001);
+ EXPECT_NEAR(S0_CNs[2], 0.017344, 0.001);
+ EXPECT_NEAR(S0_CNs[3], -0.0531693, 0.001);
+
+ EXPECT_NEAR(S1_CNs[0], 0.105953, 0.001);
+ EXPECT_NEAR(S1_CNs[1], -0.109614, 0.001);
+ EXPECT_NEAR(S1_CNs[2], -0.0458293, 0.001);
+ EXPECT_NEAR(S1_CNs[3], 0.017344, 0.001);
+
+ delete[] CNs;
+}
+
+TEST(SamplesBufferTest, NS_8_NDPS_1_DN_2_SN_3_LN_2) {
+ size_t NumSamples = 8, NumDimsPerSample = 1;
+ size_t DiffN = 2, SmoothN = 3, LagN = 2;
+
+ size_t N = NumSamples * NumDimsPerSample * (LagN + 1);
+ CalculatedNumber *CNs = new CalculatedNumber[N]();
+
+ CNs[0] = 0.20511885291342846;
+ CNs[1] = 0.13151717360306558;
+ CNs[2] = 0.6017085062423134;
+ CNs[3] = 0.46256882933941545;
+ CNs[4] = 0.7887758447877941;
+ CNs[5] = 0.9237989080034406;
+ CNs[6] = 0.15552559051428083;
+ CNs[7] = 0.6309750314597955;
+
+ SamplesBuffer SB(CNs, NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN);
+ SB.preprocess();
+
+ std::vector<Sample> Samples = SB.getPreprocessedSamples();
+ EXPECT_EQ(Samples.size(), 2);
+
+ Sample S0 = Samples[0];
+ const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers();
+ Sample S1 = Samples[1];
+ const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers();
+
+ EXPECT_NEAR(S0_CNs[0], 0.005016, 0.001);
+ EXPECT_NEAR(S0_CNs[1], 0.326450, 0.001);
+ EXPECT_NEAR(S0_CNs[2], 0.304903, 0.001);
+
+ EXPECT_NEAR(S1_CNs[0], -0.154948, 0.001);
+ EXPECT_NEAR(S1_CNs[1], 0.005016, 0.001);
+ EXPECT_NEAR(S1_CNs[2], 0.326450, 0.001);
+
+ delete[] CNs;
+}
+
+TEST(SamplesBufferTest, NS_8_NDPS_3_DN_2_SN_4_LN_1) {
+ size_t NumSamples = 8, NumDimsPerSample = 3;
+ size_t DiffN = 2, SmoothN = 4, LagN = 1;
+
+ size_t N = NumSamples * NumDimsPerSample * (LagN + 1);
+ CalculatedNumber *CNs = new CalculatedNumber[N]();
+
+ CNs[0] = 0.34310900399667765; CNs[1] = 0.14694315994488194; CNs[2] = 0.8246677800938796;
+ CNs[3] = 0.48249504592307835; CNs[4] = 0.23241087965531182; CNs[5] = 0.9595348555892567;
+ CNs[6] = 0.44281094035598334; CNs[7] = 0.5143142171362715; CNs[8] = 0.06391303014242555;
+ CNs[9] = 0.7460491027783901; CNs[10] = 0.43887217459032923; CNs[11] = 0.2814395025355999;
+ CNs[12] = 0.9231114281214198; CNs[13] = 0.326882401786898; CNs[14] = 0.26747939220376216;
+ CNs[15] = 0.7787571209969636; CNs[16] =0.5851700001235088; CNs[17] = 0.34410728945321567;
+ CNs[18] = 0.9394494507088997; CNs[19] =0.17567223681734334; CNs[20] = 0.42732886195446984;
+ CNs[21] = 0.9460522396152958; CNs[22] =0.23462747016780894; CNs[23] = 0.35983249900892145;
+
+ SamplesBuffer SB(CNs, NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN);
+ SB.preprocess();
+
+ std::vector<Sample> Samples = SB.getPreprocessedSamples();
+ EXPECT_EQ(Samples.size(), 2);
+
+ Sample S0 = Samples[0];
+ const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers();
+ Sample S1 = Samples[1];
+ const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers();
+
+ EXPECT_NEAR(S0_CNs[0], 0.198225, 0.001);
+ EXPECT_NEAR(S0_CNs[1], 0.003529, 0.001);
+ EXPECT_NEAR(S0_CNs[2], -0.063003, 0.001);
+ EXPECT_NEAR(S0_CNs[3], 0.219066, 0.001);
+ EXPECT_NEAR(S0_CNs[4], 0.133175, 0.001);
+ EXPECT_NEAR(S0_CNs[5], -0.293154, 0.001);
+
+ EXPECT_NEAR(S1_CNs[0], 0.174160, 0.001);
+ EXPECT_NEAR(S1_CNs[1], -0.135722, 0.001);
+ EXPECT_NEAR(S1_CNs[2], 0.110452, 0.001);
+ EXPECT_NEAR(S1_CNs[3], 0.198225, 0.001);
+ EXPECT_NEAR(S1_CNs[4], 0.003529, 0.001);
+ EXPECT_NEAR(S1_CNs[5], -0.063003, 0.001);
+
+ delete[] CNs;
+}