summaryrefslogtreecommitdiffstats
path: root/ml
diff options
context:
space:
mode:
Diffstat (limited to 'ml')
-rw-r--r--ml/ADCharts.cc233
-rw-r--r--ml/ADCharts.h23
-rw-r--r--ml/Config.cc114
-rw-r--r--ml/Config.h51
-rw-r--r--ml/Dimension.cc173
-rw-r--r--ml/Dimension.h94
-rw-r--r--ml/Host.cc255
-rw-r--r--ml/Host.h100
-rw-r--r--ml/KMeans.cc43
-rw-r--r--ml/KMeans.h41
-rw-r--r--ml/Query.h57
-rw-r--r--ml/README.md310
-rw-r--r--ml/SamplesBuffer.cc150
-rw-r--r--ml/SamplesBuffer.h146
-rw-r--r--ml/SamplesBufferTests.cc146
-rw-r--r--ml/ml-dummy.c50
-rw-r--r--ml/ml-private.h26
-rw-r--r--ml/ml.cc165
-rw-r--r--ml/ml.h50
-rw-r--r--ml/notebooks/README.md5
-rw-r--r--ml/notebooks/netdata_anomaly_detection_deepdive.ipynb1712
21 files changed, 3944 insertions, 0 deletions
diff --git a/ml/ADCharts.cc b/ml/ADCharts.cc
new file mode 100644
index 0000000..00c593c
--- /dev/null
+++ b/ml/ADCharts.cc
@@ -0,0 +1,233 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "ADCharts.h"
+#include "Config.h"
+
+/*
+ * Refresh the "anomaly_detection.dimensions" chart of the given host.
+ *
+ * The chart and its four dimensions are created on the first call made by
+ * each thread (the handles are thread_local) and are reused afterwards.
+ * The "total" dimension is reported as normal + anomalous.
+ */
+void ml::updateDimensionsChart(RRDHOST *RH,
+                               collected_number NumTrainedDimensions,
+                               collected_number NumNormalDimensions,
+                               collected_number NumAnomalousDimensions) {
+    static thread_local RRDSET *RS = nullptr;
+    static thread_local RRDDIM *NumTotalDimensionsRD = nullptr;
+    static thread_local RRDDIM *NumTrainedDimensionsRD = nullptr;
+    static thread_local RRDDIM *NumNormalDimensionsRD = nullptr;
+    static thread_local RRDDIM *NumAnomalousDimensionsRD = nullptr;
+
+    if (RS == nullptr) {
+        // The chart id and name are built from localhost's GUID and hostname.
+        std::stringstream IdSS;
+        IdSS << "dimensions_on_" << localhost->machine_guid;
+        const std::string Id = IdSS.str();
+
+        std::stringstream NameSS;
+        NameSS << "dimensions_on_" << localhost->hostname;
+        const std::string Name = NameSS.str();
+
+        RS = rrdset_create(
+            RH,
+            "anomaly_detection",            // type
+            Id.c_str(),                     // id
+            Name.c_str(),                   // name
+            "dimensions",                   // family
+            "anomaly_detection.dimensions", // ctx
+            "Anomaly detection dimensions", // title
+            "dimensions",                   // units
+            "netdata",                      // plugin
+            "ml",                           // module
+            39183,                          // priority
+            RH->rrd_update_every,           // update_every
+            RRDSET_TYPE_LINE                // chart_type
+        );
+        rrdset_flag_set(RS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+        NumTotalDimensionsRD = rrddim_add(RS, "total", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        NumTrainedDimensionsRD = rrddim_add(RS, "trained", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        NumNormalDimensionsRD = rrddim_add(RS, "normal", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+        NumAnomalousDimensionsRD = rrddim_add(RS, "anomalous", NULL, 1, 1, RRD_ALGORITHM_ABSOLUTE);
+    }
+
+    const collected_number NumTotalDimensions = NumNormalDimensions + NumAnomalousDimensions;
+
+    rrddim_set_by_pointer(RS, NumTotalDimensionsRD, NumTotalDimensions);
+    rrddim_set_by_pointer(RS, NumTrainedDimensionsRD, NumTrainedDimensions);
+    rrddim_set_by_pointer(RS, NumNormalDimensionsRD, NumNormalDimensions);
+    rrddim_set_by_pointer(RS, NumAnomalousDimensionsRD, NumAnomalousDimensions);
+
+    rrdset_done(RS);
+}
+
+/*
+ * Store the host anomaly rate and derive the detector-events chart from it.
+ *
+ * Two charts are maintained, both created lazily once per calling thread:
+ *  - "anomaly_detection.anomaly_rate": receives AnomalyRate as-is.
+ *  - "anomaly_detection.detector_events": "above_threshold" is set when the
+ *    anomaly rate, queried back from the first chart over the last
+ *    Cfg.AnomalyDetectionQueryDuration seconds and grouped with
+ *    Cfg.AnomalyDetectionGroupingMethod, reaches
+ *    Cfg.HostAnomalyRateThreshold; "new_anomaly_event" is set only on the
+ *    transition from below to above the threshold.
+ *
+ * When the query returns no result the detector-events chart is left
+ * untouched for this iteration.
+ */
+void ml::updateHostAndDetectionRateCharts(RRDHOST *RH, collected_number AnomalyRate) {
+    static thread_local RRDSET *HostRateRS = nullptr;
+    static thread_local RRDDIM *AnomalyRateRD = nullptr;
+
+    if (HostRateRS == nullptr) {
+        std::stringstream IdSS;
+        IdSS << "anomaly_rate_on_" << localhost->machine_guid;
+        const std::string Id = IdSS.str();
+
+        std::stringstream NameSS;
+        NameSS << "anomaly_rate_on_" << localhost->hostname;
+        const std::string Name = NameSS.str();
+
+        HostRateRS = rrdset_create(
+            RH,
+            "anomaly_detection",                  // type
+            Id.c_str(),                           // id
+            Name.c_str(),                         // name
+            "anomaly_rate",                       // family
+            "anomaly_detection.anomaly_rate",     // ctx
+            "Percentage of anomalous dimensions", // title
+            "percentage",                         // units
+            "netdata",                            // plugin
+            "ml",                                 // module
+            39184,                                // priority
+            RH->rrd_update_every,                 // update_every
+            RRDSET_TYPE_LINE                      // chart_type
+        );
+        rrdset_flag_set(HostRateRS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+        AnomalyRateRD = rrddim_add(HostRateRS, "anomaly_rate", NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE);
+    }
+
+    rrddim_set_by_pointer(HostRateRS, AnomalyRateRD, AnomalyRate);
+    rrdset_done(HostRateRS);
+
+    static thread_local RRDSET *AnomalyDetectionRS = nullptr;
+    static thread_local RRDDIM *AboveThresholdRD = nullptr;
+    static thread_local RRDDIM *NewAnomalyEventRD = nullptr;
+
+    // Remembers the previous verdict so that only rising edges are flagged
+    // as new anomaly events.
+    static thread_local bool PrevAboveThreshold = false;
+
+    if (AnomalyDetectionRS == nullptr) {
+        std::stringstream IdSS;
+        IdSS << "anomaly_detection_on_" << localhost->machine_guid;
+        const std::string Id = IdSS.str();
+
+        std::stringstream NameSS;
+        NameSS << "anomaly_detection_on_" << localhost->hostname;
+        const std::string Name = NameSS.str();
+
+        AnomalyDetectionRS = rrdset_create(
+            RH,
+            "anomaly_detection",                 // type
+            Id.c_str(),                          // id
+            Name.c_str(),                        // name
+            "anomaly_detection",                 // family
+            "anomaly_detection.detector_events", // ctx
+            "Anomaly detection events",          // title
+            "percentage",                        // units
+            "netdata",                           // plugin
+            "ml",                                // module
+            39185,                               // priority
+            RH->rrd_update_every,                // update_every
+            RRDSET_TYPE_LINE                     // chart_type
+        );
+        rrdset_flag_set(AnomalyDetectionRS, RRDSET_FLAG_ANOMALY_DETECTION);
+
+        AboveThresholdRD = rrddim_add(AnomalyDetectionRS, "above_threshold", NULL,
+                                      1, 1, RRD_ALGORITHM_ABSOLUTE);
+        NewAnomalyEventRD = rrddim_add(AnomalyDetectionRS, "new_anomaly_event", NULL,
+                                       1, 1, RRD_ALGORITHM_ABSOLUTE);
+    }
+
+    // Query a single point out of the host rate chart; the window ends one
+    // update interval before "now".
+    ONEWAYALLOC *OWA = onewayalloc_create(0);
+    const time_t Now = now_realtime_sec();
+    const time_t Before = Now - RH->rrd_update_every;
+    const time_t After = Before - Cfg.AnomalyDetectionQueryDuration;
+    const RRDR_OPTIONS Options = static_cast<RRDR_OPTIONS>(0x00000000);
+
+    RRDR *R = rrd2rrdr_legacy(
+        OWA, HostRateRS,
+        1 /* points wanted */,
+        After,
+        Before,
+        Cfg.AnomalyDetectionGroupingMethod,
+        0 /* resampling time */,
+        Options, "anomaly_rate",
+        NULL /* group options */,
+        0, /* timeout */
+        0, /* tier */
+        QUERY_SOURCE_ML
+    );
+
+    if (!R) {
+        onewayalloc_destroy(OWA);
+        return;
+    }
+
+    assert(R->d == 1 && R->n == 1 && R->rows == 1);
+
+    const bool AboveThreshold = R->v[0] >= Cfg.HostAnomalyRateThreshold;
+    const bool NewAnomalyEvent = AboveThreshold && !PrevAboveThreshold;
+    PrevAboveThreshold = AboveThreshold;
+
+    rrddim_set_by_pointer(AnomalyDetectionRS, AboveThresholdRD, AboveThreshold);
+    rrddim_set_by_pointer(AnomalyDetectionRS, NewAnomalyEventRD, NewAnomalyEvent);
+    rrdset_done(AnomalyDetectionRS);
+
+    rrdr_free(OWA, R);
+    onewayalloc_destroy(OWA);
+}
+
+/*
+ * Report the CPU time consumed by the calling thread on the
+ * "netdata.prediction_stats" chart.
+ *
+ * The chart is created on the first call of each thread. User and system
+ * times come from getrusage(RUSAGE_THREAD) and are fed, in microseconds, to
+ * incremental dimensions that use a divisor of 1000.
+ */
+void ml::updateDetectionChart(RRDHOST *RH) {
+    static thread_local RRDSET *RS = nullptr;
+    static thread_local RRDDIM *UserRD = nullptr;
+    static thread_local RRDDIM *SystemRD = nullptr;
+
+    if (RS == nullptr) {
+        std::stringstream IdSS;
+        IdSS << "prediction_stats_" << RH->machine_guid;
+        const std::string Id = IdSS.str();
+
+        std::stringstream NameSS;
+        NameSS << "prediction_stats_for_" << RH->hostname;
+        const std::string Name = NameSS.str();
+
+        RS = rrdset_create_localhost(
+            "netdata",                     // type
+            Id.c_str(),                    // id
+            Name.c_str(),                  // name
+            "ml",                          // family
+            "netdata.prediction_stats",    // ctx
+            "Prediction thread CPU usage", // title
+            "milliseconds/s",              // units
+            "netdata",                     // plugin
+            "ml",                          // module
+            136000,                        // priority
+            RH->rrd_update_every,          // update_every
+            RRDSET_TYPE_STACKED            // chart_type
+        );
+
+        UserRD = rrddim_add(RS, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+        SystemRD = rrddim_add(RS, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+    }
+
+    struct rusage TRU;
+    getrusage(RUSAGE_THREAD, &TRU);
+
+    const auto UserUsec = TRU.ru_utime.tv_sec * 1000000ULL + TRU.ru_utime.tv_usec;
+    const auto SystemUsec = TRU.ru_stime.tv_sec * 1000000ULL + TRU.ru_stime.tv_usec;
+
+    rrddim_set_by_pointer(RS, UserRD, UserUsec);
+    rrddim_set_by_pointer(RS, SystemRD, SystemUsec);
+    rrdset_done(RS);
+}
+
+/*
+ * Report the CPU time of the training thread on the
+ * "netdata.training_stats" chart.
+ *
+ * Unlike updateDetectionChart(), the resource usage is not sampled here: the
+ * caller supplies it through TRU. The chart is created on the first call of
+ * each thread.
+ */
+void ml::updateTrainingChart(RRDHOST *RH, struct rusage *TRU) {
+    static thread_local RRDSET *RS = nullptr;
+    static thread_local RRDDIM *UserRD = nullptr;
+    static thread_local RRDDIM *SystemRD = nullptr;
+
+    if (RS == nullptr) {
+        std::stringstream IdSS;
+        IdSS << "training_stats_" << RH->machine_guid;
+        const std::string Id = IdSS.str();
+
+        std::stringstream NameSS;
+        NameSS << "training_stats_for_" << RH->hostname;
+        const std::string Name = NameSS.str();
+
+        RS = rrdset_create_localhost(
+            "netdata",                   // type
+            Id.c_str(),                  // id
+            Name.c_str(),                // name
+            "ml",                        // family
+            "netdata.training_stats",    // ctx
+            "Training thread CPU usage", // title
+            "milliseconds/s",            // units
+            "netdata",                   // plugin
+            "ml",                        // module
+            136001,                      // priority
+            RH->rrd_update_every,        // update_every
+            RRDSET_TYPE_STACKED          // chart_type
+        );
+
+        UserRD = rrddim_add(RS, "user", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+        SystemRD = rrddim_add(RS, "system", NULL, 1, 1000, RRD_ALGORITHM_INCREMENTAL);
+    }
+
+    const auto UserUsec = TRU->ru_utime.tv_sec * 1000000ULL + TRU->ru_utime.tv_usec;
+    const auto SystemUsec = TRU->ru_stime.tv_sec * 1000000ULL + TRU->ru_stime.tv_usec;
+
+    rrddim_set_by_pointer(RS, UserRD, UserUsec);
+    rrddim_set_by_pointer(RS, SystemRD, SystemUsec);
+    rrdset_done(RS);
+}
diff --git a/ml/ADCharts.h b/ml/ADCharts.h
new file mode 100644
index 0000000..0be324f
--- /dev/null
+++ b/ml/ADCharts.h
@@ -0,0 +1,23 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef ML_ADCHARTS_H
+#define ML_ADCHARTS_H
+
+#include "ml-private.h"
+
+namespace ml {
+
+// Updates the chart with the number of trained/normal/anomalous dimensions.
+void updateDimensionsChart(
+    RRDHOST *RH,
+    collected_number NumTrainedDimensions,
+    collected_number NumNormalDimensions,
+    collected_number NumAnomalousDimensions);
+
+// Updates the host anomaly rate chart and the detector events chart.
+void updateHostAndDetectionRateCharts(
+    RRDHOST *RH,
+    collected_number AnomalyRate);
+
+// Updates the chart with the CPU usage of the calling (prediction) thread.
+void updateDetectionChart(RRDHOST *RH);
+
+// Updates the training CPU usage chart from the supplied resource usage.
+void updateTrainingChart(RRDHOST *RH, struct rusage *TRU);
+
+} // namespace ml
+
+#endif /* ML_ADCHARTS_H */
diff --git a/ml/Config.cc b/ml/Config.cc
new file mode 100644
index 0000000..eedd8c2
--- /dev/null
+++ b/ml/Config.cc
@@ -0,0 +1,114 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "Config.h"
+#include "ml-private.h"
+
+using namespace ml;
+
+/*
+ * The single, process-wide ML configuration. It is shared by the training
+ * and the prediction threads.
+ */
+Config ml::Cfg;
+
+/*
+ * Restrict Value to the [Min, Max] range.
+ */
+template <typename T>
+static T clamp(const T& Value, const T& Min, const T& Max) {
+    const T& Capped = std::min(Value, Max);
+    return std::max(Min, Capped);
+}
+
+/*
+ * Initialize the global configuration variable (ml::Cfg).
+ *
+ * Every option is read from the ML section of the agent configuration,
+ * clamped to its supported range, validated and finally stored in Cfg. The
+ * simple patterns for the hosts/charts that must be skipped are compiled
+ * here as well.
+ */
+void Config::readMLConfig(void) {
+    const char *ConfigSectionML = CONFIG_SECTION_ML;
+
+    bool EnableAnomalyDetection = config_get_boolean(ConfigSectionML, "enabled", true);
+
+    /*
+     * Read values
+     */
+
+    unsigned MaxTrainSamples = config_get_number(ConfigSectionML, "maximum num samples to train", 4 * 3600);
+    unsigned MinTrainSamples = config_get_number(ConfigSectionML, "minimum num samples to train", 1 * 900);
+    unsigned TrainEvery = config_get_number(ConfigSectionML, "train every", 1 * 3600);
+    unsigned NumModelsToUse = config_get_number(ConfigSectionML, "number of models per dimension", 1 * 24);
+
+    unsigned DiffN = config_get_number(ConfigSectionML, "num samples to diff", 1);
+    unsigned SmoothN = config_get_number(ConfigSectionML, "num samples to smooth", 3);
+    unsigned LagN = config_get_number(ConfigSectionML, "num samples to lag", 5);
+
+    double RandomSamplingRatio = config_get_float(ConfigSectionML, "random sampling ratio", 1.0 / LagN);
+    unsigned MaxKMeansIters = config_get_number(ConfigSectionML, "maximum number of k-means iterations", 1000);
+
+    double DimensionAnomalyScoreThreshold = config_get_float(ConfigSectionML, "dimension anomaly score threshold", 0.99);
+
+    double HostAnomalyRateThreshold = config_get_float(ConfigSectionML, "host anomaly rate threshold", 1.0);
+    std::string AnomalyDetectionGroupingMethod = config_get(ConfigSectionML, "anomaly detection grouping method", "average");
+    time_t AnomalyDetectionQueryDuration = config_get_number(ConfigSectionML, "anomaly detection grouping duration", 5 * 60);
+
+    /*
+     * Clamp
+     */
+
+    MaxTrainSamples = clamp<unsigned>(MaxTrainSamples, 1 * 3600, 24 * 3600);
+    MinTrainSamples = clamp<unsigned>(MinTrainSamples, 1 * 900, 6 * 3600);
+    TrainEvery = clamp<unsigned>(TrainEvery, 1 * 3600, 6 * 3600);
+    // Clamp the value that was read for this option. Clamping TrainEvery
+    // here would silently discard the user's setting and always yield the
+    // upper bound, because TrainEvery is at least 3600.
+    NumModelsToUse = clamp<unsigned>(NumModelsToUse, 1, 7 * 24);
+
+    DiffN = clamp(DiffN, 0u, 1u);
+    SmoothN = clamp(SmoothN, 0u, 5u);
+    LagN = clamp(LagN, 1u, 5u);
+
+    RandomSamplingRatio = clamp(RandomSamplingRatio, 0.2, 1.0);
+    MaxKMeansIters = clamp(MaxKMeansIters, 500u, 1000u);
+
+    DimensionAnomalyScoreThreshold = clamp(DimensionAnomalyScoreThreshold, 0.01, 5.00);
+
+    HostAnomalyRateThreshold = clamp(HostAnomalyRateThreshold, 0.1, 10.0);
+    AnomalyDetectionQueryDuration = clamp<time_t>(AnomalyDetectionQueryDuration, 60, 15 * 60);
+
+    /*
+     * Validate
+     */
+
+    if (MinTrainSamples >= MaxTrainSamples) {
+        error("invalid min/max train samples found (%u >= %u)", MinTrainSamples, MaxTrainSamples);
+
+        // Fall back to a known-good pair of values.
+        MinTrainSamples = 1 * 3600;
+        MaxTrainSamples = 4 * 3600;
+    }
+
+    /*
+     * Assign to config instance
+     */
+
+    Cfg.EnableAnomalyDetection = EnableAnomalyDetection;
+
+    Cfg.MaxTrainSamples = MaxTrainSamples;
+    Cfg.MinTrainSamples = MinTrainSamples;
+    Cfg.TrainEvery = TrainEvery;
+    Cfg.NumModelsToUse = NumModelsToUse;
+
+    Cfg.DiffN = DiffN;
+    Cfg.SmoothN = SmoothN;
+    Cfg.LagN = LagN;
+
+    Cfg.RandomSamplingRatio = RandomSamplingRatio;
+    Cfg.MaxKMeansIters = MaxKMeansIters;
+
+    Cfg.DimensionAnomalyScoreThreshold = DimensionAnomalyScoreThreshold;
+
+    Cfg.HostAnomalyRateThreshold = HostAnomalyRateThreshold;
+    Cfg.AnomalyDetectionGroupingMethod = web_client_api_request_v1_data_group(AnomalyDetectionGroupingMethod.c_str(), RRDR_GROUPING_AVERAGE);
+    Cfg.AnomalyDetectionQueryDuration = AnomalyDetectionQueryDuration;
+
+    Cfg.HostsToSkip = config_get(ConfigSectionML, "hosts to skip from training", "!*");
+    Cfg.SP_HostsToSkip = simple_pattern_create(Cfg.HostsToSkip.c_str(), NULL, SIMPLE_PATTERN_EXACT);
+
+    // Always exclude anomaly_detection charts from training.
+    Cfg.ChartsToSkip = "anomaly_detection.* ";
+    Cfg.ChartsToSkip += config_get(ConfigSectionML, "charts to skip from training", "netdata.*");
+    // Compile the pattern from the string stored in Cfg, like the hosts
+    // pattern above, so the result does not depend on which Config instance
+    // this method was invoked on.
+    Cfg.SP_ChartsToSkip = simple_pattern_create(Cfg.ChartsToSkip.c_str(), NULL, SIMPLE_PATTERN_EXACT);
+
+    Cfg.StreamADCharts = config_get_boolean(ConfigSectionML, "stream anomaly detection charts", true);
+}
diff --git a/ml/Config.h b/ml/Config.h
new file mode 100644
index 0000000..d876d4a
--- /dev/null
+++ b/ml/Config.h
@@ -0,0 +1,51 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef ML_CONFIG_H
+#define ML_CONFIG_H
+
+#include "ml-private.h"
+
+namespace ml {
+
+/*
+ * Holds every ML related setting. The values are filled in by
+ * readMLConfig(); until then each member holds a zero/empty value, so that
+ * an instance never carries indeterminate data (DBEngineAnomalyRateEvery,
+ * for example, is not assigned by readMLConfig()).
+ */
+class Config {
+public:
+    bool EnableAnomalyDetection{false};
+
+    // Training parameters.
+    unsigned MaxTrainSamples{0};
+    unsigned MinTrainSamples{0};
+    unsigned TrainEvery{0};
+    unsigned NumModelsToUse{0};
+
+    unsigned DBEngineAnomalyRateEvery{0};
+
+    // Preprocessing parameters of the samples buffer.
+    unsigned DiffN{0};
+    unsigned SmoothN{0};
+    unsigned LagN{0};
+
+    double RandomSamplingRatio{0.0};
+    unsigned MaxKMeansIters{0};
+
+    double DimensionAnomalyScoreThreshold{0.0};
+
+    // Host level detection parameters.
+    double HostAnomalyRateThreshold{0.0};
+    RRDR_GROUPING AnomalyDetectionGroupingMethod{};
+    time_t AnomalyDetectionQueryDuration{0};
+
+    bool StreamADCharts{false};
+
+    // Hosts excluded from training: raw pattern string and compiled form.
+    std::string HostsToSkip;
+    SIMPLE_PATTERN *SP_HostsToSkip{nullptr};
+
+    // Charts excluded from training: raw pattern string and compiled form.
+    std::string ChartsToSkip;
+    SIMPLE_PATTERN *SP_ChartsToSkip{nullptr};
+
+    std::vector<uint32_t> RandomNums;
+
+    // Reads the ML section of the configuration into the global ml::Cfg.
+    void readMLConfig();
+};
+
+extern Config Cfg;
+
+} // namespace ml
+
+#endif /* ML_CONFIG_H */
diff --git a/ml/Dimension.cc b/ml/Dimension.cc
new file mode 100644
index 0000000..bf34abb
--- /dev/null
+++ b/ml/Dimension.cc
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "Config.h"
+#include "Dimension.h"
+#include "Query.h"
+
+using namespace ml;
+
+// A dimension is active unless it, or the chart it belongs to, is flagged
+// as obsolete.
+bool Dimension::isActive() const {
+    const bool SetObsolete = rrdset_flag_check(RD->rrdset, RRDSET_FLAG_OBSOLETE);
+    const bool DimObsolete = rrddim_flag_check(RD, RRDDIM_FLAG_OBSOLETE);
+
+    return !(SetObsolete || DimObsolete);
+}
+
+/*
+ * Query the database for the most recent values of this dimension.
+ *
+ * At most Cfg.MaxTrainSamples points are read. Gaps are filled with the last
+ * value that was collected, and any leading NaNs (gaps before the first
+ * collected value) are dropped.
+ *
+ * Returns { buffer, number of values }. The buffer is allocated with new[]
+ * and has room for MaxTrainSamples * (LagN + 1) values; the caller owns it
+ * and must delete[] it. Returns { nullptr, 0 } when the time window is empty
+ * or when fewer than Cfg.MinTrainSamples values were actually collected.
+ */
+std::pair<CalculatedNumber *, size_t> Dimension::getCalculatedNumbers() {
+    const size_t MinN = Cfg.MinTrainSamples;
+    const size_t MaxN = Cfg.MaxTrainSamples;
+
+    // Time window: the last MaxN iterations, aligned to update_every and
+    // restricted to what the database actually holds.
+    time_t BeforeT = now_realtime_sec() - 1;
+    time_t AfterT = BeforeT - (MaxN * updateEvery());
+
+    BeforeT -= (BeforeT % updateEvery());
+    AfterT -= (AfterT % updateEvery());
+
+    BeforeT = std::min(BeforeT, latestTime());
+    AfterT = std::max(AfterT, oldestTime());
+
+    if (AfterT >= BeforeT)
+        return { nullptr, 0 };
+
+    CalculatedNumber *CNs = new CalculatedNumber[MaxN * (Cfg.LagN + 1)]();
+
+    unsigned NumRead = 0;
+    unsigned CollectedValues = 0;
+
+    CalculatedNumber LastValue = std::numeric_limits<CalculatedNumber>::quiet_NaN();
+    Query Q = Query(getRD());
+
+    Q.init(AfterT, BeforeT);
+    for (; !Q.isFinished() && NumRead != MaxN; NumRead++) {
+        const CalculatedNumber Value = Q.nextMetric().second;
+
+        if (!netdata_double_isnumber(Value)) {
+            // Gap: repeat the last collected value (NaN if there is none yet).
+            CNs[NumRead] = LastValue;
+            continue;
+        }
+
+        CNs[NumRead] = Value;
+        LastValue = Value;
+        CollectedValues++;
+    }
+
+    if (CollectedValues < MinN) {
+        delete[] CNs;
+        return { nullptr, 0 };
+    }
+
+    // Skip the NaNs at the beginning of the buffer. At least one value was
+    // collected at this point, so the scan stops inside the buffer.
+    unsigned TotalValues = NumRead;
+    unsigned FirstValid = 0;
+    while (std::isnan(CNs[FirstValid])) {
+        FirstValid++;
+        TotalValues--;
+    }
+
+    // Shift the remaining values to the start of the buffer.
+    if (FirstValid != 0)
+        memmove(CNs, &CNs[FirstValid], sizeof(CalculatedNumber) * TotalValues);
+
+    return { CNs, TotalValues };
+}
+
+/*
+ * Train a new k-means model from the values currently stored in the
+ * database and install it as the model of this dimension.
+ *
+ * Returns MLResult::MissingData when there are not enough values to train
+ * with, MLResult::Success otherwise.
+ */
+MLResult Dimension::trainModel() {
+    const auto Numbers = getCalculatedNumbers();
+    CalculatedNumber *CNs = Numbers.first;
+    unsigned N = Numbers.second;
+
+    if (CNs == nullptr)
+        return MLResult::MissingData;
+
+    // Sample the input so that roughly the same number of samples is used,
+    // no matter how many values were fetched.
+    unsigned TargetNumSamples = Cfg.MaxTrainSamples * Cfg.RandomSamplingRatio;
+    double SamplingRatio = std::min(static_cast<double>(TargetNumSamples) / N, 1.0);
+
+    SamplesBuffer SB = SamplesBuffer(CNs, N, 1, Cfg.DiffN, Cfg.SmoothN, Cfg.LagN,
+                                     SamplingRatio, Cfg.RandomNums);
+    std::vector<DSample> Samples = SB.preprocess();
+
+    KMeans NewModel;
+    NewModel.train(Samples, Cfg.MaxKMeansIters);
+
+    // Publish the model under the lock that predict()/getModels() use.
+    {
+        std::lock_guard<std::mutex> Lock(Mutex);
+        Models[0] = NewModel;
+    }
+
+    Trained = true;
+    // predict() clears this flag once it observes a value that differs from
+    // the previous one.
+    ConstantModel = true;
+
+    delete[] CNs;
+    return MLResult::Success;
+}
+
+// Constant models are never retrained. Any other dimension is due for
+// training once Cfg.TrainEvery iterations have passed since LastTrainedAt.
+bool Dimension::shouldTrain(const TimePoint &TP) const {
+    if (ConstantModel)
+        return false;
+
+    const auto NextTrainingAt = LastTrainedAt + Seconds(Cfg.TrainEvery * updateEvery());
+    return NextTrainingAt < TP;
+}
+
+/*
+ * Feed a newly collected value to the dimension and compute its anomaly bit.
+ *
+ * Value   the value that was just collected.
+ * Exists  false when no value was collected; the window of recent values is
+ *         reset in that case.
+ *
+ * Returns the new anomaly bit. The bit is false while the window of recent
+ * values is still filling up, while the dimension is untrained or constant,
+ * when the models are locked by another thread, when no model produced a
+ * usable (non-NaN) score, and when any model scores the sample below
+ * 100 * Cfg.DimensionAnomalyScoreThreshold.
+ */
+bool Dimension::predict(CalculatedNumber Value, bool Exists) {
+    if (!Exists) {
+        CNs.clear();
+        AnomalyBit = false;
+        return false;
+    }
+
+    unsigned N = Cfg.DiffN + Cfg.SmoothN + Cfg.LagN;
+    if (CNs.size() < N) {
+        CNs.push_back(Value);
+        AnomalyBit = false;
+        return false;
+    }
+
+    // Drop the oldest value and append the new one.
+    std::rotate(std::begin(CNs), std::begin(CNs) + 1, std::end(CNs));
+
+    if (CNs[N - 1] != Value)
+        ConstantModel = false;
+
+    CNs[N - 1] = Value;
+
+    if (!isTrained() || ConstantModel) {
+        AnomalyBit = false;
+        return false;
+    }
+
+    // Zero-initialized scratch buffer for the preprocessing step. A vector
+    // releases the memory on every path, including an exception thrown by
+    // preprocess(). It is declared before SB so that it outlives it.
+    std::vector<CalculatedNumber> TmpCNs(N * (Cfg.LagN + 1));
+    std::copy_n(CNs.begin(), N, TmpCNs.begin());
+
+    SamplesBuffer SB = SamplesBuffer(TmpCNs.data(), N, 1,
+                                     Cfg.DiffN, Cfg.SmoothN, Cfg.LagN,
+                                     1.0, Cfg.RandomNums);
+    const DSample Sample = SB.preprocess().back();
+
+    // Never block the caller: if the models are being updated, skip this
+    // prediction.
+    std::unique_lock<std::mutex> Lock(Mutex, std::defer_lock);
+    if (!Lock.try_lock()) {
+        AnomalyBit = false;
+        return false;
+    }
+
+    bool HasValidScore = false;
+
+    for (const auto &KM : Models) {
+        double AnomalyScore = KM.anomalyScore(Sample);
+
+        // NaN never compares equal to anything (itself included), so it has
+        // to be detected with std::isnan(). Such a model gives no evidence
+        // and is skipped.
+        if (std::isnan(AnomalyScore))
+            continue;
+
+        HasValidScore = true;
+
+        if (AnomalyScore < (100 * Cfg.DimensionAnomalyScoreThreshold)) {
+            AnomalyBit = false;
+            return false;
+        }
+    }
+
+    // Anomalous only when at least one model scored the sample and none of
+    // them considered it normal.
+    AnomalyBit = HasValidScore;
+    return HasValidScore;
+}
+
+// Returns a copy of the models, taken while holding the dimension's mutex.
+std::array<KMeans, 1> Dimension::getModels() {
+    std::lock_guard<std::mutex> Guard(Mutex);
+    return Models;
+}
diff --git a/ml/Dimension.h b/ml/Dimension.h
new file mode 100644
index 0000000..3ec56e0
--- /dev/null
+++ b/ml/Dimension.h
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef ML_DIMENSION_H
+#define ML_DIMENSION_H
+
+#include "Query.h"
+#include "Config.h"
+
+#include "ml-private.h"
+
+namespace ml {
+
+// Result codes of the ML operations.
+enum class MLResult {
+    Success,     // the operation completed
+    MissingData, // not enough data was available
+    NaN,
+};
+
+// Builds the "<chart context>|<chart id>|<dimension name>" identifier of a
+// dimension.
+static inline std::string getMLDimensionID(RRDDIM *RD) {
+    RRDSET *Chart = RD->rrdset;
+
+    std::stringstream ID;
+    ID << rrdset_context(Chart);
+    ID << "|" << rrdset_id(Chart);
+    ID << "|" << rrddim_name(RD);
+
+    return ID.str();
+}
+
+/*
+ * ML state of a single RRDDIM: the trained models, the window of the most
+ * recent values used for prediction and the resulting anomaly bit.
+ */
+class Dimension {
+public:
+    Dimension(RRDDIM *RD)
+        : RD(RD),
+          LastTrainedAt(Seconds(0)),
+          Trained(false),
+          ConstantModel(false),
+          AnomalyScore(0.0),
+          AnomalyBit(false) {}
+
+    // The underlying netdata dimension.
+    RRDDIM *getRD() const { return RD; }
+
+    // Collection frequency of the underlying dimension.
+    unsigned updateEvery() const { return RD->update_every; }
+
+    // Time of the latest point the database reports for this dimension.
+    time_t latestTime() const { return Query(RD).latestTime(); }
+
+    // Time of the oldest point the database reports for this dimension.
+    time_t oldestTime() const { return Query(RD).oldestTime(); }
+
+    // True once a model has been trained.
+    bool isTrained() const { return Trained; }
+
+    // The anomaly bit computed by the last call to predict().
+    bool isAnomalous() const { return AnomalyBit; }
+
+    bool shouldTrain(const TimePoint &TP) const;
+
+    bool isActive() const;
+
+    MLResult trainModel();
+
+    bool predict(CalculatedNumber Value, bool Exists);
+
+    std::pair<bool, double> detect(size_t WindowLength, bool Reset);
+
+    std::array<KMeans, 1> getModels();
+
+private:
+    std::pair<CalculatedNumber *, size_t> getCalculatedNumbers();
+
+public:
+    RRDDIM *RD;
+
+    TimePoint LastTrainedAt;
+    std::atomic<bool> Trained;
+    std::atomic<bool> ConstantModel;
+
+    CalculatedNumber AnomalyScore;
+    std::atomic<bool> AnomalyBit;
+
+    // The most recent values, used as input of predict().
+    std::vector<CalculatedNumber> CNs;
+
+    // The trained models and the mutex that guards them.
+    std::array<KMeans, 1> Models;
+    std::mutex Mutex;
+};
+
+} // namespace ml
+
+#endif /* ML_DIMENSION_H */
diff --git a/ml/Host.cc b/ml/Host.cc
new file mode 100644
index 0000000..4a57178
--- /dev/null
+++ b/ml/Host.cc
@@ -0,0 +1,255 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "Config.h"
+#include "Host.h"
+#include "ADCharts.h"
+
+#include "json/single_include/nlohmann/json.hpp"
+
+using namespace ml;
+
+// Registers a dimension with the host and creates the mutex that guards it.
+void RrdHost::addDimension(Dimension *D) {
+    std::lock_guard<std::mutex> Guard(Mutex);
+
+    RRDDIM *Key = D->getRD();
+    DimensionsMap[Key] = D;
+
+    // operator[] default constructs the mutex of the dimension.
+    LocksMap[D];
+}
+
+/*
+ * Unregister a dimension from the host and delete it.
+ *
+ * The dimension is first removed from the dimensions map, so that no thread
+ * can pick it up anymore, and it is deleted only while holding its own
+ * mutex, i.e. once no other thread is holding that mutex.
+ */
+void RrdHost::removeDimension(Dimension *D) {
+    std::mutex *DimensionMutex = nullptr;
+
+    // Remove the dimension from the hosts map and look up its mutex. Both
+    // maps are guarded by the host mutex; looking the entry up outside of it
+    // would race with other threads inserting/erasing entries. The reference
+    // stays valid after the lock is released, because unordered_map never
+    // relocates its elements and only this function erases the entry.
+    {
+        std::lock_guard<std::mutex> Lock(Mutex);
+        DimensionsMap.erase(D->getRD());
+        DimensionMutex = &LocksMap[D];
+    }
+
+    // Delete the dimension by locking the mutex that protects it.
+    {
+        std::lock_guard<std::mutex> Lock(*DimensionMutex);
+        delete D;
+    }
+
+    // Remove the lock entry for the deleted dimension.
+    {
+        std::lock_guard<std::mutex> Lock(Mutex);
+        LocksMap.erase(D);
+    }
+}
+
+// Serializes the global ML configuration (ml::Cfg) into Json.
+void RrdHost::getConfigAsJson(nlohmann::json &Json) const {
+    Json["version"] = 1;
+    Json["enabled"] = Cfg.EnableAnomalyDetection;
+
+    // Training
+    Json["min-train-samples"] = Cfg.MinTrainSamples;
+    Json["max-train-samples"] = Cfg.MaxTrainSamples;
+    Json["train-every"] = Cfg.TrainEvery;
+
+    // Preprocessing
+    Json["diff-n"] = Cfg.DiffN;
+    Json["smooth-n"] = Cfg.SmoothN;
+    Json["lag-n"] = Cfg.LagN;
+
+    // K-means
+    Json["random-sampling-ratio"] = Cfg.RandomSamplingRatio;
+    Json["max-kmeans-iters"] = Cfg.MaxKMeansIters;
+
+    // Detection
+    Json["dimension-anomaly-score-threshold"] = Cfg.DimensionAnomalyScoreThreshold;
+    Json["host-anomaly-rate-threshold"] = Cfg.HostAnomalyRateThreshold;
+    Json["anomaly-detection-grouping-method"] = group_method2string(Cfg.AnomalyDetectionGroupingMethod);
+    Json["anomaly-detection-query-duration"] = Cfg.AnomalyDetectionQueryDuration;
+
+    // Exclusions
+    Json["hosts-to-skip"] = Cfg.HostsToSkip;
+    Json["charts-to-skip"] = Cfg.ChartsToSkip;
+}
+
+// Serializes the models of every dimension of the host. Each dimension is
+// stored under its ML dimension ID as an array of models.
+void TrainableHost::getModelsAsJson(nlohmann::json &Json) {
+    std::lock_guard<std::mutex> Guard(Mutex);
+
+    for (auto &Entry : DimensionsMap) {
+        Dimension *D = Entry.second;
+
+        nlohmann::json ModelsJson = nlohmann::json::array();
+        for (const KMeans &Model : D->getModels()) {
+            nlohmann::json ModelJson;
+            Model.toJson(ModelJson);
+            ModelsJson.push_back(ModelJson);
+        }
+
+        const std::string ID = getMLDimensionID(D->getRD());
+        Json[ID] = ModelsJson;
+    }
+}
+
+/*
+ * Pick the first dimension that is due for training.
+ *
+ * Returns the dimension (nullptr when none is due) together with the time
+ * allotted to a single training: the training period split evenly across
+ * all the dimensions of the host, plus one.
+ *
+ * The mutex of the returned dimension is locked before returning; it is
+ * released by trainDimension().
+ */
+std::pair<Dimension *, Duration<double>>
+TrainableHost::findDimensionToTrain(const TimePoint &NowTP) {
+    std::lock_guard<std::mutex> Guard(Mutex);
+
+    Duration<double> AllottedDuration = Duration<double>{Cfg.TrainEvery * updateEvery()} / (DimensionsMap.size() + 1);
+
+    Dimension *Candidate = nullptr;
+    for (auto &Entry : DimensionsMap) {
+        if (!Entry.second->shouldTrain(NowTP))
+            continue;
+
+        Candidate = Entry.second;
+        LocksMap[Candidate].lock();
+        break;
+    }
+
+    return { Candidate, AllottedDuration };
+}
+
+// Trains the dimension picked by findDimensionToTrain() and releases the
+// dimension mutex that was locked there. Does nothing for a null dimension.
+void TrainableHost::trainDimension(Dimension *D, const TimePoint &NowTP) {
+    if (!D)
+        return;
+
+    D->LastTrainedAt = NowTP + Seconds{D->updateEvery()};
+    D->trainModel();
+
+    // The locks map is guarded by the host mutex.
+    std::lock_guard<std::mutex> Guard(Mutex);
+    LocksMap[D].unlock();
+}
+
+/*
+ * Entry point of the training thread.
+ *
+ * On every iteration one dimension (if any is due) is trained. When the
+ * training finishes sooner than the time allotted to it, the thread sleeps
+ * for the rest of that time, capped at 10 update intervals, waking up every
+ * second to check whether the agent is exiting.
+ */
+void TrainableHost::train() {
+    const Duration<double> MaxSleepFor = Seconds{10 * updateEvery()};
+
+    worker_register("MLTRAIN");
+    worker_register_job_name(0, "dimensions");
+
+    worker_is_busy(0);
+    while (!netdata_exit) {
+        netdata_thread_testcancel();
+
+        // The thread must not be cancelled while it holds a dimension lock.
+        netdata_thread_disable_cancelability();
+
+        updateResourceUsage();
+
+        const TimePoint StartTP = SteadyClock::now();
+
+        auto Picked = findDimensionToTrain(StartTP);
+        trainDimension(Picked.first, StartTP);
+
+        netdata_thread_enable_cancelability();
+
+        const Duration<double> AllottedDuration = Picked.second;
+        const Duration<double> RealDuration = SteadyClock::now() - StartTP;
+
+        // Running behind schedule: move on to the next dimension right away.
+        if (RealDuration >= AllottedDuration)
+            continue;
+
+        worker_is_idle();
+
+        const Duration<double> SleepFor = std::min(AllottedDuration - RealDuration, MaxSleepFor);
+        TimePoint Now = SteadyClock::now();
+        const auto Until = Now + SleepFor;
+        for (; Now < Until && !netdata_exit; Now = SteadyClock::now())
+            std::this_thread::sleep_for(std::chrono::milliseconds(1000));
+
+        worker_is_busy(0);
+    }
+}
+
+#define WORKER_JOB_DETECT_DIMENSION 0 // registered as "dimensions"
+#define WORKER_JOB_UPDATE_DETECTION_CHART 1 // registered as "detection chart"
+#define WORKER_JOB_UPDATE_ANOMALY_RATES 2 // registered as "anomaly rates"
+#define WORKER_JOB_UPDATE_CHARTS 3 // registered as "charts"
+
+#if WORKER_UTILIZATION_MAX_JOB_TYPES < 5 // compile-time check of the job table size
+#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 5
+#endif
+
+/*
+ * Run a single detection step.
+ *
+ * Counts the active, trained and anomalous dimensions of the host, derives
+ * the host anomaly rate (anomalous / active) and pushes the results to the
+ * anomaly detection charts and to the training CPU usage chart.
+ */
+void DetectableHost::detectOnce() {
+    size_t Anomalous = 0;
+    size_t Trained = 0;
+    size_t Active = 0;
+
+    {
+        std::lock_guard<std::mutex> Guard(Mutex);
+
+        for (auto &Entry : DimensionsMap) {
+            worker_is_busy(WORKER_JOB_DETECT_DIMENSION);
+
+            Dimension *D = Entry.second;
+
+            // Obsolete dimensions are not accounted for.
+            if (!D->isActive())
+                continue;
+
+            Active++;
+
+            if (D->isTrained())
+                Trained++;
+
+            if (D->isAnomalous())
+                Anomalous++;
+        }
+
+        // Anomalous is non-zero only when Active is, so the division is safe.
+        HostAnomalyRate = Anomalous ? static_cast<double>(Anomalous) / Active : 0.0;
+    }
+
+    const size_t Normal = Active - Anomalous;
+
+    NumAnomalousDimensions = Anomalous;
+    NumNormalDimensions = Normal;
+    NumTrainedDimensions = Trained;
+    NumActiveDimensions = Active;
+
+    worker_is_busy(WORKER_JOB_UPDATE_CHARTS);
+    updateDimensionsChart(getRH(), Trained, Normal, Anomalous);
+    updateHostAndDetectionRateCharts(getRH(), HostAnomalyRate * 10000.0);
+
+    struct rusage TRU;
+    getResourceUsage(&TRU);
+    updateTrainingChart(getRH(), &TRU);
+}
+
+/*
+ * Entry point of the detection thread.
+ *
+ * After an initial delay of 10 seconds, a detection step is run on every
+ * heartbeat (one per update interval of the host) until the agent exits.
+ */
+void DetectableHost::detect() {
+    worker_register("MLDETECT");
+    worker_register_job_name(WORKER_JOB_DETECT_DIMENSION, "dimensions");
+    worker_register_job_name(WORKER_JOB_UPDATE_DETECTION_CHART, "detection chart");
+    worker_register_job_name(WORKER_JOB_UPDATE_ANOMALY_RATES, "anomaly rates");
+    worker_register_job_name(WORKER_JOB_UPDATE_CHARTS, "charts");
+
+    std::this_thread::sleep_for(Seconds{10});
+
+    heartbeat_t Heartbeat;
+    heartbeat_init(&Heartbeat);
+
+    for (;;) {
+        if (netdata_exit)
+            break;
+
+        netdata_thread_testcancel();
+        worker_is_idle();
+        heartbeat_next(&Heartbeat, updateEvery() * USEC_PER_SEC);
+
+        // Do not allow cancellation in the middle of a detection step.
+        netdata_thread_disable_cancelability();
+
+        detectOnce();
+
+        worker_is_busy(WORKER_JOB_UPDATE_DETECTION_CHART);
+        updateDetectionChart(getRH());
+
+        netdata_thread_enable_cancelability();
+    }
+}
+
+// Serializes the counters computed by the last detection step.
+void DetectableHost::getDetectionInfoAsJson(nlohmann::json &Json) const {
+    const size_t Total = NumAnomalousDimensions + NumNormalDimensions;
+
+    Json["version"] = 1;
+    Json["anomalous-dimensions"] = NumAnomalousDimensions;
+    Json["normal-dimensions"] = NumNormalDimensions;
+    Json["total-dimensions"] = Total;
+    Json["trained-dimensions"] = NumTrainedDimensions;
+}
+
+// Spawns the training and the detection thread of this host.
+void DetectableHost::startAnomalyDetectionThreads() {
+    TrainingThread = std::thread(&TrainableHost::train, this);
+
+    DetectionThread = std::thread(&DetectableHost::detect, this);
+}
+
+// Requests the cancellation of both threads and then waits for them to
+// finish.
+void DetectableHost::stopAnomalyDetectionThreads() {
+    // Cancel both threads first, so that they wind down concurrently.
+    netdata_thread_cancel(TrainingThread.native_handle());
+    netdata_thread_cancel(DetectionThread.native_handle());
+
+    TrainingThread.join();
+    DetectionThread.join();
+}
diff --git a/ml/Host.h b/ml/Host.h
new file mode 100644
index 0000000..52a0cd0
--- /dev/null
+++ b/ml/Host.h
@@ -0,0 +1,100 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef ML_HOST_H
+#define ML_HOST_H
+
+#include "Config.h"
+#include "Dimension.h"
+
+#include "ml-private.h"
+#include "json/single_include/nlohmann/json.hpp"
+
+namespace ml {
+
+/*
+ * Wraps a netdata RRDHOST and keeps track of the ML dimensions that belong
+ * to it.
+ */
+class RrdHost {
+public:
+    RrdHost(RRDHOST *RH) : RH(RH) {}
+
+    virtual ~RrdHost() = default;
+
+    // The underlying netdata host.
+    RRDHOST *getRH() { return RH; }
+
+    // Collection frequency of the host.
+    unsigned updateEvery() { return RH->rrd_update_every; }
+
+    // The UUID of the host in its lower-case textual form.
+    std::string getUUID() {
+        char Buf[UUID_STR_LEN];
+        uuid_unparse_lower(RH->host_uuid, Buf);
+        return std::string(Buf);
+    }
+
+    void addDimension(Dimension *D);
+    void removeDimension(Dimension *D);
+
+    void getConfigAsJson(nlohmann::json &Json) const;
+
+protected:
+    RRDHOST *RH;
+
+    // Protect dimension and lock maps
+    std::mutex Mutex;
+
+    std::unordered_map<RRDDIM *, Dimension *> DimensionsMap;
+    std::unordered_map<Dimension *, std::mutex> LocksMap;
+};
+
+/*
+ * A host whose dimensions can be trained. It also records the resource
+ * usage of the thread that calls updateResourceUsage(), so that another
+ * thread can read it through getResourceUsage().
+ */
+class TrainableHost : public RrdHost {
+public:
+    TrainableHost(RRDHOST *RH) : RrdHost(RH) {}
+
+    void train();
+
+    // Samples the resource usage of the calling thread.
+    void updateResourceUsage() {
+        std::lock_guard<std::mutex> Guard(ResourceUsageMutex);
+        getrusage(RUSAGE_THREAD, &ResourceUsage);
+    }
+
+    // Copies the last sampled resource usage into RU.
+    void getResourceUsage(struct rusage *RU) {
+        std::lock_guard<std::mutex> Guard(ResourceUsageMutex);
+        *RU = ResourceUsage;
+    }
+
+    void getModelsAsJson(nlohmann::json &Json);
+
+private:
+    std::pair<Dimension *, Duration<double>> findDimensionToTrain(const TimePoint &NowTP);
+    void trainDimension(Dimension *D, const TimePoint &NowTP);
+
+    struct rusage ResourceUsage{};
+    std::mutex ResourceUsageMutex;
+};
+
+/*
+ * A trainable host that also runs anomaly detection. It owns the training
+ * and the detection thread and keeps the counters of the last detection
+ * step.
+ */
+class DetectableHost : public TrainableHost {
+public:
+    DetectableHost(RRDHOST *RH) : TrainableHost(RH) {}
+
+    void startAnomalyDetectionThreads();
+    void stopAnomalyDetectionThreads();
+
+    void getDetectionInfoAsJson(nlohmann::json &Json) const;
+
+private:
+    // Main loop of the detection thread and its single step.
+    void detect();
+    void detectOnce();
+
+private:
+    std::thread TrainingThread;
+    std::thread DetectionThread;
+
+    // Results of the last detection step.
+    CalculatedNumber HostAnomalyRate{0.0};
+
+    size_t NumAnomalousDimensions{0};
+    size_t NumNormalDimensions{0};
+    size_t NumTrainedDimensions{0};
+    size_t NumActiveDimensions{0};
+};
+
+using Host = DetectableHost;
+
+} // namespace ml
+
+#endif /* ML_HOST_H */
diff --git a/ml/KMeans.cc b/ml/KMeans.cc
new file mode 100644
index 0000000..edc2ef4
--- /dev/null
+++ b/ml/KMeans.cc
@@ -0,0 +1,43 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "KMeans.h"
+#include <dlib/clustering.h>
+
+/*
+ * Fit the cluster centers to the given samples and record the range of the
+ * mean distances between the samples and the cluster centers.
+ *
+ * Samples        the preprocessed samples to cluster.
+ * MaxIterations  upper bound on the number of k-means iterations.
+ *
+ * MinDist/MaxDist are later used by anomalyScore() to normalize the
+ * distance of a new sample.
+ */
+void KMeans::train(const std::vector<DSample> &Samples, size_t MaxIterations) {
+    MinDist = std::numeric_limits<CalculatedNumber>::max();
+    // For floating-point types min() is the smallest *positive* value, not
+    // the most negative one; lowest() is the proper start for a running
+    // maximum. With min(), a model trained on samples that all coincide with
+    // the cluster centers ended up with MaxDist != MinDist, which bypassed
+    // the degenerate-model check of anomalyScore().
+    MaxDist = std::numeric_limits<CalculatedNumber>::lowest();
+
+    ClusterCenters.clear();
+
+    dlib::pick_initial_centers(NumClusters, ClusterCenters, Samples);
+    dlib::find_clusters_using_kmeans(Samples, ClusterCenters, MaxIterations);
+
+    for (const auto &S : Samples) {
+        CalculatedNumber MeanDist = 0.0;
+
+        for (const auto &KMCenter : ClusterCenters)
+            MeanDist += dlib::length(KMCenter - S);
+
+        MeanDist /= NumClusters;
+
+        if (MeanDist < MinDist)
+            MinDist = MeanDist;
+
+        if (MeanDist > MaxDist)
+            MaxDist = MeanDist;
+    }
+}
+
+/*
+ * Score a sample against the trained model.
+ *
+ * The mean distance of the sample from the cluster centers is normalized
+ * with the [MinDist, MaxDist] range seen during training and expressed as a
+ * percentage capped at 100. A model whose MinDist equals MaxDist always
+ * scores 0.
+ */
+CalculatedNumber KMeans::anomalyScore(const DSample &Sample) const {
+    CalculatedNumber MeanDist = 0.0;
+    for (const auto &Center : ClusterCenters)
+        MeanDist += dlib::length(Center - Sample);
+
+    MeanDist /= NumClusters;
+
+    if (MaxDist == MinDist)
+        return 0.0;
+
+    const CalculatedNumber Score = 100.0 * std::abs((MeanDist - MinDist) / (MaxDist - MinDist));
+    if (Score > 100.0)
+        return 100.0;
+
+    return Score;
+}
diff --git a/ml/KMeans.h b/ml/KMeans.h
new file mode 100644
index 0000000..0398eeb
--- /dev/null
+++ b/ml/KMeans.h
@@ -0,0 +1,41 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef KMEANS_H
+#define KMEANS_H
+
+#include <atomic>
+#include <vector>
+#include <limits>
+#include <mutex>
+
+#include "SamplesBuffer.h"
+#include "json/single_include/nlohmann/json.hpp"
+
+// k-means model used to score samples: train() fits the cluster centers and
+// the observed distance range, anomalyScore() rates a sample against them.
+class KMeans {
+public:
+    // NumClusters: number of cluster centers to fit (2 unless overridden).
+    // The distance range starts inverted (max/min) until train() fills it in.
+    KMeans(size_t NumClusters = 2)
+        : NumClusters(NumClusters),
+          MinDist(std::numeric_limits<CalculatedNumber>::max()),
+          MaxDist(std::numeric_limits<CalculatedNumber>::min()) {}
+
+    void train(const std::vector<DSample> &Samples, size_t MaxIterations);
+    CalculatedNumber anomalyScore(const DSample &Sample) const;
+
+    // Overwrites J with an object holding the cluster centers ("CCs") and the
+    // recorded distance range ("MinDist", "MaxDist").
+    void toJson(nlohmann::json &J) const {
+        J = nlohmann::json{
+            {"CCs", ClusterCenters},
+            {"MinDist", MinDist},
+            {"MaxDist", MaxDist}
+        };
+    }
+
+private:
+    // Number of clusters requested at construction.
+    size_t NumClusters;
+
+    // Centers produced by the last train() call.
+    std::vector<DSample> ClusterCenters;
+
+    // Smallest / largest mean distance seen over the training samples.
+    CalculatedNumber MinDist;
+    CalculatedNumber MaxDist;
+};
+
+#endif /* KMEANS_H */
diff --git a/ml/Query.h b/ml/Query.h
new file mode 100644
index 0000000..78d1170
--- /dev/null
+++ b/ml/Query.h
@@ -0,0 +1,57 @@
+#ifndef QUERY_H
+#define QUERY_H
+
+#include "ml-private.h"
+
+namespace ml {
+
+// Reads stored values of one dimension (RRDDIM) through the query operations
+// of its first storage tier (tiers[0]).
+//
+// Usage: construct, call init() with the time range, then call nextMetric()
+// until isFinished() reports true. If init() was called, the destructor
+// finalizes the query and reports the number of points read through
+// global_statistics_ml_query_completed().
+//
+// NOTE(review): the class is implicitly copyable; a copy taken after init()
+// would run finalize() once per copy on a copy of the same handle -- confirm
+// no caller copies an initialized Query.
+class Query {
+public:
+    // points_read starts at 0 so that nextMetric() can never increment an
+    // indeterminate counter, even if it is called before init().
+    Query(RRDDIM *RD)
+        : RD(RD), Initialized(false), points_read(0),
+          Ops(RD->tiers[0]->query_ops) {}
+
+    // Latest timestamp reported by the storage engine for this dimension.
+    time_t latestTime() {
+        return Ops->latest_time(RD->tiers[0]->db_metric_handle);
+    }
+
+    // Oldest timestamp reported by the storage engine for this dimension.
+    time_t oldestTime() {
+        return Ops->oldest_time(RD->tiers[0]->db_metric_handle);
+    }
+
+    // Starts a query over [AfterT, BeforeT] and resets the points counter.
+    // Handle is only valid after this call.
+    void init(time_t AfterT, time_t BeforeT) {
+        Ops->init(RD->tiers[0]->db_metric_handle, &Handle, AfterT, BeforeT);
+        Initialized = true;
+        points_read = 0;
+    }
+
+    // True once the storage engine reports no more points for this query.
+    bool isFinished() {
+        return Ops->is_finished(&Handle);
+    }
+
+    // Finalizes the query (only if init() ran) and publishes how many points
+    // were read.
+    ~Query() {
+        if (Initialized) {
+            Ops->finalize(&Handle);
+            global_statistics_ml_query_completed(points_read);
+            points_read = 0;
+        }
+    }
+
+    // Fetches the next stored point and returns {start time, sum / count}.
+    // NOTE(review): there is no guard for sp.count == 0; callers presumably
+    // handle the resulting non-finite value -- confirm.
+    std::pair<time_t, CalculatedNumber> nextMetric() {
+        points_read++;
+        STORAGE_POINT sp = Ops->next_metric(&Handle);
+        return { sp.start_time, sp.sum / sp.count };
+    }
+
+private:
+    RRDDIM *RD;
+    bool Initialized;
+    size_t points_read;
+
+    struct storage_engine_query_ops *Ops;
+    struct storage_engine_query_handle Handle;
+};
+
+} // namespace ml
+
+#endif /* QUERY_H */
diff --git a/ml/README.md b/ml/README.md
new file mode 100644
index 0000000..f6fd923
--- /dev/null
+++ b/ml/README.md
@@ -0,0 +1,310 @@
+<!--
+title: Configure machine learning (ML) powered anomaly detection
+custom_edit_url: https://github.com/netdata/netdata/edit/master/ml/README.md
+description: This is an in-depth look at how Netdata uses ML to detect anomalies.
+keywords: [machine learning, anomaly detection, Netdata ML]
+-->
+# Machine learning (ML) powered anomaly detection
+
+## Overview
+
+As of [`v1.32.0`](https://github.com/netdata/netdata/releases/tag/v1.32.0), Netdata comes with some ML powered [anomaly detection](https://en.wikipedia.org/wiki/Anomaly_detection) capabilities built into it and available to use out of the box, with zero configuration required (ML was enabled by default in `v1.35.0-29-nightly` in [this PR](https://github.com/netdata/netdata/pull/13158), previously it required a one line config change).
+
+🚧 **Note**: If you would like to get involved and help us with some feedback, email us at analytics-ml-team@netdata.cloud, comment on the [beta launch post](https://community.netdata.cloud/t/anomaly-advisor-beta-launch/2717) in the Netdata community, or come join us in the [🤖-ml-powered-monitoring](https://discord.gg/4eRSEUpJnc) channel of the Netdata discord.
+
+Once ML is enabled, Netdata will begin training a model for each dimension. By default this model is a [k-means clustering](https://en.wikipedia.org/wiki/K-means_clustering) model trained on the most recent 4 hours of data. Rather than just using the most recent value of each raw metric, the model works on a preprocessed ["feature vector"](#feature-vector) of recent smoothed and differenced values. This should enable the model to detect a wider range of potentially anomalous patterns in recent observations as opposed to just point anomalies like big spikes or drops. ([This infographic](https://user-images.githubusercontent.com/2178292/144414415-275a3477-5b47-43d6-8959-509eb48ebb20.png) shows some different types of anomalies.)
+
+The sections below will introduce some of the main concepts:
+- anomaly bit
+- anomaly score
+- anomaly rate
+- anomaly detector
+
+Additional explanations and details can be found in the [Glossary](#glossary) and [Notes](#notes) at the bottom of the page.
+
+### Anomaly Bit - (100 = Anomalous, 0 = Normal)
+
+Once each model is trained, Netdata will begin producing an ["anomaly score"](#anomaly-score) at each time step for each dimension. This ["anomaly score"](#anomaly-score) is essentially a distance measure to the trained cluster centers of the model (by default each model has k=2, so two cluster centers are learned). More anomalous looking data should be more distant to those cluster centers. If this ["anomaly score"](#anomaly-score) is sufficiently large, this is a sign that the recent raw values of the dimension could potentially be anomalous. By default, "sufficiently large" means that the distance is in the 99th percentile or above of all distances observed during training or, put another way, it has to be further away than the furthest 1% of the data used during training. Once this threshold is passed, the ["anomaly bit"](#anomaly-bit) corresponding to that dimension is set to 100 to flag it as anomalous, otherwise it would be left at 0 to signal normal data.
+
+What this means is that in addition to the raw value of each metric, Netdata now also stores an ["anomaly bit"](#anomaly-bit) that is either 100 (anomalous) or 0 (normal). Importantly, this is achieved without additional storage overhead due to how the anomaly bit has been implemented within the existing internal Netdata storage representation.
+
+This ["anomaly bit"](#anomaly-bit) is exposed via the `anomaly-bit` key that can be passed to the `options` param of the `/api/v1/data` REST API.
+
+For example, here are some recent raw dimension values for `system.ip` on our [london](http://london.my-netdata.io/) demo server:
+
+[`https://london.my-netdata.io/api/v1/data?chart=system.ip`](https://london.my-netdata.io/api/v1/data?chart=system.ip)
+
+```
+{
+ "labels": ["time", "received", "sent"],
+ "data":
+ [
+ [ 1638365672, 54.84098, -76.70201],
+ [ 1638365671, 124.4328, -309.7543],
+ [ 1638365670, 123.73152, -167.9056],
+ ...
+ ]
+}
+```
+
+And if we add the `&options=anomaly-bit` params, we can see the "anomaly bit" value corresponding to each raw dimension value:
+
+[`https://london.my-netdata.io/api/v1/data?chart=system.ip&options=anomaly-bit`](https://london.my-netdata.io/api/v1/data?chart=system.ip&options=anomaly-bit)
+
+```
+{
+ "labels": ["time", "received", "sent"],
+ "data":
+ [
+ [ 1638365672, 0, 0],
+ [ 1638365671, 0, 0],
+ [ 1638365670, 0, 0],
+ ...
+ ]
+}
+```
+In this example, the dimensions "received" and "sent" didn't show any abnormal behavior, so the anomaly bit is zero.
+Under normal circumstances, the anomaly bit will mostly be 0. However, there can be random fluctuations setting the anomaly to 100, although this very much depends on the nature of the dimension in question.
+
+### Anomaly Rate - average(anomaly bit)
+
+Once all models have been trained, we can think of the Netdata dashboard as essentially a big matrix or table of 0's and 100's. If we consider this "anomaly bit"-based representation of the state of the node, we can now think about how we might detect overall node level anomalies. The figure below illustrates the main ideas.
+
+```
+ dimensions
+time d1 d2 d3 d4 d5 NAR
+ 1 0 0 0 0 0 0%
+ 2 0 0 0 0 100 20%
+ 3 0 0 0 0 0 0%
+ 4 0 100 0 0 0 20%
+ 5 100 0 0 0 0 20%
+ 6 0 100 100 0 100 60%
+ 7 0 100 0 100 0 40%
+ 8 0 0 0 0 100 20%
+ 9 0 0 100 100 0 40%
+ 10 0 0 0 0 0 0%
+
+DAR 10% 30% 20% 20% 30% 22% NAR_t1-t10
+
+DAR = Dimension Anomaly Rate
+NAR = Node Anomaly Rate
+NAR_t1-t10 = Node Anomaly Rate over t1 to t10
+```
+
+To work out an ["anomaly rate"](#anomaly-rate), we can just average a row or a column in any direction. For example, if we were to just average along a row then this would be the ["node anomaly rate"](#node-anomaly-rate) (all dimensions) at time t. Likewise if we averaged a column then we would have the ["dimension anomaly rate"](#dimension-anomaly-rate) for each dimension over the time window t=1-10. Extending this idea, we can work out an overall ["anomaly rate"](#anomaly-rate) for the whole matrix or any subset of it we might be interested in.
+
+### Anomaly Detector - Node level anomaly events
+
+An ["anomaly detector"](#anomaly-detector) looks at all anomaly bits of a node. Netdata's anomaly detector produces an ["anomaly event"](#anomaly-event) when the percentage of anomaly bits is high enough for a persistent amount of time. This anomaly event signals that there was sufficient evidence among all the anomaly bits that some strange behavior might have been detected in a more global sense across the node.
+
+Essentially if the ["Node Anomaly Rate"](#node-anomaly-rate) (NAR) passes a defined threshold and stays above that threshold for a persistent amount of time, a "Node [Anomaly Event](#anomaly-event)" will be triggered.
+
+These anomaly events are currently exposed via `/api/v1/anomaly_events`
+
+**Note**: Clicking the link below will likely return an empty list of `[]`. This is the response when no anomaly events exist in the specified range. The example response below is illustrative of what the response would be when one or more anomaly events exist within the range of `after` to `before`.
+
+https://london.my-netdata.io/api/v1/anomaly_events?after=1638365182000&before=1638365602000
+
+If an event exists within the window, the result would be a list of start and end times.
+
+```
+[
+ [
+ 1638367788,
+ 1638367851
+ ]
+]
+```
+
+Information about each anomaly event can then be found at the `/api/v1/anomaly_event_info` endpoint (making sure to pass the `after` and `before` params):
+
+**Note**: If you click the below url you will get a `null` since no such anomaly event exists as the response is just an illustrative example taken from a node that did have such an anomaly event.
+
+https://london.my-netdata.io/api/v1/anomaly_event_info?after=1638367788&before=1638367851
+
+```
+[
+ [
+ 0.66,
+ "netdata.response_time|max"
+ ],
+ [
+ 0.63,
+ "netdata.response_time|average"
+ ],
+ [
+ 0.54,
+ "netdata.requests|requests"
+ ],
+ ...
+```
+
+The query returns a list of dimension anomaly rates for all dimensions that were considered part of the detected anomaly event.
+
+**Note**: We plan to build additional anomaly detection and exploration features into both Netdata Agent and Netdata Cloud. The current endpoints are still under active development to power the upcoming features.
+
+## Configuration
+
+If you are running a netdata version after `v1.35.0-29-nightly` then ML will be enabled by default.
+
+To enable or disable anomaly detection:
+1. Find and open the Netdata configuration file `netdata.conf`.
+2. In the `[ml]` section, set `enabled = yes` to enable or `enabled = no` to disable.
+3. Restart netdata (typically `sudo systemctl restart netdata`).
+
+**Note**: If you would like to learn more about configuring Netdata please see [the configuration guide](https://learn.netdata.cloud/guides/step-by-step/step-04).
+
+Below is a list of all the available configuration params and their default values.
+
+```
+[ml]
+ # enabled = yes
+ # maximum num samples to train = 14400
+ # minimum num samples to train = 3600
+ # train every = 3600
+ # dbengine anomaly rate every = 30
+ # num samples to diff = 1
+ # num samples to smooth = 3
+ # num samples to lag = 5
+ # random sampling ratio = 0.2
+ # maximum number of k-means iterations = 1000
+ # dimension anomaly score threshold = 0.99
+ # host anomaly rate threshold = 0.01000
+ # minimum window size = 30.00000
+ # maximum window size = 600.00000
+ # idle window size = 30.00000
+ # window minimum anomaly rate = 0.25000
+ # anomaly event min dimension rate threshold = 0.05000
+ # hosts to skip from training = !*
+ # charts to skip from training = netdata.*
+```
+
+### Configuration Examples
+
+If you would like to run ML on a parent instead of at the edge, some configuration options are illustrated below.
+
+This example assumes 3 child nodes [streaming](https://learn.netdata.cloud/docs/agent/streaming) to 1 parent node and illustrates the main ways you might want to configure running ML for the children on the parent, running ML on the children themselves, or even a mix of approaches.
+
+![parent_child_options](https://user-images.githubusercontent.com/2178292/164439761-8fb7dddd-c4d8-4329-9f44-9a794937a086.png)
+
+```
+# parent will run ML for itself and child 1,2, it will skip running ML for child 0.
+# child 0 will run its own ML at the edge.
+# child 1 will run its own ML at the edge, even though parent will also run ML for it, a bit wasteful potentially to run ML in both places but is possible (Netdata Cloud will essentially average any overlapping models).
+# child 2 will not run ML at the edge, it will be run in the parent only.
+
+# parent-ml-enabled
+# run ML on all hosts apart from child-0-ml-enabled
+[ml]
+ enabled = yes
+ hosts to skip from training = child-0-ml-enabled
+
+# child-0-ml-enabled
+# run ML on child-0-ml-enabled
+[ml]
+ enabled = yes
+
+# child-1-ml-enabled
+# run ML on child-1-ml-enabled
+[ml]
+ enabled = yes
+
+# child-2-ml-disabled
+# do not run ML on child-2-ml-disabled
+[ml]
+ enabled = no
+```
+
+### Descriptions (min/max)
+
+- `enabled`: `yes` to enable, `no` to disable.
+- `maximum num samples to train`: (`3600`/`86400`) This is the maximum amount of time you would like to train each model on. For example, the default of `14400` trains on the preceding 4 hours of data, assuming an `update every` of 1 second.
+- `minimum num samples to train`: (`900`/`21600`) This is the minimum amount of data required to be able to train a model. For example, the default of `3600` implies that once at least 1 hour of data is available for training (assuming an `update every` of 1 second), a model is trained, otherwise it is skipped and checked again at the next training run.
+- `train every`: (`1800`/`21600`) This is how often each model will be retrained. For example, the default of `3600` means that each model is retrained every hour. Note: The training of all models is spread out across the `train every` period for efficiency, so in reality, it means that each model will be trained in a staggered manner within each `train every` period.
+- `dbengine anomaly rate every`: (`30`/`900`) This is how often netdata will aggregate all the anomaly bits into a single chart (`anomaly_detection.anomaly_rates`). The aggregation into a single chart allows enabling anomaly rate ranking over _all_ metrics with one API call as opposed to a call per chart.
+- `num samples to diff`: (`0`/`1`) This is a `0` or `1` to determine if you want the model to operate on differences of the raw data or just the raw data. For example, the default of `1` means that we take differences of the raw values. Using differences is more general and works on dimensions that might naturally tend to have some trends or cycles in them that is normal behavior to which we don't want to be too sensitive.
+- `num samples to smooth`: (`0`/`5`) This is a small integer that controls the amount of smoothing applied as part of the feature processing used by the model. For example, the default of `3` means that the rolling average of the last 3 values is used. Smoothing like this helps the model be a little more robust to spiky types of dimensions that naturally "jump" up or down as part of their normal behavior.
+- `num samples to lag`: (`0`/`5`) This is a small integer that determines how many lagged values of the dimension to include in the feature vector. For example, the default of `5` means that in addition to the most recent (by default, differenced and smoothed) value of the dimension, the feature vector will also include the 5 previous values too. Using lagged values in our feature representation allows the model to work over strange patterns over recent values of a dimension as opposed to just focusing on if the most recent value itself is big or small enough to be anomalous.
+- `random sampling ratio`: (`0.2`/`1.0`) This parameter determines how much of the available training data is randomly sampled when training a model. The default of `0.2` means that Netdata will train on a random 20% of training data. This parameter influences cost efficiency. At `0.2` the model is still reasonably trained while minimizing system overhead costs caused by the training.
+- `maximum number of k-means iterations`: This is a parameter that can be passed to the model to limit the number of iterations in training the k-means model. In the vast majority of cases you can ignore this parameter and leave it at its default.
+- `dimension anomaly score threshold`: (`0.01`/`5.00`) This is the threshold at which an individual dimension at a specific timestep is considered anomalous or not. For example, the default of `0.99` means that a dimension with an anomaly score of 99% or higher is flagged as anomalous. This is a normalized probability based on the training data, so the default of 99% means that anything that is as strange (based on distance measure) or more strange as the most strange 1% of data observed during training will be flagged as anomalous. If you wanted to make the anomaly detection on individual dimensions more sensitive you could try a value like `0.90` (90%) or to make it less sensitive you could try `1.5` (150%).
+- `host anomaly rate threshold`: (`0.0`/`1.0`) This is the percentage of dimensions (based on all those enabled for anomaly detection) that need to be considered anomalous at specific timestep for the host itself to be considered anomalous. For example, the default value of `0.01` means that if more than 1% of dimensions are anomalous at the same time then the host itself is considered in an anomalous state.
+- `minimum window size`: The Netdata "Anomaly Detector" logic works over a rolling window of data. This parameter defines the minimum length of window to consider. If over this window the host is in an anomalous state then an anomaly detection event will be triggered. For example, the default of `30` means that the detector will initially work over a rolling window of 30 seconds. Note: The length of this window will be dynamic once an anomaly event has been triggered such that it will expand as needed until either the max length of an anomaly event is hit or the host settles back into a normal state with sufficiently decreased host level anomaly states in the rolling window. Note: If you wanted to adjust the higher level anomaly detector behavior then this is one parameter you might adjust to see the impact of on anomaly detection events.
+- `maximum window size`: This parameter defines the maximum length of window to consider. If an anomaly event reaches this size, it will be closed. This is to provide an upper bound on the length of an anomaly event and cost of the anomaly detector logic for that event.
+- `window minimum anomaly rate`: (`0.0`/`1.0`) This parameter corresponds to a threshold on the percentage of time in the rolling window that the host was considered in an anomalous state. For example, the default of `0.25` means that if the host is in an anomalous state for 25% or more of the rolling window then an anomaly event will be triggered or extended if one is already active. Note: If you want to make the anomaly detector itself less sensitive, you can adjust this value to something like `0.75` which would mean the host needs to be much more consistently in an anomalous state to trigger an anomaly detection event. Likewise, a lower value like `0.1` would make the anomaly detector more sensitive.
+- `anomaly event min dimension rate threshold`: (`0.0`/`1.0`) This is a parameter that helps filter out irrelevant dimensions from anomaly events. For example, the default of `0.05` means that only dimensions that were considered anomalous for at least 5% of the anomaly event itself will be included in that anomaly event. The idea here is to just include dimensions that were consistently anomalous as opposed to those that may have just randomly happened to be anomalous at the same time.
+- `hosts to skip from training`: This parameter allows you to turn off anomaly detection for any child hosts on a parent host by defining those you would like to skip from training here. For example, a value like `dev-*` skips all hosts on a parent that begin with the "dev-" prefix. The default value of `!*` means "don't skip any".
+- `charts to skip from training`: This parameter allows you to exclude certain charts from anomaly detection. By default, only netdata related charts are excluded. This is to avoid the scenario where accessing the netdata dashboard could itself trigger some anomalies if you don't access them regularly. If you want to include charts that are excluded by default, add them in small groups and then measure any impact on performance before adding additional ones. Example: If you want to include system, apps, and user charts: `!system.* !apps.* !user.* *`.
+
+## Charts
+
+Once enabled, the "Anomaly Detection" menu and charts will be available on the dashboard.
+
+![anomaly_detection_menu](https://user-images.githubusercontent.com/2178292/144255721-4568aabf-39c7-4855-bf1c-31b1d60e28e6.png)
+
+In terms of anomaly detection, the most interesting charts would be the `anomaly_detection.dimensions` and `anomaly_detection.anomaly_rate` ones, which hold the `anomalous` and `anomaly_rate` dimensions that show the overall number of dimensions considered anomalous at any time and the corresponding anomaly rate.
+
+- `anomaly_detection.dimensions`: Total count of dimensions considered anomalous or normal.
+- `anomaly_detection.anomaly_rate`: Percentage of anomalous dimensions.
+- `anomaly_detection.detector_window`: The length of the active window used by the detector.
+- `anomaly_detection.detector_events`: Flags (0 or 1) to show when an anomaly event has been triggered by the detector.
+
+Below is an example of how these charts may look in the presence of an anomaly event.
+
+Initially we see a jump in `anomalous` dimensions:
+
+![anomalous](https://user-images.githubusercontent.com/2178292/144256036-c89fa768-5e5f-4278-9725-c67521c0d95e.png)
+
+And a corresponding jump in the `anomaly_rate`:
+
+![anomaly_rate](https://user-images.githubusercontent.com/2178292/144256071-7d157438-31f3-4b23-a795-0fd3b2e2e85c.png)
+
+After a short while the rolling node anomaly rate goes `above_threshold`, and once it stays above threshold for long enough a `new_anomaly_event` is created:
+
+![anomaly_event](https://user-images.githubusercontent.com/2178292/144256152-910b06ec-26b8-45b4-bcb7-4c2acdf9af15.png)
+
+## Glossary
+
+#### _feature vector_
+
+A [feature vector](https://en.wikipedia.org/wiki/Feature_(machine_learning)) is what the ML model is trained on and uses for prediction. The simplest feature vector would be just the latest raw dimension value itself [x]. By default Netdata will use a feature vector consisting of the 6 latest differenced and smoothed values of the dimension, so conceptually something like `[avg3(diff1(x-5)), avg3(diff1(x-4)), avg3(diff1(x-3)), avg3(diff1(x-2)), avg3(diff1(x-1)), avg3(diff1(x))]` which ends up being just 6 floating point numbers that try and represent the "shape" of recent data.
+
+#### _anomaly score_
+
+At prediction time the anomaly score is just the distance of the most recent feature vector to the trained cluster centers of the model, which are themselves just feature vectors, albeit supposedly the best most representative feature vectors that could be "learned" from the training data. So if the most recent feature vector is very far away in terms of [euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance#:~:text=In%20mathematics%2C%20the%20Euclidean%20distance,being%20called%20the%20Pythagorean%20distance.) it's more likely that the recent data it represents consists of some strange pattern not commonly found in the training data.
+
+#### _anomaly bit_
+
+If the anomaly score is greater than a specified threshold then the most recent feature vector, and hence most recent raw data, is considered anomalous. Since storing the raw anomaly score would essentially double the amount of storage space Netdata would need, we instead efficiently store just the anomaly bit in the existing internal Netdata data representation without any additional storage overhead.
+
+#### _anomaly rate_
+
+An anomaly rate is really just an average over one or more anomaly bits. An anomaly rate can be calculated over time for one or more dimensions or at a point in time across multiple dimensions, or some combination of the two. It's just an average of some collection of anomaly bits.
+
+#### _anomaly detector_
+
+This is essentially business logic that just tries to process a collection of anomaly bits to determine if there are enough active anomaly bits to merit investigation or declaration of a node level anomaly event.
+
+#### _anomaly event_
+
+Anomaly events are triggered by the anomaly detector and represent a window of time on the node with sufficiently elevated anomaly rates across all dimensions.
+
+#### _dimension anomaly rate_
+
+The anomaly rate of a specific dimension over some window of time.
+
+#### _node anomaly rate_
+
+The anomaly rate across all dimensions of a node.
+
+## Notes
+
+- We would love to hear any feedback relating to this functionality, please email us at analytics-ml-team@netdata.cloud or come join us in the [🤖-ml-powered-monitoring](https://discord.gg/4eRSEUpJnc) channel of the Netdata discord.
+- We are working on additional UI/UX based features that build on these core components to make them as useful as possible out of the box.
+- Although not yet a core focus of this work, users could leverage the `anomaly_detection` chart dimensions and/or `anomaly-bit` options in defining alarms based on ML driven anomaly detection models.
+- [This presentation](https://docs.google.com/presentation/d/18zkCvU3nKP-Bw_nQZuXTEa4PIVM6wppH3VUnAauq-RU/edit?usp=sharing) walks through some of the main concepts covered above in a more informal way.
+- After restart Netdata will wait until `minimum num samples to train` observations of data are available before starting training and prediction.
+- Netdata uses [dlib](https://github.com/davisking/dlib) under the hood for its core ML features.
+- You should benchmark Netdata resource usage before and after enabling ML. Typical overhead ranges from 1-2% additional CPU at most.
+- The "anomaly bit" has been implemented to be a building block to underpin many more ML based use cases that we plan to deliver soon.
+- At its core Netdata uses an approach and problem formulation very similar to the Netdata python [anomalies collector](https://learn.netdata.cloud/docs/agent/collectors/python.d.plugin/anomalies), just implemented in a much much more efficient and scalable way in the agent in c++. So if you would like to learn more about the approach and are familiar with Python that is a useful resource to explore, as is the corresponding [deep dive tutorial](https://nbviewer.org/github/netdata/community/blob/main/netdata-agent-api/netdata-pandas/anomalies_collector_deepdive.ipynb) where the default model used is PCA instead of K-Means but the overall approach and formulation is similar.
diff --git a/ml/SamplesBuffer.cc b/ml/SamplesBuffer.cc
new file mode 100644
index 0000000..d276c6e
--- /dev/null
+++ b/ml/SamplesBuffer.cc
@@ -0,0 +1,150 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+//
+#include "SamplesBuffer.h"
+
+#include <fstream>
+#include <sstream>
+#include <string>
+
+// Writes the sample's values to OS as a comma-separated list ("a, b, c"),
+// with no trailing separator and no newline. An empty sample (NumDims == 0)
+// writes nothing; previously the unsigned `NumDims - 1` wrapped around and
+// the loop read outside the buffer.
+void Sample::print(std::ostream &OS) const {
+    for (size_t Idx = 0; Idx != NumDims; Idx++) {
+        // Separator goes before every element except the first.
+        if (Idx != 0)
+            OS << ", ";
+
+        OS << CNs[Idx];
+    }
+}
+
+// Streams the buffer to OS, one sample per line.
+//
+// Before preprocess() has succeeded every raw sample is printed. Afterwards
+// only the preprocessed samples are printed, starting past the leading
+// DiffN + (SmoothN - 1) + LagN positions.
+void SamplesBuffer::print(std::ostream &OS) const {
+    size_t Idx = 0;
+    if (Preprocessed)
+        Idx = DiffN + (SmoothN - 1) + (LagN);
+
+    while (Idx != NumSamples) {
+        Sample S = Preprocessed ? getPreprocessedSample(Idx) : getSample(Idx);
+        OS << S << std::endl;
+        Idx++;
+    }
+}
+
+// Collects the buffer's samples into a vector of Sample views.
+//
+// Uses the same selection as print(): all raw samples while the buffer is
+// not preprocessed, otherwise the preprocessed samples starting past the
+// leading DiffN + (SmoothN - 1) + LagN positions.
+std::vector<Sample> SamplesBuffer::getPreprocessedSamples() const {
+    std::vector<Sample> Result;
+
+    size_t Idx = 0;
+    if (Preprocessed)
+        Idx = DiffN + (SmoothN - 1) + (LagN);
+
+    while (Idx != NumSamples) {
+        Sample S = Preprocessed ? getPreprocessedSample(Idx) : getSample(Idx);
+        Result.push_back(S);
+        Idx++;
+    }
+
+    return Result;
+}
+
+// Subtracts from each sample the sample DiffN positions before it, writing
+// through the Sample objects returned by getSample(). Samples are visited
+// from the last one backwards, so each subtraction reads an earlier sample
+// that has not been differenced yet. The first DiffN samples are not touched.
+//
+// The caller must ensure DiffN <= NumSamples (preprocess() checks this).
+void SamplesBuffer::diffSamples() {
+    // With a period of 0, pandas' DataFrame diff subtracts each element from
+    // itself. Here `DiffN = 0` instead means "disable diff-ing" when
+    // preprocessing the samples buffer; this deviation makes it easier to
+    // test the KMeans implementation.
+    if (DiffN == 0)
+        return;
+
+    const size_t NumDiffs = NumSamples - DiffN;
+
+    for (size_t Step = 0; Step != NumDiffs; Step++) {
+        const size_t Newer = (NumSamples - 1) - Step;
+        const size_t Older = Newer - DiffN;
+
+        Sample Target = getSample(Newer);
+        Target.diff(getSample(Older));
+    }
+}
+
+// Replaces samples, in place, with the mean of the SmoothN-sample window
+// that ends at them. Works from the last sample backwards and stops at
+// index DiffN + SmoothN - 1.
+//
+// A running window sum (Tmp) is slid one position at a time: the sample
+// leaving the window is subtracted and the one entering it is added, so each
+// step costs O(NumDimsPerSample) regardless of SmoothN.
+//
+// The caller must ensure SmoothN != 0 (preprocess() checks this).
+void SamplesBuffer::smoothSamples() {
+    // Scratch storage is owned by vectors, so it is released on every exit
+    // path. Both buffers start zero-filled.
+
+    // Holds the mean value of the current window
+    std::vector<CalculatedNumber> AccCNs(NumDimsPerSample);
+    Sample Acc(AccCNs.data(), NumDimsPerSample);
+
+    // Holds the window sum; used to avoid clobbering the accumulator when
+    // moving the window
+    std::vector<CalculatedNumber> TmpCNs(NumDimsPerSample);
+    Sample Tmp(TmpCNs.data(), NumDimsPerSample);
+
+    const CalculatedNumber Factor = static_cast<CalculatedNumber>(1) / SmoothN;
+
+    // Calculate the value of the 1st window (the last SmoothN samples)
+    for (size_t Idx = 0; Idx != std::min(SmoothN, NumSamples); Idx++) {
+        Tmp.add(getSample(NumSamples - (Idx + 1)));
+    }
+
+    Acc.add(Tmp);
+    Acc.scale(Factor);
+
+    // Move the window and update the samples
+    for (size_t Idx = NumSamples; Idx != (DiffN + SmoothN - 1); Idx--) {
+        Sample S = getSample(Idx - 1);
+
+        // Tmp <- Next window (if any): drop S, which still holds its
+        // unsmoothed value here, and pull in the sample just before the window
+        if (Idx >= (SmoothN + 1)) {
+            Tmp.diff(S);
+            Tmp.add(getSample(Idx - (SmoothN + 1)));
+        }
+
+        // S <- Acc (mean of the window ending at S)
+        S.copy(Acc);
+
+        // Acc <- Tmp (mean of the next window)
+        Acc.copy(Tmp);
+        Acc.scale(Factor);
+    }
+}
+
+// Fills the preprocessed sample at each index from NumSamples - 1 down to
+// LagN with the raw sample at that index followed by its LagN predecessors
+// (see Sample::lag). Does nothing when LagN is 0.
+void SamplesBuffer::lagSamples() {
+    if (LagN == 0)
+        return;
+
+    size_t Idx = NumSamples;
+    while (Idx != LagN) {
+        const size_t Pos = Idx - 1;
+
+        Sample Lagged = getPreprocessedSample(Pos);
+        Lagged.lag(getSample(Pos), LagN);
+
+        Idx--;
+    }
+}
+
+// Runs the diff -> smooth -> lag pipeline over the buffer and returns the
+// resulting feature vectors, randomly subsampled according to SamplingRatio.
+//
+// Returns an empty vector when there are too few samples for any of the
+// three steps. In that case Preprocessed stays false, but steps that ran
+// before the failing check have already modified the buffer.
+// Must be called at most once per buffer (asserted).
+std::vector<DSample> SamplesBuffer::preprocess() {
+ assert(Preprocessed == false);
+
+ std::vector<DSample> DSamples;
+ // Number of samples still usable after each step
+ size_t OutN = NumSamples;
+
+ // Diff: consumes the first DiffN samples
+ if (DiffN >= OutN)
+ return DSamples;
+ OutN -= DiffN;
+ diffSamples();
+
+ // Smooth: needs a full window, consumes SmoothN - 1 more samples
+ if (SmoothN == 0 || SmoothN > OutN)
+ return DSamples;
+ OutN -= (SmoothN - 1);
+ smoothSamples();
+
+ // Lag: consumes LagN more samples
+ if (LagN >= OutN)
+ return DSamples;
+ OutN -= LagN;
+ lagSamples();
+
+ // Upper bound; fewer entries are pushed when sampling drops some
+ DSamples.reserve(OutN);
+ Preprocessed = true;
+
+ // A sample is kept when its random number is <= SamplingRatio * UINT32_MAX
+ uint32_t MaxMT = std::numeric_limits<uint32_t>::max();
+ uint32_t CutOff = static_cast<double>(MaxMT) * SamplingRatio;
+
+ // Only the last OutN samples are fully preprocessed
+ for (size_t Idx = NumSamples - OutN; Idx != NumSamples; Idx++) {
+ if (RandNums[Idx] > CutOff)
+ continue;
+
+ DSample DS;
+ // One slot per dimension for the current value plus each lagged value
+ DS.set_size(NumDimsPerSample * (LagN + 1));
+
+ const Sample PS = getPreprocessedSample(Idx);
+ PS.initDSample(DS);
+
+ DSamples.push_back(DS);
+ }
+
+ return DSamples;
+}
diff --git a/ml/SamplesBuffer.h b/ml/SamplesBuffer.h
new file mode 100644
index 0000000..1c7215c
--- /dev/null
+++ b/ml/SamplesBuffer.h
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef SAMPLES_BUFFER_H
+#define SAMPLES_BUFFER_H
+
+#include <iostream>
+#include <vector>
+
+#include <cassert>
+#include <cstdlib>
+#include <cstring>
+
+#include <dlib/matrix.h>
+
+typedef double CalculatedNumber;
+typedef dlib::matrix<CalculatedNumber, 0, 1> DSample;
+
+// Non-owning view over NumDims consecutive CalculatedNumber values.
+//
+// The view never allocates or frees memory; it only reads and writes the
+// storage it was constructed with. add(), diff() and copy() are
+// const-qualified yet write through the stored pointer: constness applies to
+// the view object, not to the values it points at.
+class Sample {
+public:
+    Sample(CalculatedNumber *Buf, size_t N) : CNs(Buf), NumDims(N) {}
+
+    // Write the absolute value of each dimension into DS(0 .. NumDims - 1).
+    // DS is indexed but not resized here.
+    void initDSample(DSample &DS) const {
+        for (size_t Dim = 0; Dim < NumDims; ++Dim)
+            DS(Dim) = std::abs(CNs[Dim]);
+    }
+
+    // Element-wise: this += RHS.
+    void add(const Sample &RHS) const {
+        assert(NumDims == RHS.NumDims);
+
+        for (size_t Dim = 0; Dim < NumDims; ++Dim)
+            CNs[Dim] += RHS.CNs[Dim];
+    }
+
+    // Element-wise: this -= RHS.
+    void diff(const Sample &RHS) const {
+        assert(NumDims == RHS.NumDims);
+
+        for (size_t Dim = 0; Dim < NumDims; ++Dim)
+            CNs[Dim] -= RHS.CNs[Dim];
+    }
+
+    // Overwrite the viewed values with those of RHS (memcpy, so the two
+    // ranges must not overlap).
+    void copy(const Sample &RHS) const {
+        assert(NumDims == RHS.NumDims);
+
+        const size_t NumBytes = NumDims * sizeof(CalculatedNumber);
+        std::memcpy(CNs, RHS.CNs, NumBytes);
+    }
+
+    // Multiply every dimension by Factor.
+    void scale(CalculatedNumber Factor) {
+        for (size_t Dim = 0; Dim < NumDims; ++Dim)
+            CNs[Dim] *= Factor;
+    }
+
+    // Fill this view with S followed by the LagN samples stored immediately
+    // before S in memory: chunk k of this view receives the S.NumDims values
+    // found k samples before S.
+    void lag(const Sample &S, size_t LagN) {
+        const size_t Width = S.NumDims;
+
+        for (size_t Step = 0; Step != (LagN + 1); ++Step) {
+            const size_t Shift = Step * Width;
+
+            const Sample Older(S.CNs - Shift, Width);
+            const Sample Slot(CNs + Shift, Width);
+            Slot.copy(Older);
+        }
+    }
+
+    // Read-only access to the viewed values.
+    const CalculatedNumber *getCalculatedNumbers() const {
+        return CNs;
+    }
+
+    void print(std::ostream &OS) const;
+
+private:
+    CalculatedNumber *CNs;
+    size_t NumDims;
+};
+
+// Stream-insertion operator for Sample: forwards to Sample::print() and
+// returns the stream so insertions can be chained.
+inline std::ostream& operator<<(std::ostream &OS, const Sample &S) {
+ S.print(OS);
+ return OS;
+}
+
+// Non-owning view over a flat array of samples plus the parameters of the
+// diff / smooth / lag preprocessing pipeline.
+//
+// Raw sample i starts at element i * NumDimsPerSample; preprocessed sample i
+// starts at element i * NumDimsPerSample * (LagN + 1) and spans
+// NumDimsPerSample * (LagN + 1) elements.
+// NOTE(review): this implies the array must hold
+// NumSamples * NumDimsPerSample * (LagN + 1) values — confirm against callers.
+class SamplesBuffer {
+public:
+    // RandNums is held by reference and must outlive this object.
+    SamplesBuffer(CalculatedNumber *CNs,
+                  size_t NumSamples, size_t NumDimsPerSample,
+                  size_t DiffN, size_t SmoothN, size_t LagN,
+                  double SamplingRatio, std::vector<uint32_t> &RandNums)
+        : CNs(CNs),
+          NumSamples(NumSamples),
+          NumDimsPerSample(NumDimsPerSample),
+          DiffN(DiffN),
+          SmoothN(SmoothN),
+          LagN(LagN),
+          SamplingRatio(SamplingRatio),
+          RandNums(RandNums),
+          BytesPerSample(NumDimsPerSample * sizeof(CalculatedNumber)),
+          Preprocessed(false) {}
+
+    std::vector<DSample> preprocess();
+    std::vector<Sample> getPreprocessedSamples() const;
+
+    // Number of samples the buffer was created with.
+    size_t capacity() const { return NumSamples; }
+    void print(std::ostream &OS) const;
+
+private:
+    // Element offset of raw sample Index inside CNs.
+    size_t getSampleOffset(size_t Index) const {
+        assert(Index < NumSamples);
+        return NumDimsPerSample * Index;
+    }
+
+    // Element offset of preprocessed sample Index inside CNs.
+    size_t getPreprocessedSampleOffset(size_t Index) const {
+        assert(Index < NumSamples);
+        return (LagN + 1) * getSampleOffset(Index);
+    }
+
+    // Copy the values of S over raw sample Index.
+    void setSample(size_t Index, const Sample &S) const {
+        CalculatedNumber *Dst = CNs + getSampleOffset(Index);
+        std::memcpy(Dst, S.getCalculatedNumbers(), BytesPerSample);
+    }
+
+    // View over raw sample Index.
+    const Sample getSample(size_t Index) const {
+        return Sample(CNs + getSampleOffset(Index), NumDimsPerSample);
+    }
+
+    // View over preprocessed sample Index (LagN + 1 samples wide).
+    const Sample getPreprocessedSample(size_t Index) const {
+        return Sample(CNs + getPreprocessedSampleOffset(Index),
+                      NumDimsPerSample * (LagN + 1));
+    }
+
+    void diffSamples();
+    void smoothSamples();
+    void lagSamples();
+
+    // Data members; declaration order matches the constructor's
+    // initialisation order.
+    CalculatedNumber *CNs;
+    size_t NumSamples;
+    size_t NumDimsPerSample;
+    size_t DiffN;
+    size_t SmoothN;
+    size_t LagN;
+    double SamplingRatio;
+    std::vector<uint32_t> &RandNums;
+
+    size_t BytesPerSample;
+    bool Preprocessed;
+};
+
+// Stream-insertion operator for SamplesBuffer: forwards to
+// SamplesBuffer::print() and returns the stream so insertions can be chained.
+inline std::ostream& operator<<(std::ostream& OS, const SamplesBuffer &SB) {
+ SB.print(OS);
+ return OS;
+}
+
+#endif /* SAMPLES_BUFFER_H */
diff --git a/ml/SamplesBufferTests.cc b/ml/SamplesBufferTests.cc
new file mode 100644
index 0000000..5997a2a
--- /dev/null
+++ b/ml/SamplesBufferTests.cc
@@ -0,0 +1,146 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "ml/ml-private.h"
+#include <gtest/gtest.h>
+
+/*
+ * The SamplesBuffer class implements the functionality of the following python
+ * code:
+ * >> df = pd.DataFrame(data=samples)
+ * >> df = df.diff(diff_n).dropna()
+ * >> df = df.rolling(smooth_n).mean().dropna()
+ * >> df = pd.concat([df.shift(n) for n in range(lag_n + 1)], axis=1).dropna()
+ *
+ * Its correctness has been verified by automatically generating random
+ * data frames in Python and comparing them with the corresponding
+ * preprocessed SamplesBuffers.
+ *
+ * The following tests are meant to catch unintended changes in the SamplesBuffer
+ * implementation. For development purposes, one should compare changes against
+ * the aforementioned python code.
+*/
+
+// 8 one-dimensional samples, diff 1, smooth 3, lag 3 -> 2 output samples of
+// LagN + 1 = 4 values each.
+// NOTE(review): the test name ends in LN_1 although the body uses LagN = 3;
+// the name is kept so existing test filters keep matching.
+TEST(SamplesBufferTest, NS_8_NDPS_1_DN_1_SN_3_LN_1) {
+    size_t NumSamples = 8, NumDimsPerSample = 1;
+    size_t DiffN = 1, SmoothN = 3, LagN = 3;
+
+    // Zero-initialised storage sized for the lagged output. Owned by a vector
+    // so it is released even when ASSERT_EQ below aborts the test.
+    std::vector<CalculatedNumber> CNs(NumSamples * NumDimsPerSample * (LagN + 1));
+
+    CNs[0] = 0.7568336679490107;
+    CNs[1] = 0.4814406581763254;
+    CNs[2] = 0.40073555156221874;
+    CNs[3] = 0.5973257298194408;
+    CNs[4] = 0.5334727814345868;
+    CNs[5] = 0.2632477193454843;
+    CNs[6] = 0.2684839023122384;
+    CNs[7] = 0.851332948637479;
+
+    std::vector<uint32_t> RandNums(NumSamples, std::numeric_limits<uint32_t>::max());
+    SamplesBuffer SB(CNs.data(), NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN, 1.0, RandNums);
+    SB.preprocess();
+
+    std::vector<Sample> Samples = SB.getPreprocessedSamples();
+    // Fatal on mismatch: the indexing below would otherwise be out of bounds.
+    ASSERT_EQ(Samples.size(), 2u);
+
+    Sample S0 = Samples[0];
+    const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers();
+    Sample S1 = Samples[1];
+    const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers();
+
+    EXPECT_NEAR(S0_CNs[0], -0.109614, 0.001);
+    EXPECT_NEAR(S0_CNs[1], -0.0458293, 0.001);
+    EXPECT_NEAR(S0_CNs[2], 0.017344, 0.001);
+    EXPECT_NEAR(S0_CNs[3], -0.0531693, 0.001);
+
+    EXPECT_NEAR(S1_CNs[0], 0.105953, 0.001);
+    EXPECT_NEAR(S1_CNs[1], -0.109614, 0.001);
+    EXPECT_NEAR(S1_CNs[2], -0.0458293, 0.001);
+    EXPECT_NEAR(S1_CNs[3], 0.017344, 0.001);
+}
+
+// 8 one-dimensional samples, diff 2, smooth 3, lag 2 -> 2 output samples of
+// LagN + 1 = 3 values each.
+TEST(SamplesBufferTest, NS_8_NDPS_1_DN_2_SN_3_LN_2) {
+    size_t NumSamples = 8, NumDimsPerSample = 1;
+    size_t DiffN = 2, SmoothN = 3, LagN = 2;
+
+    // Zero-initialised storage sized for the lagged output. Owned by a vector
+    // so it is released even when ASSERT_EQ below aborts the test.
+    std::vector<CalculatedNumber> CNs(NumSamples * NumDimsPerSample * (LagN + 1));
+
+    CNs[0] = 0.20511885291342846;
+    CNs[1] = 0.13151717360306558;
+    CNs[2] = 0.6017085062423134;
+    CNs[3] = 0.46256882933941545;
+    CNs[4] = 0.7887758447877941;
+    CNs[5] = 0.9237989080034406;
+    CNs[6] = 0.15552559051428083;
+    CNs[7] = 0.6309750314597955;
+
+    std::vector<uint32_t> RandNums(NumSamples, std::numeric_limits<uint32_t>::max());
+    SamplesBuffer SB(CNs.data(), NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN, 1.0, RandNums);
+    SB.preprocess();
+
+    std::vector<Sample> Samples = SB.getPreprocessedSamples();
+    // Fatal on mismatch: the indexing below would otherwise be out of bounds.
+    ASSERT_EQ(Samples.size(), 2u);
+
+    Sample S0 = Samples[0];
+    const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers();
+    Sample S1 = Samples[1];
+    const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers();
+
+    EXPECT_NEAR(S0_CNs[0], 0.005016, 0.001);
+    EXPECT_NEAR(S0_CNs[1], 0.326450, 0.001);
+    EXPECT_NEAR(S0_CNs[2], 0.304903, 0.001);
+
+    EXPECT_NEAR(S1_CNs[0], -0.154948, 0.001);
+    EXPECT_NEAR(S1_CNs[1], 0.005016, 0.001);
+    EXPECT_NEAR(S1_CNs[2], 0.326450, 0.001);
+}
+
+// 8 three-dimensional samples, diff 2, smooth 4, lag 1 -> 2 output samples of
+// 3 * (LagN + 1) = 6 values each.
+TEST(SamplesBufferTest, NS_8_NDPS_3_DN_2_SN_4_LN_1) {
+    size_t NumSamples = 8, NumDimsPerSample = 3;
+    size_t DiffN = 2, SmoothN = 4, LagN = 1;
+
+    // Zero-initialised storage sized for the lagged output. Owned by a vector
+    // so it is released even when ASSERT_EQ below aborts the test.
+    std::vector<CalculatedNumber> CNs(NumSamples * NumDimsPerSample * (LagN + 1));
+
+    CNs[0] = 0.34310900399667765; CNs[1] = 0.14694315994488194; CNs[2] = 0.8246677800938796;
+    CNs[3] = 0.48249504592307835; CNs[4] = 0.23241087965531182; CNs[5] = 0.9595348555892567;
+    CNs[6] = 0.44281094035598334; CNs[7] = 0.5143142171362715; CNs[8] = 0.06391303014242555;
+    CNs[9] = 0.7460491027783901; CNs[10] = 0.43887217459032923; CNs[11] = 0.2814395025355999;
+    CNs[12] = 0.9231114281214198; CNs[13] = 0.326882401786898; CNs[14] = 0.26747939220376216;
+    CNs[15] = 0.7787571209969636; CNs[16] = 0.5851700001235088; CNs[17] = 0.34410728945321567;
+    CNs[18] = 0.9394494507088997; CNs[19] = 0.17567223681734334; CNs[20] = 0.42732886195446984;
+    CNs[21] = 0.9460522396152958; CNs[22] = 0.23462747016780894; CNs[23] = 0.35983249900892145;
+
+    std::vector<uint32_t> RandNums(NumSamples, std::numeric_limits<uint32_t>::max());
+    SamplesBuffer SB(CNs.data(), NumSamples, NumDimsPerSample, DiffN, SmoothN, LagN, 1.0, RandNums);
+    SB.preprocess();
+
+    std::vector<Sample> Samples = SB.getPreprocessedSamples();
+    // Fatal on mismatch: the indexing below would otherwise be out of bounds.
+    ASSERT_EQ(Samples.size(), 2u);
+
+    Sample S0 = Samples[0];
+    const CalculatedNumber *S0_CNs = S0.getCalculatedNumbers();
+    Sample S1 = Samples[1];
+    const CalculatedNumber *S1_CNs = S1.getCalculatedNumbers();
+
+    EXPECT_NEAR(S0_CNs[0], 0.198225, 0.001);
+    EXPECT_NEAR(S0_CNs[1], 0.003529, 0.001);
+    EXPECT_NEAR(S0_CNs[2], -0.063003, 0.001);
+    EXPECT_NEAR(S0_CNs[3], 0.219066, 0.001);
+    EXPECT_NEAR(S0_CNs[4], 0.133175, 0.001);
+    EXPECT_NEAR(S0_CNs[5], -0.293154, 0.001);
+
+    EXPECT_NEAR(S1_CNs[0], 0.174160, 0.001);
+    EXPECT_NEAR(S1_CNs[1], -0.135722, 0.001);
+    EXPECT_NEAR(S1_CNs[2], 0.110452, 0.001);
+    EXPECT_NEAR(S1_CNs[3], 0.198225, 0.001);
+    EXPECT_NEAR(S1_CNs[4], 0.003529, 0.001);
+    EXPECT_NEAR(S1_CNs[5], -0.063003, 0.001);
+}
diff --git a/ml/ml-dummy.c b/ml/ml-dummy.c
new file mode 100644
index 0000000..492dfe2
--- /dev/null
+++ b/ml/ml-dummy.c
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "ml.h"
+
+#if !defined(ENABLE_ML)
+
+// Stub used when ENABLE_ML is not defined: reports that ML is unavailable.
+// Declared with (void) so that the definition is a real C prototype, in line
+// with ml_init(void) in this file.
+bool ml_capable(void) {
+    return false;
+}
+
+// Stub used when ENABLE_ML is not defined: ML is never enabled for any host.
+bool ml_enabled(RRDHOST *RH) {
+ (void) RH; // unused in the stub
+ return false;
+}
+
+// Stub used when ENABLE_ML is not defined: nothing to initialise.
+void ml_init(void) {}
+
+// Stub used when ENABLE_ML is not defined: no per-host ML state is created.
+void ml_new_host(RRDHOST *RH) { (void) RH; }
+
+// Stub used when ENABLE_ML is not defined: there is no per-host ML state to release.
+void ml_delete_host(RRDHOST *RH) { (void) RH; }
+
+// Stub used when ENABLE_ML is not defined: always returns NULL, so callers
+// must be prepared for a NULL result.
+char *ml_get_host_info(RRDHOST *RH) {
+ (void) RH; // unused in the stub
+ return NULL;
+}
+
+// Stub used when ENABLE_ML is not defined: always returns NULL.
+char *ml_get_host_runtime_info(RRDHOST *RH) {
+ (void) RH; // unused in the stub
+ return NULL;
+}
+
+// Stub used when ENABLE_ML is not defined: always returns NULL.
+char *ml_get_host_models(RRDHOST *RH) {
+ (void) RH; // unused in the stub
+ return NULL;
+}
+
+// Stub used when ENABLE_ML is not defined: no per-dimension ML state is created.
+void ml_new_dimension(RRDDIM *RD) { (void) RD; }
+
+// Stub used when ENABLE_ML is not defined: there is no per-dimension ML state to release.
+void ml_delete_dimension(RRDDIM *RD) { (void) RD; }
+
+// Stub used when ENABLE_ML is not defined: no value is ever reported as anomalous.
+bool ml_is_anomalous(RRDDIM *RD, double Value, bool Exists) {
+ (void) RD; (void) Value; (void) Exists; // all unused in the stub
+ return false;
+}
+
+// Stub used when ENABLE_ML is not defined: streaming of the anomaly detection
+// charts is never enabled. Declared with (void) so that the definition is a
+// real C prototype, in line with ml_init(void) in this file.
+bool ml_streaming_enabled(void) {
+    return false;
+}
+
+#endif
diff --git a/ml/ml-private.h b/ml/ml-private.h
new file mode 100644
index 0000000..2bd72ac
--- /dev/null
+++ b/ml/ml-private.h
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef ML_PRIVATE_H
+#define ML_PRIVATE_H
+
+#include "KMeans.h"
+#include "ml/ml.h"
+
+#include <chrono>
+#include <map>
+#include <mutex>
+#include <sstream>
+
+// Shorthand aliases for the std::chrono types used by the ml namespace.
+namespace ml {
+
+// Monotonic clock and its time points.
+using SteadyClock = std::chrono::steady_clock;
+using TimePoint = std::chrono::time_point<SteadyClock>;
+
+// Duration with representation type T and the default period (seconds),
+// e.g. Duration<double> holds fractional seconds.
+template<typename T>
+using Duration = std::chrono::duration<T>;
+
+// Whole seconds.
+using Seconds = std::chrono::seconds;
+
+} // namespace ml
+
+#endif /* ML_PRIVATE_H */
diff --git a/ml/ml.cc b/ml/ml.cc
new file mode 100644
index 0000000..1a7d6ae
--- /dev/null
+++ b/ml/ml.cc
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "Config.h"
+#include "Dimension.h"
+#include "Host.h"
+
+#include <random>
+
+using namespace ml;
+
+// Reports that this build contains the ML implementation (the stub in
+// ml-dummy.c returns false instead).
+bool ml_capable() {
+ return true;
+}
+
+// ML runs for a host only when anomaly detection is enabled in the config and
+// the host's name does not match the configured "hosts to skip" pattern. The
+// pattern is not evaluated when anomaly detection is disabled.
+bool ml_enabled(RRDHOST *RH) {
+    return Cfg.EnableAnomalyDetection &&
+           !simple_pattern_matches(Cfg.SP_HostsToSkip, rrdhost_hostname(RH));
+}
+
+/*
+ * Assumptions:
+ * 1) hosts outlive their sets, and sets outlive their dimensions,
+ * 2) dimensions always have a set that has a host.
+ */
+
+// Load the ML configuration and, when anomaly detection is enabled, append
+// Cfg.MaxTrainSamples random 32-bit numbers to Cfg.RandomNums. They are used
+// to sample the features fed to KMeans clustering.
+void ml_init(void) {
+    // Read config values
+    Cfg.readMLConfig();
+
+    if (Cfg.EnableAnomalyDetection) {
+        std::random_device Seed;
+        std::mt19937 Engine(Seed());
+
+        const size_t Wanted = Cfg.MaxTrainSamples;
+        Cfg.RandomNums.reserve(Wanted);
+        for (size_t Left = Wanted; Left != 0; --Left)
+            Cfg.RandomNums.push_back(Engine());
+    }
+}
+
+// Create the ML state for a host, store it in RH->ml_host and start its
+// anomaly detection threads. Hosts for which ML is not enabled are left
+// untouched.
+void ml_new_host(RRDHOST *RH) {
+    if (ml_enabled(RH)) {
+        Host *H = new Host(RH);
+        RH->ml_host = static_cast<ml_host_t>(H);
+
+        H->startAnomalyDetectionThreads();
+    }
+}
+
+// Stop the host's anomaly detection threads, destroy its ML state and clear
+// RH->ml_host. Does nothing for hosts without ML state.
+void ml_delete_host(RRDHOST *RH) {
+    Host *H = static_cast<Host *>(RH->ml_host);
+
+    if (H) {
+        H->stopAnomalyDetectionThreads();
+
+        delete H;
+        RH->ml_host = nullptr;
+    }
+}
+
+// Create ML state for a dimension and register it with its host.
+//
+// The dimension is ignored when its host has no ML state, when its
+// update_every differs from the host's, or when its chart name matches the
+// configured "charts to skip" pattern.
+void ml_new_dimension(RRDDIM *RD) {
+    RRDSET *RS = RD->rrdset;
+
+    Host *H = static_cast<Host *>(RS->rrdhost->ml_host);
+    if (!H)
+        return;
+
+    const bool SameUpdateEvery =
+        static_cast<unsigned>(RD->update_every) == H->updateEvery();
+    if (!SameUpdateEvery ||
+        simple_pattern_matches(Cfg.SP_ChartsToSkip, rrdset_name(RS)))
+        return;
+
+    Dimension *D = new Dimension(RD);
+    RD->ml_dimension = static_cast<ml_dimension_t>(D);
+    H->addDimension(D);
+}
+
+// Release the ML state of a dimension and clear RD->ml_dimension.
+//
+// When the host still has ML state, the dimension is handed to
+// Host::removeDimension() (presumably that also destroys it — confirm in
+// Host.cc); otherwise it is deleted here.
+void ml_delete_dimension(RRDDIM *RD) {
+    Dimension *D = static_cast<Dimension *>(RD->ml_dimension);
+    if (!D)
+        return;
+
+    if (Host *H = static_cast<Host *>(RD->rrdset->rrdhost->ml_host))
+        H->removeDimension(D);
+    else
+        delete D;
+
+    RD->ml_dimension = nullptr;
+}
+
+// Return the host's ML configuration as a newly allocated JSON string.
+// Hosts without ML state (or a null RH) yield {"enabled": false}.
+//
+// NOTE(review): the string is allocated with strdupz(), whereas
+// ml_get_host_runtime_info() and ml_get_host_models() use strdup(); confirm
+// that callers free each result with the matching deallocator.
+char *ml_get_host_info(RRDHOST *RH) {
+    nlohmann::json ConfigJson;
+
+    Host *H = RH ? static_cast<Host *>(RH->ml_host) : nullptr;
+    if (H)
+        H->getConfigAsJson(ConfigJson);
+    else
+        ConfigJson["enabled"] = false;
+
+    return strdupz(ConfigJson.dump(2, '\t').c_str());
+}
+
+// Return the host's detection info as a JSON string allocated with strdup(),
+// or nullptr when RH is null or the host has no ML state.
+char *ml_get_host_runtime_info(RRDHOST *RH) {
+    if (!RH || !RH->ml_host)
+        return nullptr;
+
+    nlohmann::json ConfigJson;
+    static_cast<Host *>(RH->ml_host)->getDetectionInfoAsJson(ConfigJson);
+
+    return strdup(ConfigJson.dump(1, '\t').c_str());
+}
+
+// Return the host's models as a JSON string allocated with strdup(), or
+// nullptr when RH is null or the host has no ML state.
+char *ml_get_host_models(RRDHOST *RH) {
+    nlohmann::json ModelsJson;
+
+    if (!RH || !RH->ml_host)
+        return nullptr;
+
+    Host *H = static_cast<Host *>(RH->ml_host);
+    H->getModelsAsJson(ModelsJson);
+
+    return strdup(ModelsJson.dump(2, '\t').c_str());
+}
+
+// Ask the dimension's ML state whether Value is anomalous. Dimensions without
+// ML state are never reported as anomalous.
+bool ml_is_anomalous(RRDDIM *RD, double Value, bool Exists) {
+    Dimension *D = static_cast<Dimension *>(RD->ml_dimension);
+
+    return D != nullptr && D->predict(Value, Exists);
+}
+
+// Returns the Cfg.StreamADCharts configuration flag.
+bool ml_streaming_enabled() {
+ return Cfg.StreamADCharts;
+}
+
+#if defined(ENABLE_ML_TESTS)
+
+#include "gtest/gtest.h"
+
+// Entry point for the ML unit tests: hands the command line to GoogleTest and
+// runs every registered test, returning the result of RUN_ALL_TESTS().
+int test_ml(int argc, char *argv[]) {
+    ::testing::InitGoogleTest(&argc, argv);
+
+    const int Status = RUN_ALL_TESTS();
+    return Status;
+}
+
+#endif // ENABLE_ML_TESTS
+
+#include "ml-private.h"
diff --git a/ml/ml.h b/ml/ml.h
new file mode 100644
index 0000000..8e62c49
--- /dev/null
+++ b/ml/ml.h
@@ -0,0 +1,50 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#ifndef NETDATA_ML_H
+#define NETDATA_ML_H
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include "daemon/common.h"
+#include "web/api/queries/rrdr.h"
+
+// This is a DBEngine function redeclared here so that we can free
+// the anomaly rate dimension, whenever its backing dimension is freed.
+void rrddim_free(RRDSET *st, RRDDIM *rd);
+
+// Opaque handles to the per-host and per-dimension ML state.
+typedef void* ml_host_t;
+typedef void* ml_dimension_t;
+
+// True when the agent was built with ML support. Parameterless functions are
+// declared with (void) so that C callers get a real prototype.
+bool ml_capable(void);
+
+// True when ML should run for the given host.
+bool ml_enabled(RRDHOST *RH);
+
+// Read the ML configuration; call before any other ml_* function.
+void ml_init(void);
+
+// Create / destroy the ML state attached to a host.
+void ml_new_host(RRDHOST *RH);
+void ml_delete_host(RRDHOST *RH);
+
+// Return a newly allocated JSON string describing the host, or NULL.
+// NOTE(review): the ML build allocates host info with strdupz() and the other
+// two with strdup(); confirm callers free each with the matching deallocator.
+char *ml_get_host_info(RRDHOST *RH);
+char *ml_get_host_runtime_info(RRDHOST *RH);
+char *ml_get_host_models(RRDHOST *RH);
+
+// Create / destroy the ML state attached to a dimension.
+void ml_new_dimension(RRDDIM *RD);
+void ml_delete_dimension(RRDDIM *RD);
+
+// True when the dimension's ML state classifies the value as anomalous.
+bool ml_is_anomalous(RRDDIM *RD, double value, bool exists);
+
+// True when the anomaly detection charts should be streamed.
+bool ml_streaming_enabled(void);
+
+#define ML_ANOMALY_RATES_CHART_ID "anomaly_detection.anomaly_rates"
+
+#if defined(ENABLE_ML_TESTS)
+int test_ml(int argc, char *argv[]);
+#endif
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif /* NETDATA_ML_H */
diff --git a/ml/notebooks/README.md b/ml/notebooks/README.md
new file mode 100644
index 0000000..5e9db6d
--- /dev/null
+++ b/ml/notebooks/README.md
@@ -0,0 +1,5 @@
+## Machine Learning Notebooks
+
+This folder is a home for any documentation supporting machine learning related notebooks.
+
+- [Netdata anomaly detection deepdive](netdata_anomaly_detection_deepdive.ipynb): This is a starter notebook to help users understand how anomaly detection works in the Netdata agent and go a little deeper if they want. \ No newline at end of file
diff --git a/ml/notebooks/netdata_anomaly_detection_deepdive.ipynb b/ml/notebooks/netdata_anomaly_detection_deepdive.ipynb
new file mode 100644
index 0000000..8d0c0c7
--- /dev/null
+++ b/ml/notebooks/netdata_anomaly_detection_deepdive.ipynb
@@ -0,0 +1,1712 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "## Netdata Anomaly Detection Deepdive"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/netdata/netdata/blob/master/ml/notebooks/netdata_anomaly_detection_deepdive.ipynb)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "This notebook will walk through a simplified python based implementation of the C & C++ code in [`netdata/netdata/ml/`](https://github.com/netdata/netdata/tree/master/ml) used to power the [anomaly detection capabilities](https://github.com/netdata/netdata/blob/master/ml/README.md) of the Netdata agent.\n",
+ "\n",
+ "The main goal here is to help interested users learn more about how the machine learning works under the hood. If you just want to get started by enabling ml on your agent you can check out these [simple configuration steps](https://learn.netdata.cloud/docs/agent/ml#configuration). \n",
+ "\n",
+ "🚧 **Note**: This functionality is still under active development and considered experimental. Changes might cause the feature to break. We dogfood it internally and among early adopters within the Netdata community to build the feature. If you would like to get involved and help us with some feedback, email us at analytics-ml-team@netdata.cloud or come join us in the [🤖-ml-powered-monitoring](https://discord.gg/4eRSEUpJnc) channel of the Netdata discord. Alternatively, if GitHub is more of your thing, feel free to create a [GitHub discussion](https://github.com/netdata/netdata/discussions?discussions_q=label%3Aarea%2Fml).\n",
+ "\n",
+ "In this notebook we will:\n",
+ "\n",
+ "1. [**Get raw data**](#get-raw-data): Pull some recent data from one of our demo agents.\n",
+ "2. [**Add some anomalous data**](#add-some-anomalous-data): Be evil and mess up the tail end of the data to make it obviously \"anomalous\".\n",
+ "3. [**Lets do some ML!**](#lets-do-some-ml): Implement an unsupervised clustering based approach to anomaly detection.\n",
+ "4. [**Lets visualize all this!**](#lets-visualize-all-this): Plot and explore all this visually.\n",
+ "5. [**So, how does it _actually_ work?**](#so-how-does-it-actually-work): Dig a little deeper on what's going on under the hood."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "### Imports & Helper Functions"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {
+ "pycharm": {
+ "name": "#%% md\n"
+ }
+ },
+ "source": [
+ "Uncomment and run the next cell to install [netdata-pandas](https://github.com/netdata/netdata-pandas) which we will use to easily pull data from the [Netdata agent REST API](https://learn.netdata.cloud/docs/agent/web/api) into a nice clean [Pandas](https://pandas.pydata.org/) [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) where it will be easier to work with. \n",
+ "\n",
+ "Once you have [netdata-pandas](https://github.com/netdata/netdata-pandas) installed you can comment it back out and rerun the cell to clear the output."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "id": "aL4gm-jUffEx",
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "# uncomment the line below (when running in google colab) to install the netdata-pandas library, comment it again when done.\n",
+ "#!pip install netdata-pandas"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "id": "EMZBHjG4mOQh",
+ "pycharm": {
+ "is_executing": true
+ }
+ },
+ "outputs": [],
+ "source": [
+ "from datetime import datetime, timedelta\n",
+ "import itertools\n",
+ "import random\n",
+ "import pandas as pd\n",
+ "import numpy as np\n",
+ "import matplotlib.pyplot as plt\n",
+ "import seaborn as sns\n",
+ "import matplotlib.patches as mpatches\n",
+ "from sklearn.cluster import KMeans\n",
+ "from scipy.spatial.distance import cdist\n",
+ "from netdata_pandas.data import get_data\n",
+ "\n",
+ "# helper functions\n",
+ "\n",
+ "\n",
+ "def preprocess_df(df, lags_n, diffs_n, smooth_n):\n",
+ " \"\"\"Given a pandas dataframe preprocess it to take differences, add smoothing, lags and abs values. \n",
+ " \"\"\"\n",
+ " if diffs_n >= 1:\n",
+ " # take differences\n",
+ " df = df.diff(diffs_n).dropna()\n",
+ " if smooth_n >= 2:\n",
+ " # apply a rolling average to smooth out the data a bit\n",
+ " df = df.rolling(smooth_n).mean().dropna()\n",
+ " if lags_n >= 1:\n",
+ " # for each dimension add a new columns for each of lags_n lags of the differenced and smoothed values for that dimension\n",
+ " df_columns_new = [f'{col}_lag{n}' for n in range(lags_n+1) for col in df.columns]\n",
+ " df = pd.concat([df.shift(n) for n in range(lags_n + 1)], axis=1).dropna()\n",
+ " df.columns = df_columns_new\n",
+ " # sort columns to have lagged values next to each other for clarity when looking at the feature vectors\n",
+ " df = df.reindex(sorted(df.columns), axis=1)\n",
+ " \n",
+ " # take absolute values as last step\n",
+ " df = abs(df)\n",
+ " \n",
+ " return df\n",
+ "\n",
+ "\n",
+ "def add_shading_to_plot(ax, a, b, t, c='y', alpha=0.2):\n",
+ " \"\"\"Helper function to add shading to plot and add legend item.\n",
+ " \"\"\"\n",
+ " plt.axvspan(a, b, color=c, alpha=alpha, lw=0)\n",
+ " handles, labels = ax.get_legend_handles_labels()\n",
+ " patch = mpatches.Patch(color=c, label=t, alpha=alpha)\n",
+ " handles.append(patch) \n",
+ " plt.legend(handles=handles)\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Inputs & Parameters"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "A full list of all the anomaly detection configuration parameters, and descriptions of each, can be found in the [configuration](https://github.com/netdata/netdata/blob/master/ml/README.md#configuration) section of the [ml readme](https://github.com/netdata/netdata/blob/master/ml/README.md).\n",
+ "\n",
+ "Below we will focus on some basic params to decide what data to pull and the main ml params of importance in understanding how it all works.\n",
+ "\n",
+ "#### training size/scheduling parameters:\n",
+ "- `train_every`: How often to train or retrain each model.\n",
+ "- `num_samples_to_train`: How much of the recent data to train on, for example 3600 would mean training on the last 1 hour of raw data. The default in the netdata agent currently is 14400, so last 4 hours.\n",
+ "\n",
+ "#### feature preprocessing related parameters:\n",
+ "- `num_samples_to_diff`: This is really just a 1 or 0 flag to turn on or off differencing in the feature preprocessing. It defaults to 1 (to take differences) and generally should be left alone.\n",
+ "- `num_samples_to_smooth`: The extent of smoothing (averaging) applied as part of feature preprocessing.\n",
+ "- `num_samples_to_lag`: The number of previous values to also include in our feature vector.\n",
+ "\n",
+ "#### anomaly score related parameters:\n",
+ "- `dimension_anomaly_score_threshold`: The threshold on the anomaly score, above which the data is considered anomalous and the [anomaly bit](https://github.com/netdata/netdata/blob/master/ml/README.md#anomaly-bit) is set to 1 (it's actually set to 100 in reality, but this is just to make it behave more like a rate when aggregated in the netdata agent api). By default this is `0.99` which means anything with an anomaly score above 99% is considered anomalous. Decreasing this threshold makes the model more sensitive and will lead to more anomaly bits; increasing it does the opposite.\n",
+ "\n",
+ "#### model parameters:\n",
+ "- `n_clusters_per_dimension`: This is the number of clusters to fit for each model, by default it is set to 2 such that 2 cluster [centroids](https://en.wikipedia.org/wiki/Centroid) will be fit for each model.\n",
+ "- `max_iterations`: The maximum number of iterations the fitting of the clusters is allowed to take. In reality the clustering will converge a lot sooner than this.\n",
+ "\n",
+ "**Note**: There is a much more detailed discussion of all these configuration parameters in the [\"Configuration\"](https://github.com/netdata/netdata/blob/master/ml/README.md#configuration) section of the ml readme."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "id": "tBUVUpR3fohX"
+ },
+ "outputs": [],
+ "source": [
+ "# data params\n",
+ "hosts = ['london.my-netdata.io']\n",
+ "charts = ['system.cpu']\n",
+ "# if want to just focus on a subset of dims, in this case lets just pick one for simplicity\n",
+ "dims = ['system.cpu|user'] \n",
+ "last_n_hours = 2\n",
+ "# based on last_n_hours define the relevant 'before' and 'after' params for the netdata rest api on the agent\n",
+ "before = int(datetime.utcnow().timestamp())\n",
+ "after = int((datetime.utcnow() - timedelta(hours=last_n_hours)).timestamp())\n",
+ "\n",
+ "# ml params\n",
+ "train_every = 3600\n",
+ "num_samples_to_train = 3600\n",
+ "num_samples_to_diff = 1\n",
+ "num_samples_to_smooth = 3\n",
+ "num_samples_to_lag = 5\n",
+ "dimension_anomaly_score_threshold = 0.99\n",
+ "n_clusters_per_dimension = 2\n",
+ "max_iterations = 1000"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 1. Get raw data<a id=\"get-raw-data\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Next we will use the `get_data()` function from the [netdata-pandas](https://github.com/netdata/netdata-pandas) library to just pull down our raw data from the agent into a Pandas dataframe."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 660
+ },
+ "id": "Ypudrfu-fpje",
+ "outputId": "b25c7322-03b4-4475-c416-37c3abbe78a4"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(7200, 1)\n",
+ "1647978087 1647985286\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>system.cpu|user</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>time_idx</th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>1647978087</th>\n",
+ " <td>1.503759</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647978088</th>\n",
+ " <td>0.252525</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647978089</th>\n",
+ " <td>0.755668</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647978090</th>\n",
+ " <td>0.503778</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647978091</th>\n",
+ " <td>0.501253</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " system.cpu|user\n",
+ "time_idx \n",
+ "1647978087 1.503759\n",
+ "1647978088 0.252525\n",
+ "1647978089 0.755668\n",
+ "1647978090 0.503778\n",
+ "1647978091 0.501253"
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 1152x432 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# get raw data\n",
+ "df = get_data(hosts=hosts, charts=charts, after=after, before=before)\n",
+ "\n",
+ "# filter df for just the dims if set\n",
+ "if len(dims):\n",
+ " df = df[[dim for dim in dims]]\n",
+ "\n",
+ "# set some variables based on our data\n",
+ "df_timestamp_min = df.index.min()\n",
+ "df_timestamp_max = df.index.max()\n",
+ "\n",
+ "# print some info\n",
+ "print(df.shape)\n",
+ "print(df_timestamp_min, df_timestamp_max)\n",
+ "display(df.head())\n",
+ "\n",
+ "# lets just plot each dimension to have a look at it\n",
+ "for col in df.columns: \n",
+ "\n",
+ " # plot dimension, setting index to datetime so its more readable on the plot\n",
+ " df[[col]].set_index(pd.to_datetime(df.index, unit='s')).plot(title=f'Raw Data - {col}', figsize=(16,6))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 2. Add some anomalous data<a id=\"add-some-anomalous-data\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Below we will pick the last `n_tail_anomalous` observations and mess them up in some random but noticeable way. In this case we randomly shuffle the data and then multiply each observation by some integer randomly chosen from `integers_to_pick_randomly`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 405
+ },
+ "id": "RDuB5PdjOaAX",
+ "outputId": "d686cea5-d0a8-4ed4-aa58-64770a063fbb"
+ },
+ "outputs": [],
+ "source": [
+ "# size of anomalous data\n",
+ "n_tail_anomalous = 500\n",
+ "integers_to_pick_randomly = [0,1,5,10]\n",
+ "\n",
+ "# randomly scramble data and multiply randomly by some numbers to make it anomalous looking\n",
+ "anomalous_shape = (n_tail_anomalous, len(df.columns))\n",
+ "randomly_scrambled_data = np.random.choice(df.tail(n_tail_anomalous).values.reshape(-1,), anomalous_shape)\n",
+ "random_integers = np.random.choice(integers_to_pick_randomly, anomalous_shape)\n",
+ "data_anomalous = randomly_scrambled_data * random_integers\n",
+ "\n",
+ "# create anomalous dataframe\n",
+ "df_anomalous = pd.DataFrame(data = data_anomalous, columns = df.columns)\n",
+ "# make sure it has the expected index since we don't want to shuffle that\n",
+ "df_anomalous.index = df.tail(n_tail_anomalous).index\n",
+ "\n",
+ "# overwrite last n_tail observations with anomalous data\n",
+ "df.update(df_anomalous)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the plot below it should be clear that the light yellow section of the data has been messed with and is now \"anomalous\" or \"strange looking\" in comparison to all the data that comes before it. \n",
+ "\n",
+ "Our goal now is to create some sort of [anomaly score](https://github.com/netdata/netdata/blob/master/ml/README.md#anomaly-score) that can easily capture this."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 1152x432 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# let's just plot each dimension now that we have added some anomalous data\n",
+ "for col in df.columns:\n",
+ " \n",
+ " ax = df.set_index(pd.to_datetime(df.index, unit='s')).plot(title=f'Anomalous Data Appended - {col}', figsize=(16,6))\n",
+ " add_shading_to_plot(ax, df_timestamp_max - n_tail_anomalous, df_timestamp_max, 'anomalous data')\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 3. Lets do some ML!<a id=\"lets-do-some-ml\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "\n",
+ "\n",
+ "In this notebook we will just use good old [kmeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html) from [scikit-learn](https://scikit-learn.org/stable/index.html). \n",
+ "\n",
+ "In reality the Netdata Agent uses the awesome [dlib](https://github.com/davisking/dlib) c++ library and the [`find_clusters_using_kmeans`](http://dlib.net/ml.html#find_clusters_using_kmeans) function along with a few others. You can see the Netdata KMeans code [here](https://github.com/netdata/netdata/blob/master/ml/kmeans/KMeans.cc).\n",
+ "\n",
+ "The code below:\n",
+ "\n",
+ "1. Will initialize some empty objects to use during model training and inference.\n",
+ "2. Will loop over every observation and run training and inference in a similar way to how the Agent would process each observation.\n",
+ "\n",
+ "Of course the Agent implementation is a lot more efficient and uses more efficient streaming and buffer-based approaches as opposed to the fairly naive implementation below. \n",
+ "\n",
+ "The idea in this notebook is to make the general approach as readable and understandable as possible."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 7,
+ "metadata": {
+ "id": "W6UL8U04ppmM"
+ },
+ "outputs": [],
+ "source": [
+ "# initialize an empty kmeans model for each dimension\n",
+ "models = {\n",
+ " dim: {\n",
+ " 'model' : KMeans(n_clusters=n_clusters_per_dimension, max_iter=max_iterations),\n",
+ " 'fitted': False\n",
+ " } for dim in df.columns\n",
+ "}\n",
+ "\n",
+ "# initialize dictionary for storing anomaly scores for each dim\n",
+ "anomaly_scores = {\n",
+ " dim: {\n",
+ " 't' : [],\n",
+ " 'anomaly_score': []\n",
+ " } for dim in df.columns\n",
+ "}\n",
+ "\n",
+ "# initialize dictionary for storing anomaly bits for each dim\n",
+ "anomaly_bits = {\n",
+ " dim: {\n",
+ " 't' : [],\n",
+ " 'anomaly_bit': []\n",
+ " }\n",
+ " for dim in df.columns\n",
+ "}"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now we are ready to just loop over each row of data and produce anomaly scores once we have some trained models and train or retrain periodically as defined by `train_every`. \n",
+ "\n",
+ "**Note**: The Netdata Agent implementation spreads out the training across each `train_every` window as opposed to trying to train all models in one go like the below implementation. It also avoids some obvious edge cases where there is no need to retrain, for example when the data have not changed at all since the last model was trained."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/"
+ },
+ "id": "_wxIeEhGiWYv",
+ "outputId": "8fdfad43-917d-42d1-8997-a49daac25b3d"
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "train at t=1647981687, (n=3600, train_after=1647981687, train_before=1647978087)\n"
+ ]
+ }
+ ],
+ "source": [
+ "# loop over each row of data in dataframe\n",
+ "for t, row in df.iterrows():\n",
+ "\n",
+ " # get n based on timestamp\n",
+ " n = t - df_timestamp_min\n",
+ "\n",
+ " # for each dimension, if we have a fitted model then make predictions\n",
+ " for dim in df.columns:\n",
+ "\n",
+ " # if we have a fitted model, get anomaly score\n",
+ " if models[dim]['fitted']:\n",
+ " \n",
+ " #################################\n",
+ " # Inference / Scoring\n",
+ " #################################\n",
+ "\n",
+ " # get a buffer of recent data\n",
+ " buffer_size = num_samples_to_diff + num_samples_to_smooth + num_samples_to_lag * 2\n",
+ " df_dim_recent = df[[dim]].loc[(t-buffer_size):t]\n",
+ "\n",
+ " # preprocess/featurize recent data\n",
+ " df_dim_recent_preprocessed = preprocess_df(\n",
+ " df_dim_recent,\n",
+ " num_samples_to_lag,\n",
+ " num_samples_to_diff,\n",
+ " num_samples_to_smooth\n",
+ " )\n",
+ "\n",
+ " # take most recent feature vector\n",
+ " X = df_dim_recent_preprocessed.tail(1).values\n",
+ " \n",
+ " # get the existing trained cluster centers\n",
+ " cluster_centers = models[dim]['model'].cluster_centers_\n",
+ "\n",
+ " # get anomaly score based on the sum of the euclidian distances between the \n",
+ " # feature vector and each cluster centroid\n",
+ " raw_anomaly_score = np.sum(cdist(X, cluster_centers, metric='euclidean'), axis=1)[0]\n",
+ "\n",
+ " # normalize anomaly score based on min-max normalization\n",
+ " # https://en.wikipedia.org/wiki/Feature_scaling#Rescaling_(min-max_normalization)\n",
+ " # the idea here is to convert the raw_anomaly_score we just computed into a number on a\n",
+ " # [0, 1] scale such that it behaves more like a percentage. We use the min and max raw scores\n",
+ " # observed during training to achieve this. This would mean that a normalized score of 1 would\n",
+ " # correspond to a distance as big as the biggest distance (most anomalous) observed on the \n",
+ " # training data. So scores that are 99% or higher will tend to be as strange or more strange\n",
+ " # as the most strange 1% observed during training.\n",
+ " \n",
+ " # normalize based on scores observed during training the model\n",
+ " train_raw_anomaly_score_min = models[dim]['train_raw_anomaly_score_min']\n",
+ " train_raw_anomaly_score_max = models[dim]['train_raw_anomaly_score_max']\n",
+ " train_raw_anomaly_score_range = train_raw_anomaly_score_max - train_raw_anomaly_score_min\n",
+ " \n",
+ " # normalize\n",
+ " anomaly_score = (raw_anomaly_score - train_raw_anomaly_score_min) / train_raw_anomaly_score_range\n",
+ " \n",
+ " # The Netdata Agent does not actually store the normalized_anomaly_score since doing so would require more storage\n",
+ " # space for each metric, essentially doubling the amount of metrics that need to be stored. Instead, the Netdata Agent\n",
+ " # makes use of an existing bit (the anomaly bit) in the internal storage representation used by netdata. So if the \n",
+ " # normalized_anomaly_score passed the dimension_anomaly_score_threshold netdata will flip the corresponding anomaly_bit\n",
+ " # from 0 to 1 to signify that the observation behind the scored feature vector is considered \"anomalous\". \n",
+ " # All without any extra storage overhead required for the Netdata Agent database! Yes it's almost magic :)\n",
+ "\n",
+ " # get anomaly bit\n",
+ " anomaly_bit = 100 if anomaly_score >= dimension_anomaly_score_threshold else 0\n",
+ " \n",
+ " # save anomaly score\n",
+ " anomaly_scores[dim]['t'].append(t)\n",
+ " anomaly_scores[dim]['anomaly_score'].append(anomaly_score)\n",
+ "\n",
+ " # save anomaly bit\n",
+ " anomaly_bits[dim]['t'].append(t)\n",
+ " anomaly_bits[dim]['anomaly_bit'].append(anomaly_bit)\n",
+ " \n",
+ " # check if the model needs (re)training\n",
+ " if (n >= num_samples_to_train) & (n % train_every == 0):\n",
+ " \n",
+ " #################################\n",
+ " # Train / Re-Train\n",
+ " #################################\n",
+ "\n",
+ " train_before = t - num_samples_to_train\n",
+ " train_after = t\n",
+ " print(f'train at t={t}, (n={n}, train_after={train_after}, train_before={train_before})')\n",
+ "\n",
+ " # loop over each dimension/model\n",
+ " for dim in df.columns:\n",
+ " \n",
+ " # get training data based on most recent num_samples_to_train\n",
+ " df_dim_train = df[[dim]].loc[(t-num_samples_to_train):t]\n",
+ " \n",
+ " # preprocess/featurize training data\n",
+ " df_dim_train_preprocessed = preprocess_df(\n",
+ " df_dim_train,\n",
+ " num_samples_to_lag,\n",
+ " num_samples_to_diff,\n",
+ " num_samples_to_smooth\n",
+ " )\n",
+ "\n",
+ " # fit model using the fit method of kmeans\n",
+ " models[dim]['model'].fit(df_dim_train_preprocessed.values) \n",
+ " models[dim]['fitted'] = True # mark model as fitted\n",
+ " \n",
+ " # get cluster centers of model we just trained\n",
+ " cluster_centers = models[dim]['model'].cluster_centers_\n",
+ "\n",
+ " # get training scores, needed to get min and max scores for normalization at inference time\n",
+ " train_raw_anomaly_scores = np.sum(cdist(df_dim_train_preprocessed.values, cluster_centers, metric='euclidean'), axis=1)\n",
+ " # save min and max anomaly score during training, used to normalize all scores to be 0,1 scale\n",
+ " models[dim]['train_raw_anomaly_score_min'] = min(train_raw_anomaly_scores)\n",
+ " models[dim]['train_raw_anomaly_score_max'] = max(train_raw_anomaly_scores)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The hard work is now all done. The above cell has processed all the data, trained or retrained models as defined by the initial config, and saved all anomaly scores and anomaly bits.\n",
+ "\n",
+ "The rest of the notebook will try to help make more sense of all this."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "id": "0iN0PCPGiWBx"
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>system.cpu|user</th>\n",
+ " <th>system.cpu|user__anomaly_score</th>\n",
+ " <th>system.cpu|user__anomaly_bit</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>time_idx</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>1647981888</th>\n",
+ " <td>0.753769</td>\n",
+ " <td>0.228337</td>\n",
+ " <td>0.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647984190</th>\n",
+ " <td>0.757576</td>\n",
+ " <td>0.144231</td>\n",
+ " <td>0.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647983651</th>\n",
+ " <td>0.753769</td>\n",
+ " <td>0.198606</td>\n",
+ " <td>0.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647982084</th>\n",
+ " <td>0.757576</td>\n",
+ " <td>0.189867</td>\n",
+ " <td>0.0</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647983422</th>\n",
+ " <td>1.002506</td>\n",
+ " <td>0.333199</td>\n",
+ " <td>0.0</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " system.cpu|user system.cpu|user__anomaly_score \\\n",
+ "time_idx \n",
+ "1647981888 0.753769 0.228337 \n",
+ "1647984190 0.757576 0.144231 \n",
+ "1647983651 0.753769 0.198606 \n",
+ "1647982084 0.757576 0.189867 \n",
+ "1647983422 1.002506 0.333199 \n",
+ "\n",
+ " system.cpu|user__anomaly_bit \n",
+ "time_idx \n",
+ "1647981888 0.0 \n",
+ "1647984190 0.0 \n",
+ "1647983651 0.0 \n",
+ "1647982084 0.0 \n",
+ "1647983422 0.0 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# create dataframe of anomaly scores\n",
+ "df_anomaly_scores = pd.DataFrame()\n",
+ "for dim in anomaly_scores:\n",
+ " df_anomaly_scores_dim = pd.DataFrame(data=zip(anomaly_scores[dim]['t'],anomaly_scores[dim]['anomaly_score']),columns=['time_idx',f'{dim}__anomaly_score']).set_index('time_idx')\n",
+ " df_anomaly_scores = df_anomaly_scores.join(df_anomaly_scores_dim, how='outer')\n",
+ "\n",
+ "# create dataframe of anomaly bits\n",
+ "df_anomaly_bits = pd.DataFrame()\n",
+ "for dim in anomaly_bits:\n",
+ " df_anomaly_bits_dim = pd.DataFrame(data=zip(anomaly_bits[dim]['t'],anomaly_bits[dim]['anomaly_bit']),columns=['time_idx',f'{dim}__anomaly_bit']).set_index('time_idx')\n",
+ " df_anomaly_bits = df_anomaly_bits.join(df_anomaly_bits_dim, how='outer')\n",
+ "\n",
+ "# join anomaly scores to raw df\n",
+ "df_final = df.join(df_anomaly_scores, how='outer')\n",
+ "\n",
+ "# join anomaly bits to raw df\n",
+ "df_final = df_final.join(df_anomaly_bits, how='outer')\n",
+ "\n",
+ "# let's look at a sample of some scored observations\n",
+ "display(df_final.tail(num_samples_to_train).sample(5))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the dataframe above we see that each observation now also has a column with the `__anomaly_score` and one with the `__anomaly_bit`."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 4. Let's visualize all this!<a id=\"lets-visualize-all-this\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we have our raw data, our anomaly scores, and our anomaly bits - we can plot this all side by side to get a clear picture of how it all works together.\n",
+ "\n",
+ "In the plots below we see that during the light yellow \"anomalous\" period the \"[anomaly scores](https://github.com/netdata/netdata/blob/master/ml/README.md#anomaly-score)\" get elevated to such an extent that many \"[anomaly bits](https://github.com/netdata/netdata/blob/master/ml/README.md#anomaly-bit)\" start flipping from 0 to 1 and essentially \"turning on\" to signal potentially anomalous data."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 10,
+ "metadata": {
+ "colab": {
+ "base_uri": "https://localhost:8080/",
+ "height": 1000
+ },
+ "id": "zVoR1BJ5nCGv",
+ "outputId": "ffcc7765-ea39-47c1-da99-ec79647d0871",
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 1440x288 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 1440x288 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 1440x288 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 1440x288 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "figsize = (20,4)\n",
+ "\n",
+ "for dim in models:\n",
+ "\n",
+ " # create a dim with the raw data, anomaly score and anomaly bit for the dim\n",
+ " df_final_dim = df_final[[dim,f'{dim}__anomaly_score',f'{dim}__anomaly_bit']]\n",
+ " \n",
+ " # plot raw data, including the anomalous data\n",
+ " ax = df_final_dim[[dim]].set_index(pd.to_datetime(df_final_dim.index, unit='s')).plot(\n",
+ " title=f'Raw Data (Anomalous Appended) - {dim}', figsize=figsize\n",
+ " )\n",
+ " add_shading_to_plot(ax, df_timestamp_max - n_tail_anomalous, df_timestamp_max, 'Anomalous Data')\n",
+ " \n",
+ " # plot the corresponding anomaly scores\n",
+ " ax = df_final_dim[[f'{dim}__anomaly_score']].set_index(pd.to_datetime(df_final_dim.index, unit='s')).plot(\n",
+ " title=f'Anomaly Score - {dim}', figsize=figsize\n",
+ " )\n",
+ " add_shading_to_plot(ax, df_timestamp_max - n_tail_anomalous, df_timestamp_max, 'Anomalous Data')\n",
+ " \n",
+ " # plot the corresponding anomaly bits\n",
+ " ax = df_final_dim[[f'{dim}__anomaly_bit']].set_index(pd.to_datetime(df_final_dim.index, unit='s')).plot(\n",
+ " title=f'Anomaly Bit - {dim}', figsize=figsize\n",
+ " )\n",
+ " add_shading_to_plot(ax, df_timestamp_max - n_tail_anomalous, df_timestamp_max, 'Anomalous Data')\n",
+ "\n",
+ " # finally, plot it all on the same plot (which might not be so easy or clear to read)\n",
+ " df_final_dim_normalized = (df_final_dim-df_final_dim.min())/(df_final_dim.max()-df_final_dim.min())\n",
+ " ax = df_final_dim_normalized.set_index(pd.to_datetime(df_final_dim_normalized.index, unit='s')).plot(\n",
+ " title=f'Combined (Raw, Score, Bit) - {dim}', figsize=figsize\n",
+ " )\n",
+ " add_shading_to_plot(ax, df_timestamp_max - n_tail_anomalous, df_timestamp_max, 'Anomalous Data')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The last concept to introduce now is the \"[anomaly rate](https://github.com/netdata/netdata/blob/master/ml/README.md#anomaly-rate)\" which is really just an average over \"anomaly bits\".\n",
+ "\n",
+ "For example, in the next cell we will just average all the anomaly bits across the light yellow window of time to find the anomaly rate for the metric within this window. "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 11,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "n_tail_anomalous_anomaly_rate = 96.6%\n",
+ "\n",
+ "This means the \"anomaly rate\" within the yellow period of anomalous data was 96.6%\n",
+ "\n",
+ "Another way to think of this is that 96.6% of the observations during the yellow \n",
+ "window were considered anomalous based on the latest trained model.\n"
+ ]
+ }
+ ],
+ "source": [
+ "# average the anomaly bits within the n_tail_anomalous period of the data\n",
+ "n_tail_anomalous_anomaly_rate = df_final_dim[[f'{dim}__anomaly_bit']].tail(n_tail_anomalous).mean()[0]\n",
+ "\n",
+ "print(f'n_tail_anomalous_anomaly_rate = {n_tail_anomalous_anomaly_rate}%')\n",
+ "print(f'\\nThis means the \"anomaly rate\" within the yellow period of anomalous data was {n_tail_anomalous_anomaly_rate}%')\n",
+ "print(f'\\nAnother way to think of this is that {n_tail_anomalous_anomaly_rate}% of the observations during the yellow \\nwindow were considered anomalous based on the latest trained model.')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### 5. So, how does it _actually_ work?<a id=\"so-how-does-it-actually-work\"></a>"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this final section of the notebook below we will dig in to try to understand this a bit more intuitively.\n",
+ "\n",
+ "First we will \"[featurize](https://brilliant.org/wiki/feature-vector/)\" or \"preprocess\" all the data. Then we will explore what these feature vectors actually are, how they look, and how we derive anomaly scores based on their distance to the model's cluster centroids."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 12,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# preprocess/featurize all data\n",
+ "df_preprocessed = preprocess_df(\n",
+ " df,\n",
+ " num_samples_to_lag,\n",
+ " num_samples_to_diff,\n",
+ " num_samples_to_smooth\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Now that we have preprocessed all of our data, let's just take a look at it.\n",
+ "\n",
+ "You will see that we have essentially just added `num_samples_to_lag` additional columns to the dataframe, one for each lag. The numbers themselves are also no longer the original raw metric values; instead they have first been differenced (just take the difference of the latest value with the previous value so that we are working with deltas as opposed to the original raw metric) and also smoothed (in this case by just averaging the previous `num_samples_to_smooth` differenced values).\n",
+ "\n",
+ "The idea here is to define the representation that the model will work in. In this case the model will decide if a recent observation is anomalous based on its corresponding feature vector which is a differenced, smoothed, and lagged array or list of recent values."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "(7192, 6)\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>system.cpu|user_lag0</th>\n",
+ " <th>system.cpu|user_lag1</th>\n",
+ " <th>system.cpu|user_lag2</th>\n",
+ " <th>system.cpu|user_lag3</th>\n",
+ " <th>system.cpu|user_lag4</th>\n",
+ " <th>system.cpu|user_lag5</th>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>time_idx</th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " <th></th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>1647983445</th>\n",
+ " <td>3.330669e-16</td>\n",
+ " <td>0.167293</td>\n",
+ " <td>0.499561</td>\n",
+ " <td>0.167504</td>\n",
+ " <td>0.000633</td>\n",
+ " <td>0.253165</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647980613</th>\n",
+ " <td>5.967300e-03</td>\n",
+ " <td>0.000422</td>\n",
+ " <td>0.166665</td>\n",
+ " <td>0.335848</td>\n",
+ " <td>0.083963</td>\n",
+ " <td>0.083542</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647984383</th>\n",
+ " <td>2.531518e-01</td>\n",
+ " <td>0.083327</td>\n",
+ " <td>0.001899</td>\n",
+ " <td>0.251886</td>\n",
+ " <td>0.083963</td>\n",
+ " <td>0.082700</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647984447</th>\n",
+ " <td>1.696266e-01</td>\n",
+ " <td>0.083542</td>\n",
+ " <td>0.081459</td>\n",
+ " <td>0.082074</td>\n",
+ " <td>0.082280</td>\n",
+ " <td>0.168344</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>1647983270</th>\n",
+ " <td>5.101800e-03</td>\n",
+ " <td>0.082498</td>\n",
+ " <td>0.082703</td>\n",
+ " <td>0.004262</td>\n",
+ " <td>0.174051</td>\n",
+ " <td>0.001050</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " system.cpu|user_lag0 system.cpu|user_lag1 system.cpu|user_lag2 \\\n",
+ "time_idx \n",
+ "1647983445 3.330669e-16 0.167293 0.499561 \n",
+ "1647980613 5.967300e-03 0.000422 0.166665 \n",
+ "1647984383 2.531518e-01 0.083327 0.001899 \n",
+ "1647984447 1.696266e-01 0.083542 0.081459 \n",
+ "1647983270 5.101800e-03 0.082498 0.082703 \n",
+ "\n",
+ " system.cpu|user_lag3 system.cpu|user_lag4 system.cpu|user_lag5 \n",
+ "time_idx \n",
+ "1647983445 0.167504 0.000633 0.253165 \n",
+ "1647980613 0.335848 0.083963 0.083542 \n",
+ "1647984383 0.251886 0.083963 0.082700 \n",
+ "1647984447 0.082074 0.082280 0.168344 \n",
+ "1647983270 0.004262 0.174051 0.001050 "
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(df_preprocessed.shape)\n",
+ "df_preprocessed.sample(5)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The model works based on these feature vectors. A lot of ML is about training a model to define some [\"compressed representation\"](https://en.wikipedia.org/wiki/Data_compression#Machine_learning) of the training data that can then be useful for new data in some way.\n",
+ "\n",
+ "This is exactly what our cluster models are trying to do. They process a big bag of preprocessed feature vectors, covering `num_samples_to_train` raw observations, during training to come up with the best, synthetic, `n_clusters_per_dimension` feature vectors as a useful compressed representation of the training data.\n",
+ "\n",
+ "The cell below will just show you what those `n_clusters_per_dimension` (in this case 2) synthetic (made up by the kmeans algo) feature vectors are."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 14,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "<div>\n",
+ "<style scoped>\n",
+ " .dataframe tbody tr th:only-of-type {\n",
+ " vertical-align: middle;\n",
+ " }\n",
+ "\n",
+ " .dataframe tbody tr th {\n",
+ " vertical-align: top;\n",
+ " }\n",
+ "\n",
+ " .dataframe thead th {\n",
+ " text-align: right;\n",
+ " }\n",
+ "</style>\n",
+ "<table border=\"1\" class=\"dataframe\">\n",
+ " <thead>\n",
+ " <tr style=\"text-align: right;\">\n",
+ " <th></th>\n",
+ " <th>system.cpu|user_lag0</th>\n",
+ " <th>system.cpu|user_lag1</th>\n",
+ " <th>system.cpu|user_lag2</th>\n",
+ " <th>system.cpu|user_lag3</th>\n",
+ " <th>system.cpu|user_lag4</th>\n",
+ " <th>system.cpu|user_lag5</th>\n",
+ " </tr>\n",
+ " </thead>\n",
+ " <tbody>\n",
+ " <tr>\n",
+ " <th>centroid 0</th>\n",
+ " <td>0.182626</td>\n",
+ " <td>0.169506</td>\n",
+ " <td>0.100484</td>\n",
+ " <td>0.178778</td>\n",
+ " <td>0.177843</td>\n",
+ " <td>0.100711</td>\n",
+ " </tr>\n",
+ " <tr>\n",
+ " <th>centroid 1</th>\n",
+ " <td>0.115532</td>\n",
+ " <td>0.141029</td>\n",
+ " <td>0.276627</td>\n",
+ " <td>0.122611</td>\n",
+ " <td>0.124448</td>\n",
+ " <td>0.276112</td>\n",
+ " </tr>\n",
+ " </tbody>\n",
+ "</table>\n",
+ "</div>"
+ ],
+ "text/plain": [
+ " system.cpu|user_lag0 system.cpu|user_lag1 system.cpu|user_lag2 \\\n",
+ "centroid 0 0.182626 0.169506 0.100484 \n",
+ "centroid 1 0.115532 0.141029 0.276627 \n",
+ "\n",
+ " system.cpu|user_lag3 system.cpu|user_lag4 system.cpu|user_lag5 \n",
+ "centroid 0 0.178778 0.177843 0.100711 \n",
+ "centroid 1 0.122611 0.124448 0.276112 "
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# lets pick the first model to look at\n",
+ "model = list(models.keys())[0]\n",
+ "\n",
+ "# get the cluster centroids and put them in a dataframe similar to above\n",
+ "df_cluster_centers = pd.DataFrame(models[model]['model'].cluster_centers_, columns=df_preprocessed.columns)\n",
+ "df_cluster_centers.index = [f'centroid {i}' for i in df_cluster_centers.index.values]\n",
+ "display(df_cluster_centers)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "At inference time we can now use our `n_clusters_per_dimension` cluster centers as a sort of set of \"reference\" feature vectors we can compare against. \n",
+ "\n",
+ "When we see a new feature vector that is very far away from these \"reference\" feature vectors, we can take that as a signal that the recent data the feature vector was derived from may look significantly different than most of the data the clusters were initially trained on. And as such it may be \"anomalous\" or \"strange\" in some way that might be meaningful to you as a user trying to monitor and troubleshoot systems based on these metrics.\n",
+ "\n",
+ "To try to make this visually clearer we will take 10 random feature vectors from the first half of our data where things were generally normal and we will also take 10 random feature vectors from the yellow anomalous period of time. Lastly we will also include the cluster centroids themselves to see how they compare to both sets of 10 feature vectors. \n",
+ "\n",
+ "Basically this is represented in the heatmap below where each row is a processed feature vector corresponding to some timestamp `t`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "pycharm": {
+ "name": "#%%\n"
+ }
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<AxesSubplot:>"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x720 with 2 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# prepare heatmap\n",
+ "df_heatmap = pd.concat([df_preprocessed.sample(10),df_preprocessed.tail(n_tail_anomalous).sample(10), df_cluster_centers])\n",
+ "df_heatmap = df_heatmap.round(2)\n",
+ "\n",
+ "# get scores\n",
+ "heatmap_scores = np.sum(cdist(df_heatmap, models[dim]['model'].cluster_centers_, metric='euclidean'), axis=1)\n",
+ "heatmap_bits = [1 if score >= dimension_anomaly_score_threshold else 0 for score in heatmap_scores]\n",
+ "\n",
+ "# add anomaly score to index\n",
+ "heatmap_index_inputs = list(zip(range(1, len(df_heatmap)+1), df_heatmap.index, heatmap_scores, heatmap_bits))\n",
+ "df_heatmap.index = [f'{x[0]}. t={x[1]} (AS={round(x[2]*100)}%, AB={x[3]})' for x in heatmap_index_inputs]\n",
+ "\n",
+ "fig, ax = plt.subplots(figsize=(10,10))\n",
+ "sns.heatmap(df_heatmap, annot=True, ax=ax, cmap='RdYlBu')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You should see from the above heatmap that the top 10 rows all tend to have lower anomaly scores (AS) and anomaly bits (AB) that are 0, while it's the opposite for rows 11-20.\n",
+ "\n",
+ "The final two rows are the cluster centroids themselves, which should look more similar to the first 10 rows and fairly different from rows 11-20. And of course, you would expect that each cluster centroid itself has a low anomaly score and non-anomalous anomaly bit."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Another way to present this visually is via line or bar plots. Below we just plot each of the rows above as a line. First all at the same time on one plot and then each one individually as a bar plot."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In the plot below the dashed lines correspond to the feature vectors sampled from the yellow anomalous period and as such we expect them to look very different to the solid lines (sampled from the normal period) and the solid circled lines which correspond to the two centroids. (Admittedly it's not the clearest of plots to read since the normal lines all bunch together)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "<AxesSubplot:>"
+ ]
+ },
+ "execution_count": 16,
+ "metadata": {},
+ "output_type": "execute_result"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x360 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "line_styles = ['-' for i in range(10)] + ['--' for i in range(10)] + ['o-' for i in range(2)]\n",
+ "df_heatmap.transpose().plot(legend=False, style=line_styles, figsize=(10,5), rot=15)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can also just plot each feature vector itself as a bar plot with one bar for each of our 6 features. We have set the y-axis in the below plots to be fixed such that the differences are more obvious."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "scrolled": true
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAlEAAADvCAYAAADSI4HyAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAhj0lEQVR4nO3debwkVXnw8d8DDIvIEmQEBGQAARGVJSMIoqJGEBGIy2sUg0uMQ14lcTfqq1HURGJ8EWKCbxBBSDTikriCCmFVFBg22RERZREYwFEW2Z/3j3Naivbemb41t+t23/v7fj71ud21nnqqbvfT51SdisxEkiRJU7PSTBdAkiRpHJlESZIktWASJUmS1IJJlCRJUgsmUZIkSS2YREmSJLVgEiVprEXEQRFx+EyXYy6LiH0j4oSZLofUNZMoCYiIbSPi1Ij4TURcExEvncKyn4+Ij7Xc7sERsTgi7ouIz08w/TERcWRE3FbLduYE86waEVdExA2Ncc+OiLv6hoyIl9fpq0XEpyLipoj4dd3GvMby/cs+FBGfbkx/Zd3mnRFxeUT8aWPa6+v8zeX3aEzfISLOqvtzQ0R8sDHtmRFxckTcERFLIuIrEbHRMuK3KvAB4J/6xj+2bvekCZbZPSLOrtu/IyJ+GBHPmGwbyxMRx9TYPqkx7vSIuLex/1c1pm0fEZfVY/qOxvh5EXFORGzaogx71DL8bd/4BXV8rxy39B/rKWxjvYj474i4OyJ+EREH9KZl5reA7SLi6VNdrzTOTKI050XEKsA3gG8D6wGLgP+IiK072PxNwMeAYyaZflQt07b179snmOfdwJLmiMw8KzMf2xuAlwB3Ad+ts7wXWAg8Fdga2ImSjPSWby67IfA74CsAEbEx8B/AO4C16/a/GBGPbxThR811ZObpjWlfBM6s+/Nc4M0RsV+d9kd1nxcAmwF3AsdOEhuA/YErM/PGvvEvB+4DXhgRG/ZGRsTalOP86br9jYFD6rxTFhG7A1tOMvngxv5v0xj/ceBdwPbA/2mU7x3A1zLz+hZFeR1wB/DaSaavW4/l04Bdgbe02Ma/AvcDGwCvAT4TEds1pv8n5X9Hmjsy08FhTg+UROIuIBrjvg98dIBlFwEPUL5c7gK+1bIMHwM+3zfuycBvgbWXsdzmwBXA3sANy5jvWODYxvvFwP9qvD8AuH6SZV8HXNuLD7ALcGvfPEuAXevr1wM/WEZZ7gGe0nj/FeB9k8y7E3DnMtZ1DPCBCcafCvw9cAHwrsb4hcDSaTpvVgEuBJ4OJPCkxrTTgb+cZLkrgNXq6x8DO1MSxnOBeS3KsSYl2XxVPQ8XNqYtqGVbpTHuE8BRLbZxP7B1Y9y/A4c23j8L+Pl0xNbBYVwGa6KkiQUluVqmzDwK+ALwiSw1DvsCRMS3I2LpJMO3ByzDzsAvgENq088lvea4hk8D76fUFE28IxFrAq8AjptgH5uvN4mIdSZYxeuA4zOz94yoxcAVEbFfRKxcm/LuA37SWGbHWuarI+KDtbav53DgtbX5ahtKzcgpkxT/OcBlk+0bpWblquaIiNgM2INyXL7Ao2tnrgYeiojjImLviPijvmV3X8ZxW1prnnreDpyZmc39bvp4jcEPm82ZwKXAnhGxCSXJ+RlwBPDuzHxgGfs6mZdREvivAN+jHK8JRcQTgL0oyVtv3CDn6tbAg5l5dWN1FwPNmqgrgAW1tk+aG2Y6i3NwmOkBmEepaXlPfb0n5Vf39wZc/vPAx1awDBPVRL2fUovwYWBVStPXXcC2dfpLgZPq6z2YpCYKOBD4OY+uafsY8ENgPqW57py6rY36lt0MeAjYvG/8G2tZHqTULO3TmLYFpYZsJUqSczmNmiZgN+CaumwCh0xS7qdTmqievYy4/RR4Ud+4DwAX1dcb1/Lv2Ji+bT1mN9QyfBPYYIrHa9O6D+vU9/01UbsAawGrUZKaO4EtGzE9kVJL9mpgP0qtzhMpzcpn0Kgl
HKAspwCH19evptQKzqvvF9SyLa1DAmezjNrNSbbxbODmvnFvAk7v+z9K4InD+D91cBjFwZoozXlZfv3/KbAPcDPwTuDLlC/ZmfQ7SlPhxzLz/sw8AziNUouxJqVZ5m8GWE9/TRKUpq4LgYsoX6pfr9u6pW/ZAylNcz/vjYiIP6nb3oNHkrujI2IHgMy8NjN/npkPZ+YlwEcoNWFExHqU67I+AqxOSUb2iog3NzdaL9I+CXhrZp61jH37NSVZaXotpQaKLNdKnUGjdiYzr8jM12fmJpTaxidQasem4nDgI5n5m4kmZuY5mXlnZt6XmcdREtYX12m/yMwXZ+ZOlKTpo5RrpD4JnEBJqg6rsVqmehH683r7W9e3OuVcblo/M9cFHlPL8r0p7CuUhLm/hmltSnLY0zsOS6e4bmlsmURJQGb+JDOfm5mPy8y9KLUp5w66eP+IiDgp/vAOt97wB3eMTWKiZqLetrai1DKcFRE3A/8FbBQRN0fEgkY5NqUkO8c/aiWZv8vMgzNz48zcArgdOD8zH+7b3mv5w2bAHSjNWItronQepSbrTybZj+SRpsMtgIcy8/jMfDAzbwC+RE0wapk3o9SufDQz/32Sdfb8hNLU1Ft2N0ps3ldjcTOlVuiAvibFXhyupNRKPbUuP9Fdjc3h2XXRFwD/1NgGwI+ad6wtIwZNfwd8NjNvodTaLa6J2Q3AkyaYv9+BlM/xb9VyXEtJoiZs0svM39X9fWZErF/3eZBz9WpglYjYqrG67Xl0U+u2wHWZ+dsByi3NDjNdFebgMAoDpelodcov9XdRmr9WG3DZQ4EvttzuKnW7H6c06axOvQiY0jxyDfDBOt+zKL/8n1zfb9gYXka5029DYOXG+t9PSXj6t7sxpQYmgGcC1wN79s2zG3A3sFbf+OcCtwE71Pc7UpKwPev7vanNY7WslwIfqu/XptRUHED58t8Q+BHwD41y/YzGxeDLid/LgO833v8b5aaAZmw2r3Hbt5bnncAmdf5NKTUzn53icXt83zayxnENYF3KdUer1+P0mhrHrfvW8RTKtUkr1/cnAn9FufvtNmDDOv504MOTlOMqSnNvsyz7Ua5Rexx9F5ZTmhcPBX5Fo3l3wH3+EuUOvDXrufgbYLu+c+3Imf5fdnDocpjxAjg4jMJA6Wfo15Rmi5N49PUtT6zjJ7zWg1LzcVFNDr4+xe1+uH7JNYcPN6ZvV5OMuynXFr10kvXswQTXRAFXAm+cYPxzgOso1zNdBbxmgnn+Dfj3SbZ3MCXBu5NS+/HOxrRPUpoF767TPkLjrjPg+cB59Uv4ZuCzwGPqtA/VGNzVHJYRv3nALykJ4er1GO47wXxHAl+lJGlfBm6s5bux7ueUrhGaYP2/vyaKcp3ZeTU2SymJ0gsnWOY0YJfG++3rMb4NeEdj/M8mWf6ZwL3A/AmmXVaP0YK+eC6lNG8+o8U+rkdp9r27xvyAvumXANsP8//UwWHUht4ty5I0liJiEaXLhLfNdFmmW72D78uZudtMl2VZImJf4MDMfOVMl0XqkkmUJElSC15YLkmS1IJJlCRJUgsmUZIkSS2YREmSJLXwB53PTYf1118/FyxYMIxVS5IkTavzzz//tsycP9XlhpJELViwgMWLFw9j1ZIkSdMqIn7RZjmb8yRJkloYKImKiHUj4qsRcWVEXBERuw67YJIkSaNs0Oa8I4DvZuYrImJVyvPFJEmS5qzlJlERsQ7lOVuvB8jM+4H7h1ssSZKk0TZIc97mwBLg2Ii4MCKOjog1+2eKiEURsTgiFi9ZsmTaCypJkjRKBkmiVgF2Aj6TmTtSnuD93v6ZMvOozFyYmQvnz5/yXYKSJEljZZAk6gbghsw8p77/KiWpkiRJmrOWm0Rl5s3A9RGxTR31AuDyoZZKkiRpxA16d95fA1+od+ZdC7xheEWSJEkafQMlUZl5EbBwuEWRJEkaH/ZYLkmS1IJJlCRJUgsmUZIkSS2YREmSJLVgEiVJktSC
SZQkSVILJlGSJEktmERJkiS1YBIlSZLUgkmUJElSCyZRkiRJLZhESZIktWASJUmS1IJJlCRJUgsmUZIkSS2YREmSJLVgEiVJktTCKoPMFBHXAXcCDwEPZubCYRZKkiRp1A2URFXPy8zbhlYSSZKkMWJzniRJUguDJlEJfD8izo+IRRPNEBGLImJxRCxesmTJ9JVQkiRpBA2aRO2emTsBewNviYjn9M+QmUdl5sLMXDh//vxpLaQkSdKoGSiJyswb699bgf8Gdh5moSRJkkbdcpOoiFgzItbqvQb2BC4ddsEkSZJG2SB3520A/HdE9Ob/YmZ+d6ilkiRJGnHLTaIy81pg+w7KIkmSNDbs4kCSJKkFkyhJkqQWTKIkSZJaMImSJElqwSRKkiSpBZMoSZKkFkyiJEmSWjCJkiRJasEkSpIkqQWTKEmSpBZMoiRJklowiZIkSWrBJEqSJKkFkyhJkqQWTKIkSZJaMImSJElqwSRKkiSphYGTqIhYOSIujIhvD7NAkiRJ42AqNVFvBa4YVkEkSZLGyUBJVERsAuwDHD3c4kiSJI2HQWuiDgfeAzw8vKJIkiSNj+UmURHxEuDWzDx/OfMtiojFEbF4yZIl01ZASZKkUTRITdSzgP0i4jrgS8DzI+I/+mfKzKMyc2FmLpw/f/40F1OSJGm0LDeJysz3ZeYmmbkAeBVwamb++dBLJkmSNMLsJ0qSJKmFVaYyc2aeDpw+lJJIkiSNEWuiJEmSWjCJkiRJasEkSpIkqQWTKEmSpBZMoiRJklowiZIkSWrBJEqSJKkFkyhJkqQWTKIkSZJaMImSJElqwSRKkiSpBZMoSZKkFkyiJEmSWjCJkiRJasEkSpIkqQWTKEmSpBZMoiRJklpYbhIVEatHxLkRcXFEXBYRh3RRMEmSpFG2ygDz3Ac8PzPvioh5wA8i4qTM/PGQyyZJkjSylptEZWYCd9W38+qQwyyUJEnSqBvomqiIWDkiLgJuBU7OzHOGWipJkqQRN1ASlZkPZeYOwCbAzhHx1P55ImJRRCyOiMVLliyZ5mJKkiSNlindnZeZS4HTgBdNMO2ozFyYmQvnz58/TcWTJEkaTYPcnTc/Itatr9cAXghcOeRySZIkjbRB7s7bCDguIlamJF1fzsxvD7dYkiRJo22Qu/N+AuzYQVkkSZLGhj2WS5IktWASJUmS1IJJlCRJUgsmUZIkSS2YREmSJLVgEiVJktSCSZQkSVILJlGSJEktmERJkiS1YBIlSZLUgkmUJElSCyZRkiRJLZhESZIktWASJUmS1IJJlCRJUgsmUZIkSS2YREmSJLWw3CQqIjaNiNMi4vKIuCwi3tpFwSRJkkbZKgPM8yDwzsy8ICLWAs6PiJMz8/Ihl02SJGlkLbcmKjN/lZkX1Nd3AlcAGw+7YJIkSaNsStdERcQCYEfgnKGURpIkaUwMnERFxGOBrwFvy8zfTjB9UUQsjojFS5Ysmc4ySpIkjZyBkqiImEdJoL6Qmf810TyZeVRmLszMhfPnz5/OMkqSJI2cQe7OC+BzwBWZedjwiyRJkjT6BqmJehZwIPD8iLioDi8ecrkkSZJG2nK7OMjMHwDRQVkkSZLGhj2WS5IktWASJUmS1IJJlCRJUgsmUZIkSS2YREmSJLVgEiVJktSCSZQkSVILJlGSJEktmERJkiS1YBIlSZLUgkmUJElSCyZRkiRJLZhESZIktWASJUmS1IJJlCRJUgsmUZIkSS2YREmSJLWw3CQqIo6JiFsj4tIuCiRJkjQOBqmJ+jzwoiGXQ5IkaawsN4nKzDOBOzooiyRJ0tjwmihJkqQWpi2JiohFEbE4IhYvWbJkulYrSZI0kqYticrMozJzYWYunD9//nStVpIkaSTZnCdJktTCIF0c/CfwI2CbiLghIt44/GJJkiSNtlWWN0NmvrqLgkiSJI0Tm/MkSZJaMImSJElqwSRKkiSpBZMoSZKkFkyiJEmSWjCJkiRJasEkSpIkqQWTKEmS
pBZMoiRJklowiZIkSWrBJEqSJKkFkyhJkqQWTKIkSZJaMImSJElqYZWZLoAkSerGgvd+Z6aL0Mp1h+4z00WYkDVRkiRJLZhESZIktTBQc15EvAg4AlgZODozDx1qqTRSrP7tnjGXpNG33JqoiFgZ+Fdgb+ApwKsj4inDLpgkSdIoG6Qmamfgmsy8FiAivgTsD1w+zIJNxl/okobBzxZJUzXINVEbA9c33t9Qx0mSJM1ZkZnLniHiFcCLMvMv6/sDgV0y8+C++RYBi+rbbYCrpr+4Q7c+cNtMF2KOMebdM+bdM+bdM+bdG+eYb5aZ86e60CDNeTcCmzbeb1LHPUpmHgUcNdUCjJKIWJyZC2e6HHOJMe+eMe+eMe+eMe/eXIz5IM155wFbRcTmEbEq8Crgm8MtliRJ0mhbbk1UZj4YEQcD36N0cXBMZl429JJJkiSNsIH6icrME4ETh1yWUTDWzZFjyph3z5h3z5h3z5h3b87FfLkXlkuSJOkP+dgXSZKkFkyiJEmSWjCJmkUiYrWImFdfx0yXZy6IiJXqX+PdkYhYtT6Oyrh3pH62rFZfG/Mh6sU3ItaIiPn1td/VHYiIx0bEgvp6oPPcAzMLRMTuEXEZ8D/A2wHSi92GJiLWioh3R8RPgH+uo/1fGqKI2CAiPhQRPwS+C/wNeJ4PU0Q8PiI+HhGnAqcCb4+I1Yz5cGVmRsQOwC+Bv53h4sx6EbFeRHw0Ir4DXAi8Dgb/bBno7jyNlvqrJDLzoYhYndJT/PuAM4HvRMS1wNf8sJs+NeYrZeaDlK4+NgKOB14DkJkPzWDxZqXmeU7p8Hcj4G3AL4BTI+LizDx1Bos46/Sd56sB84APAJcAZwOLgVNmroSzT6+WKTMfbozelvKjePMJpmkF9Z3nawHvBfbMzNOmui5/PY+RXvViZj7c+9LOzHspD4m+MDOXAv8X2IPy6B2toL6YP1hfLwU+DhwG3BcROzbn1YqZ6DwHrgHelZnnZeatwLnULxituEnO8+sz812ZeXZm3glcC9w7k+WcTfpi3p8kvQI4Abg3Iv64Ob/am+Q8/wVwWR2IiI2msk6TqBEUxcr97eC1mnfDiNgjIo6IiH0jYh3gB8BT62yXAfcBfrFPwYAxPzwi9q/jl9QPvkuAvers/j9NwRRivl9mLs3Mu+pTE6DUklj7N0VTiXljmTdExAOU56I9oesyj7upfrbUprxrgIuAWyi1UuDny8CmEPOX10mXAj+OiPOBf4mIRYNeh+ZBGQERsW5E7FMTIrJ4KDMfbiZBEfEaSpX6i4E/Ad4A3APcxCP/aEuAm4GNe+vqbk/GR8uYvwB4Yx3f+985A3hOt6UfTysQ8zfV8fMy8/6I2BnYDPiqPxKWbUVjXp0EPK6Oe1nvy14TW4GY/1WdtDVwU2b+HFgKHBQRB3nJwORWIOZ/WScdDhwKPAv4R+BPgZcNsm2viRoNT6FcW3MfcEpEbAP8ObALcFZE/AulGn034K2Z+a2IOAU4GgjgSuBFAJl5R13+pO53Y6xMFvOdgR8sI+bHQKkOrv+c5wDvruP8kFu2FY35A3U97waOzMy7ut6BMbRCMQfIzJvry8sj4gZg84hYyet0JtX28/xztaZ1a+DAiFgEPJbyGX/TDOzHOGl7nh8LkJmLKdf7AZwbEZcDGwxynlsT1ZFavThZvK+jVN8+qb7fg1Kj9G7gbuDvKCfHQuDi+ov8+5Tjty3wdWCHiNizLv/Euvyc1jLm7+GRmN/PH8a8d+dM79fOT4F7IuKwiHhjRGwwrP0ZB0OMea95elfKh+HiiNg/Ig6IiLWGtT/jYNjneWM7KwNbAVfO9QRqSJ/nAWxJuUPsUGBf4BnAeZRmvTl9ecaQzvOHJzjPV6FcU/zTQc5zk6iO1C/cyQ7IEuBXlF8gAMcB5wNvplQ37g6sWufbpfGL/E5g/8y8BzgEeH1E3A78pA5z2jTEfB5wK/DMvpi/GCAinhkRZ1C+WHYEHqBUv89ZQ4z5
S+rrv6b84jyaclfqPcDvpnk3xsoQY74XQEQcFBHnUa7RuYZS+zqnDfHz/KWZ+Z3MPDYzr6UkWydSr/+by5dnDPE83xsgIl4X5ZqoC4GrKDevLJfNedNsouq/mj1vAbweeCAzD2lOz8wHIuKXwE4R8URKxnwQ5Xqbv6fcBbYr8FnglbXdN4DbKNWYUGqj/ifLnWNzygAxvz8zP9Kc3hfzzSgfVgdRuoloxvxoSszXroveDmxfX/8SeFtmXjiUHRthHcf8DuDJ9fUxwD9n5o+HsmMjbAZi/vT6+kLg4Mycc8nTDHyeP61uY7XMvC8zfwN8boi7OHJm4Dx/Wn19CfCWqX62WBO1AiJipVrF/Xu9gx8RT43ShxOUA3gE5dfEcX3r6FXP/pJSk7Ex8Fxgncz8HPAgpYr35Zn5DcpJsC+lb4vPUKsv60V0S+s6V56t1b4tY3583zr6Y/4EHon50Uwe87UpMd+ybvemXgJVY/6ocs0WIxDzI6lddmTmKb0PuYnKNVuMWMzP7SVQfrYAw/0836pu977+sk3Tbo6UETnPt67bvaDx2TLweW5N1BRERDSrUyeqWoyI91L6+PgtcEZEHE/p7fcZwFcy87rm/I31/aoOO1BqlV4bEV+jHOivU04EKL9mLgL+mFJN+Yn+MuQsusDZmHdv1GPeK98yqvbHzhjF3PO848+Wyco2jkY95m3Oc5Oo5WhWLTYPfpRn1O0FvJLSrvoJSptrUu7CWAf4KrAu8ClKR3XLivftddgVOAo4mHLH3RmZeXljvtXrttYBvgN8a0X3cdQY8+6NU8yb5Rtnxrx74xTz2WKcYt7qPM9Mh8ZA6QV5EbDxBNM2Bl5SX+8JnAy8HNiujtuLcmHxKZQ7Kj5XT4bHUKp991vOtrcANp9k2sozHRtjPnsGY27MjbkxN+YrPlgTVTWy5fUpF7FeA9wYEc8D1sjMEynXCLw9Iq6iVBGuQrkY7Z66mvMpWfSbsnSU1lz/LcDTIuK0zLyztrcGjzwbjCx3YzSX6XVRnzmLqtF7jHn3jHn3jHn3jHn35mrMZ+XFaoPoBbcR5F7niVdT+uToPZfrOTzyWI+zKdWOT6B0Znk78BbgUxHRqxI8F9g/StfyL4zSf9CGlEez3Az8vlozG88Gi4gnRumJuf/Az4pqdDDmM8GYd8+Yd8+Yd8+YF3OiJir6npIdUS4ei3obaUSsRql+XCczPxYRNwNb1ANxEbB3RDw+M2+NiF9Rbok8OzNfUde3FqUNdlfgf1Nuw/wOpe+KbwB3ZeY3+8q0OrAP8HxgJ8rtrZ+u5Rz7fzRj3j1j3j1j3j1j3j1jPrlZmUT1H/Bs3AEQEY/LzNsj4vGUK/93y8xfR8T9wLr1YF5L6SNlY+BnlNsqn05pp72e8sydEyJiXcozdnagtNmeU0+SjwIf6j+Q8ej+L15I6Vn8/1F6AH6AMTZGMd8UY27MWzLm3RujmPt5PgfP81nZnJeliq+XMa8ZEc+PiCMj4mrg2IjYNTNvpXQVv0dd7GeUJ8NvVV/fT3mkytWU6sd96nxrU9p7NwI2qPP/F3BgXSeZ+UDN0h/VB0bzRMzMb2XmpzLzknH/h4OxivnhxtyYt2XMuzdGMffzfA6e57MuiYqIdaI8T+uLEfEMysH6B8qV+VsDPwL+KiKeBJzGI22111HaWreitOfeBmybmfdTekneISIupVzI9jbgqsz8QWYuysyvZeZv+8uSjfba2cyYd8+Yd8+Yd8+Yd8+YT81YNOdF/L799VEddU0w30rAhylViGdSDupKwJWU7t0B/pPS3rorcDpwIEBmXhMRuwB3ZuYJEXE9sH1ErJ2ZV0fEn/Wy5Am2+agMeTYw5t0z5t0z5t0z5t0z5sMzsklURPRuXXy4d9B7fyNia+C2zLyj76R4DrB7Zj6jsZ7VgMWU9moy87qI2AK4NDPPjdK9+z8Cj6O01d4T5YK16ykXq20BXNQ7+P0HfFwP/ESMefeMefeMefeMefeMeTdG
JomqgW3295CU/iKIUm24PuVK/S/VRS4B/qIvq76D2t9ElN5QH85y58B1wKKI+EJmXky5GK2XVb8a2L+u7xuZeWdd/hbKQwy3AC7qnWjjfsCbjHn3jHn3jHn3jHn3jPnMmLEkKvqe1Nwf2ChPtn4L5UK1fYF7KX1EvCwzr4+In0bETpl5QWOx24H7IuJZmfnDup5evxW/AD4Z5Y6C04AL6nYvBi5ubLeXld8AfJ96ovSdaGPJmHfPmHfPmHfPmHfPmI+GzpOoegD2AL5S3/faavcA9qZkyx/MzJsiYn/g8szcKSJ2pLTFPrau6jRgt4i4KB+pFrwxIi4A3lrX9zxKu+7hwI+BlTLzoxOUaaJqz/uBH05/BLpnzLtnzLtnzLtnzLtnzEfL0O/Oi9r+2ZOlXXQR8KqIeA+wdkRsCfw5JZs9ETgsIjYGvgv8qh6gGyhPct6lruocYHtgzYiYFxF71/F/R+lwaz3gMODjwN2UWyy3qGWKZrmymDVVjMa8e8a8e8a8e8a8e8Z8tA09ieoFNiI2i4jdI2InypX/hwBPohyYtwE3AXdRrvRfSOk/4hJgE2ANykMJr6JcqAal+/idKQd6deD5EbF6Zt6fmWdl5jsz88Qs/U08RKlW/GQt06w+4Ma8e8a8e8a8e8a8e8Z8xOWKPa05mOTJyJSDtgbwx8AZlIP5QWBD4F3AJxvzvp/SbvphYD9gtTr+iZQDt2V9/xLgrN42gZcCa06y/ZUoVY8rtI+jNhhzY27Mjbkxnx2DMR//YbpPiHXq37WBzwAHAK8B/rFvvh2B7wELKNdlvRA4s2+e59a/ZwP719ePA7bpHeAJTsaY6YB2fgCNuTGfA4MxN+ZzYTDm4ze0urC8cSHb1sArgFUp1YhbU9pqN6gnwSmULPoNtf30FuDqzPxmlK7c52fmdcDJEfG3EfHPlKrFnYCvU7LvN1Cew0Nm3k65e4Dsq0rMehbMVsa8e8a8e8a8e8a8e8Z89oi2cYuIJwOfB06mHOhbgeOBv6C0u14BbJeZ90bEQsodAc8EXkd5hs6rKc/OeTzw2cw8OSL+jPIQwlMy8/oV2K9ZyZh3z5h3z5h3z5h3z5jPDivSxcGWwDXAccCNmfm7iDgUeAfwbeBrwAYR8cvMXAy/73xrG2AecCSlqnIppQ8KMvOEFSjPXGDMu2fMu2fMu2fMu2fMZ4EVqYlaCziW0iPpSpRbIA+jdBt/CHBCZv5NRKxBqa78ICXTPgH4l8mqDqOvAzE9wph3z5h3z5h3z5h3z5jPDq2TqEetpFRL/gXlqc1HAkcAG2XmiyMiKJ1/PZCZSydYdmVK1/K2x06BMe+eMe+eMe+eMe+eMR9frZvz6oF9AvA0SuddOwJvzsy7IuJcYL2IWDlL/xJLGsusVMcB0HytZTPm3TPm3TPm3TPm3TPms0PrzjZr1rsp8CbgQeA9mfnTiNgKOAi4IDMfqgf998t4wNsz5t0z5t0z5t0z5t0z5rPDtDTnPWqF5e6A7YAjstxOqSEz5t0z5t0z5t0z5t0z5uNlhZOoXvUiJUn2YrYOGPPuGfPuGfPuGfPuGfPxNu01UZIkSXPB0B9ALEmSNBuZREmSJLVgEiVJktSCSZQkSVILJlGSJEktmERJkiS18P8B/+9LhSrveYYAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 720x216 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "for i,row in df_heatmap.iterrows():\n",
+ " ax = row.plot(kind='bar', figsize=(10,3),title=f'{i}', rot=15)\n",
+ " ax.set_ylim(np.min(df_heatmap.values),np.max(df_heatmap.values))\n",
+ " plt.show()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Lastly, an alternative way to visualize the distances between the three groups of points from the heatmap would be a series of scatter plots, one for each pair of features. \n",
+ "\n",
+ "This should give a good intuition for the distance measures (in 6-dimensional feature space: the latest preprocessed observation plus the 5 lagged values) that underpin the raw anomaly score.\n",
+ "\n",
+ "Generally we would expect to see the blue 'normal' points group closer together and 'near' the green cluster centroids, while the anomalous orange points should tend to be further away from the other two groups."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 432x288 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 432x288 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 432x288 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ "<Figure size 432x288 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ },
+ {
+ "data": {
+ "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYkAAAEKCAYAAADn+anLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjUuMSwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy/YYfK9AAAACXBIWXMAAAsTAAALEwEAmpwYAAAm9klEQVR4nO3de5xVdb3/8deb4eYAKcLkhYtomeYVdbxbYinhlTp2CstS0+PJ1LLbSatfdkyPdjwdS7soR4lS09LSqDTAe5okg3hDRJE0QVQUBBEEZvj8/lhrYjPMntl7Zq9Zm5n38/HYj1nru9Z37c9ebOYza32/6/tVRGBmZtaaXnkHYGZm1ctJwszMinKSMDOzopwkzMysKCcJMzMryknCzMyKyjRJSBoh6V5JT0uaI+lLrewjSVdKmi/pCUn7Fmw7RdJz6euULGM1M7NNKcvnJCRtB2wXEY9KGgTMAj4aEU8X7HMMcC5wDHAg8KOIOFDS1kADUA9EWne/iFiWWcBmZraR3lkePCIWA4vT5bckzQWGAU8X7DYe+GUk2WqGpK3S5DIGmB4RSwEkTQfGATcVe7+hQ4fGqFGjsvgoZmbd1qxZs16PiLrWtmWaJApJGgXsA/ytxaZhwEsF6wvTsmLlRY0aNYqGhoZOx2pm1pNIerHYti5puJY0EPgtcF5ErKjwsc+U1CCpYcmSJZU8tJlZj5d5kpDUhyRB3BgRv2tll0XAiIL14WlZsfKNRMTEiKiPiPq6ulavlszMrIOy7t0k4DpgbkT8b5HdpgCfTXs5HQQsT9sypgJjJQ2WNBgYm5aZmVkXybpN4lDgM8CTkh5Ly74JjASIiKuBO0h6Ns0HVgGnpduWSvoeMDOtd1FzI7aZmXWNrHs3PQionX0COLvItknApAxCMzOzEnRZ7yYz64bWvgmLpwKC7cZC361yDsgqzUnCzDpm1UL4cz00rgICeg+EcbOgdvu8I7MK8thNZtYxs78Ba16HxregcWWy/Nj5eUdlFeYkYWYd8/aLEE0b1qMR3n4ht3AsG04SZtYx2x4JNbUb1mtqYduj8ovHMuEkYWYds8e3YcTHQDXJa8SJsPsFeUdlFeaGazPrmF694ZAb4MDrAEFN37wjsgw4SZhZ59T0yzsCy5BvN5mZWVFOEmZmVpSThJmZFeUkYWZmRTlJmJlZUU4SZmZWlJOEmZkV5SRhZmZFOUmYmVlRmT5xLWkScBzwWkTs0cr2rwOfLojl/UBdOnXpC8BbQBPQGBH1WcZqZmabyvpKYjIwrtjGiLg8IkZHxGjgAuD+FvNYH5Fud4IwM8tBpkkiIh4Alra7Y+Ik4KYMwzEzszJVRZuEpFqSK47fFhQHME3SLEln5hOZmVnPVi2jwB4PPNTiVtNhEbFI0ruB6ZKeSa9MNpImkDMBRo4c2TXRmpn1EFVxJQFMoMWtpohYlP58DbgNOKC1ihExMSLqI6K+rq4u80DNzHqS3JOEpC2Bw4HfF5QNkDSoeRkYCzyVT4RmZj1X1l1gbwLGAEMlLQQuBPoARMTV6W4fA6ZFxNsFVbcBbpPUHOOvIuLPWcZqZmabyjRJRMRJJewzmaSrbGHZAmDvbKIys4p57QGYdyUg2OVL8O7D8o7IKqxaGq7NbHPz6r1w33HQtCpZf/lPMOZO2ObwfOOyisq9TcLMNlNzLtuQIACaVsPT388vHsuEk4SZdUw0tlK2ruvjsEw5SZhZx+zyRaip3bBeU5u0S1i34iRhZh0zfDwccj0MORCGHASH3AjDjss7KqswN1ybWceN+JfkZd2WryTMzKwoJwkzMyvKScLMzIpykjAzs6KcJMzMrCgnCTMzK8pJwszMinKSMDOzopwkzMysKCcJMzMryknCzMyKyjRJSJok6TVJrc5PLWmMpOWSHktf3ynYNk7SPEnzJZ2fZZxmJYmAVQvhnSV5R2LWZbIe4G8y8GPgl23s85eI2GjoSEk1wE+Ao4CFwExJUyLi
6awCNWvT2mVw95GwYi5EE4z8Vzj4lyBfjFv3luk3PCIeAJZ2oOoBwPyIWBARa4GbgfEVDc6sHI+cBcufSmZfW78WXroNnvtZ3lGZZa4a/gw6WNLjku6UtHtaNgx4qWCfhWmZWT7eeCRJDs2aVsGSv+YXj1kXyTtJPArsEBF7A1cBt5d7AElnSmqQ1LBkie8VW0YGvRdUs2G9V3/Y8v35xWPWRXJNEhGxIiJWpst3AH0kDQUWASMKdh2elrV2jIkRUR8R9XV1dZnHbD3UAROh31Do8y7oPRC22gN2/WreUZllLteZ6SRtC7waESHpAJKk9QbwJrCzpB1JksME4FO5BWo2cBQc/1xy26lXPxh6EPTyxI7W/ZX0LU9/gUdEzJS0GzAOeCb967+tejcBY4ChkhYCFwJ9SA52NfBx4CxJjcBqYEJEBNAo6RxgKlADTIqIOR35gGYV02cQbPvhvKMw61JKfie3sYN0IXA0SUKZDhwI3EvSPXVqRFySdZClqq+vj4aGhrzDMDPbrEiaFRH1rW0r5Uri48BooB/wCjA8IlZI+h/gb0DVJAkzM6usUhquGyOiKSJWAc9HxAqAiFgNrM80OjMzy1UpSWKtpNp0eb/mQklb4iRhZtatlXK76YMRsQYgIgqTQh/glEyiMjOzqtBukmhOEJK2brFpPfBMFkGZmVl1KOdhukeBJcCzwHPp8guSHpW0X5s1zcxss1ROkpgOHBMRQyNiCEm32D8CXwB+mkVwZmaWr3KSxEERMbV5JSKmAQdHxAyS7rFmZtbNlDOuwGJJ3yAZthvgk8Cr6dwP7uVkZtYNlXMl8SmSgfZuT18j07Ia4BOVDszMzPJX8pVERLwOnFtk8/zKhGNmZtWk5CQhqQ74D2B3oH9zeUR8KIO4zMysCpRzu+lGkucidgT+E3gBmJlBTGZmViXKSRJDIuI6YF1E3B8RnwN8FWFm1o2V07tpXfpzsaRjgZeBlk9hm5lZN1JOkrg4HdTvqyTzUb8L+HImUZmZWVUop3fTH9PF5cARpdSRNAk4DngtIvZoZfungW8AAt4CzoqIx9NtL6RlTSTDlbc6IYaZ9SCrX4FFfwDVwPDx0G9I3hHlb+0yeOl2iHWw/XFQu31FD99ukpB0FVB0+rqI+GIb1ScDPwZ+WWT734HDI2KZpKOBiSQz3zU7Iu16a11t5QJ49X7o8y4YdjzU9M07IuvpVjwLUw+E9WuT9ccugKMfhdph+caVp9WvwJ37wLoVyfrsr8PYh2HL3Sr2FqVcSXR4PtCIeEDSqDa2/7VgdQbJw3qWt9cegHuPSZYlGLQzjP0r1PRvu55ZlmZ/Nf1lmA7wsH4NPPEdOOi6XMPK1VMXw5rXIRqT9SbBrPPgQ9Mq9halDBX+i1IOJOmqiCj2sF0pTgfuLHxrYJqkAK6JiImdOLaVY8Zp0PT2hvUVz8CCybDz53MLyYzVi9loBKBogtUv5xZOVVi1cEOCACDS81Q55XSBbc+hHa0o6QiSJPGNguLDImJfktFmz5b0wSJ1z5TUIKlhyZIlHQ3BCq1pcYev6R3/Z7T8bX8s1NRuWK+pTcp6su2PhZoBG9ZrtoDtj6noW1QySXSIpL2Aa4HxEfFGc3lELEp/vgbcBhzQWv2ImBgR9RFRX1dX1xUhd391h0GvgjaImi3g3YfnF48ZwB7/D3aYAOqTfD93/jy87+y8o8rXe8+AXc5Nzod6w4iPw94XV/QtyukCW3GSRgK/Az4TEc8WlA8AekXEW+nyWOCinMLseQ6+Hh74KLz+UPIfcu//gm0/nHdU1tP16p20Pxx4bbIu5RtPNZBg9KXJ/1ECVPm/+yuZJDb5F5N0EzAGGCppIXAhydzYRMTVwHeAIcBPlfyDN3d13Qa4LS3rDfwqIv5cwVitLf22hqMegKa10KuP/zNadfH3cVMSrfwKroiSkkQ6Z8T3I+Jrbez2o5YFEXFSW8eNiDOAM1opXwDsXUpsliF3ezXr8Uq6NomIJuCw
dvaZXImAzMysepRzu2m2pCnALcA/+0dGxO8qHpWZmVWFcpJEf+ANNh75NUgans3MrBsqZ+ym07IMxMzMqk/J/aUkvU/S3ZKeStf3kvTt7EIzM7O8ldOp9v+AC0jnlYiIJ4AJWQRlZmbVoZwkURsRj7Qoa2x1TzMz6xbKSRKvS3oP6bDhkj4OVHYkKTMzqyrl9G46m2S+h10lLSKZC+LkTKIyM7OqUE7vpgXAkYXjKmUXlpmZVYNyejd9SdK7gFXAFZIelTQ2u9DMzCxv5bRJfC4iVpCMyDoE+AxwWSZRmZlZVSgnSTQPMXgM8MuImENWww6amVlVKCdJzJI0jSRJTJU0iI3mEjQzs+6mnN5NpwOjgQURsUrSEMBDdZiZdWPlJInmocL3kif9MDPrEcpJEl8vWO5PMuf0LDYeFdbMzLqRktskIuL4gtdRwB7AsrbqSJok6bXmQQFb2S5JV0qaL+kJSfsWbDtF0nPp65RS4zQzs8rpzKzZC4H3t7PPZGBcG9uPBnZOX2cCPwOQtDXJfNgHklyxXChpcCdiNTOzDij5dpOkq0jHbSJJLqOBR9uqExEPSBrVxi7jSbrTBjBD0laStgPGANMjYmn63tNJks1NpcZrZmadV06bREPBciNwU0Q81Mn3Hwa8VLC+MC0rVm5mZl2onLGbfpFlIB0l6UySW1WMHDky52jMzLqXdpOEpAvZcJupLfdFxANlvv8iYETB+vC0bBHJLafC8vtaO0BETCQZnZb6+vpS4jQzsxKVciXxQonHerMD7z8FOEfSzSSN1MsjYrGkqcB/FTRWjyWZFc/MzLpQu0miM7eZJN1EckUwVNJCkh5LfdLjXg3cQTLMx3yS0WVPS7ctlfQ9YGZ6qIuaG7HNzKzrlNO76TutlUfERcXqRMRJbR0z7dV0dpFtk4BJpcZnZmaVV07vprcLlvsDxwFzKxuOmZlVk3J6N/2gcF3S/wBTKx6RmZlVjc48cV1L0uvIzMy6qXLaJJ5kQ1fYGqAOKNoeYWZmm79y2iSOK1huBF6NiMYKx2NmZlWknDaJF9NRWg8juaJ4EJidVWBmZpa/ktsk0i6wvwCGAEOByZK+nVVgZmaWv3JuN30a2Dsi3gGQdBnwGHBxBnGZmVkVKKd308skz0c060cyxpKZmXVT5VxJLAfmpHM7BHAU8IikKwEi4osZxGdmZjkqJ0nclr6a3VfZUMzMrNqUkyRuBd6JiCYASTVAv4hYlUlkZmaWu3LaJO4GtihY3wK4q7LhmJlZNSknSfSPiJXNK+lybeVDMjOzalFOkng7fZgOAEn7AasrH5KZmVWLctokzgNukfQyIGBb4JNZBGVmZtWhnGE5ZkraFdglLZoXEeuyCcvMzKpBWUOFR8S6iHgqfa2TtG17dSSNkzRP0nxJ57ey/QpJj6WvZyW9WbCtqWDblHJiNTOzzivndlNrrgOOLbYx7Sb7E5IH7xYCMyVNiYinm/eJiC8X7H8usE/BIVZHxOhOxmhmZh3UmUmHiIiiCSJ1ADA/IhZExFrgZmB8G/ufBNzUmZjMzKxyyrqSkDQYGFFYLyIebaPKMOClgvWFwIFFjr0DsCNwT0Fxf0kNJPNXXBYRt5cTr5mZdU45M9N9DzgVeJ4NM9QF8KEKxTIBuLX5ie7UDhGxSNJOwD2SnoyI51vEdSZwJsDIkSMrFIqZmUF5VxKfAN6T3jYq1SKSK49mwyk+cuwE4OzCgohYlP5cIOk+kvaK51vsMxGYCFBfXx+YmVnFlNMm8RSwVZnHnwnsLGlHSX1JEsEmvZTSrrWDgYcLygZL6pcuDwUOBZ5uWdfMzLJTzpXEpcBsSU8Ba5oLI+KEYhUiolHSOcBUoAaYFBFzJF0ENEREc8KYANwcEYVXAu8HrpG0niSZXVbYK8rMzLKnjX8vt7GjNAe4BngSWN9cHhH3ZxNa+err66OhoSHvMMzMNiuSZkVEfWvbyrmSWBURV1YoJusKr94Lz/wQEOz6Zdjm8Lwj
MrPNTDlJ4i+SLiVpUyi83dRWF1jLyyt3wf3joSmd7uOV6TDmT7DNmFzDMrPNSzlJovlJ6IMKyirZBdYqac73NyQISJaf/m8nCTMrSzkD/B2RZSBWaU2bFkUrZWZmbSi5C6ykbSRdJ+nOdH03SadnF5p1yi5fhpqCOaFqtoBdz8stHDPbPJXznMRkkq6s26frz5LMMWHVaPjxcOivoO5QqPsAHPYb2P7ovKMys81MOW0SQyPiN5IugH8+A+H7F9Vs+PjkZWbWQeVOXzqEdNwmSQcByzOJyszMqkI5VxJfIen++h5JDwF1wL9mEpWZmVWFcpLEHOBwkulLBcyjk/NRmJlZdSvnl/zDEdEYEXOapy+lYEA+MzPrftq9kkjnsR4GbCFpH5KrCIB3AbVFK5qZ2WavlNtNHyGZbGg48AM2JIm3gG9mE5aZmVWDdpNERPwC+IWkEyPit10Qk5mZVYly2iSGS3qXEtdKelTS2MwiMzOz3JWTJD4XESuAscAQ4DPAZZlEZWZmVaGcJNHcFnEM8MuImFNQZmZm3VA5SWKWpGkkSWKqpEEUzFBXjKRxkuZJmi/p/Fa2nyppiaTH0tcZBdtOkfRc+jqljFjNzKwCynmY7nRgNLAgIlalQ3Sc1lYFSTXAT4CjgIXATElTWpmr+tcRcU6LulsDFwL1JEOBzErrLisjZjMz64RyriRuAbYDVgBExBsR8UQ7dQ4A5kfEgohYC9wMlDri3EeA6RGxNE0M04FxZcRrZmadVE6S+BnwKeA5SZdJ2qWEOsOAlwrWF6ZlLZ0o6QlJt0oaUWZdMzPLSMlJIiLuiohPA/sCLwB3SfqrpNMk9elEDH8ARkXEXiRXC78op7KkMyU1SGpYsmRJJ8IwM7OWyhqgL22HOBU4A5gN/IgkaUwvUmURMKJgfXha9k/pbas16eq1wH6l1k3rT4yI+oior6urK+fjmJlZO8qZvvQ24C8k4zUdHxEnRMSvI+JcYGCRajOBnSXtKKkvMIFkuPHC425XsHoCMDddngqMlTRY0mCS5zOmlhqvmZl1Xjm9m24C/hwRKyR9W9K+wMUR8WhE1LdWIZ297hySX+41wKSImCPpIqAhIqYAX5R0AtAILCW5UiEilkr6HkmiAbgoIpZ25EOamVnHKCJK21F6IiL2knQYcDFwOfCdiDgwywDLUV9fHw0NDXmHYWa2WZE0q9gf++W0STTPZ30sMDEi/gT07WxwZmZWvcpJEoskXQN8ErhDUr8y65uZ2WamnF/ynyBpW/hIRLwJbA18PYugzMysOpTccB0Rq4DfFawvBhZnEZSZmVUH3y4yM7OinCTMzKwoJwkzMyvKScLMzIpykjAzs6KcJMzMrCgnCTMzK8pJwszMinKSMDOzopwkzMysKCcJMzMryknCzMyKyjxJSBonaZ6k+ZLOb2X7VyQ9LekJSXdL2qFgW5Okx9LXlJZ1zcwsW+VMX1o2STXAT4CjgIXATElTIuLpgt1mA/URsUrSWcB/k8xZAbA6IkZnGWMeHnkEHn4YttsOTjwRamryjsjMKmbtMvjHb2H9Gtj+WBg4Ku+IOiXTJAEcAMyPiAUAkm4GxgP/TBIRcW/B/jOAkzOOKVfXXQdf/CI0NUHv3nDNNTBtmhOFWbfwzmtwx2hYtxxYD4+dD0c+AFvvk3dkHZb17aZhwEsF6wvTsmJOB+4sWO8vqUHSDEkfzSC+LrV+PZxzDqxaBWvWwNtvJ1cVU6fmHZmZVcTT34c1r0PTKmh6BxpXQsO5eUfVKVlfSZRM0slAPXB4QfEOEbFI0k7APZKejIjnW9Q7EzgTYOTIkV0Wb0esWQPr1m1a/vrrXR+LmWVg9WKIFv/J17yWTywVkvWVxCJgRMH68LRsI5KOBL4FnBARa5rLI2JR+nMBcB+wyTVbREyMiPqIqK+rq6ts9BW2xRaw++4b31pavx4OOSS/mMysgoYdDzW1G9ZrtkjaJTZjWSeJmcDOknaU1BeYAGzUS0nSPsA1
JAnitYLywZL6pctDgUMpaMvYXN15J+y7b5Iohg6FW2+F974376jMrCJ2mAC7X5Akh159YMS/wOjL8o6qUxQR2b6BdAzwQ6AGmBQRl0i6CGiIiCmS7gL2ZMN82f+IiBMkHUKSPNaTJLMfRsR1bb1XfX19NDQ0ZPVRKioCpLyjMLPMbEb/ySXNioj6VrdlnSS60uaSJJatXsbc1+ey7cBt2WnwTnmHY2Y9XFtJomoarvO0ciU8+yy8+90wfHi27/XgPx7k6BuPgfW9WBdr+PJB53HpkZe2W2/NGnjmGRg4EHbaabP5A8XMNnM9fliORx+FkSPhiCNg553hgguye6+I4LgbPsrKtW+xsnE5a5re4QcPXcnDLz3cZr0XX0zaLT7wAdhzTzjppKTB28wsaz0+SYwfD8uWwYoV8M47cNVV8MAD2bzXqnWrWb7mzY3K1q0Vdzwyr816J58MixfDW2/B6tXwhz/A9ddnE6OZWaEenSSammBRiw6569fD01n1oVq3BawasnGZAr2+W5vV5s5NYm22ahU8/ngG8ZmZtdCjk0RNTTJ+UqFevWDXXbN5v9paseUdf4TVg+GdQdDYj94Pf5Nj9j6gzXq77JLEteE4yW0nM7Os9fiG69//Ho46KumttmYNnHUWjBlT2fdYuXYlW/TegppeNdzxf/sz7viXWL/VfBqXb8N/fGFbDjqo7fo33ACHHZY0sDc2wrhxcMoplY3RzKw1PT5J1NcnDcPPPJP0bho1qnLHXrRiEeNuGMczbzxDL/XiR+N+xOcP+TwvLRjAvHl7s+22SaN5e3bcEebPT26DDRwI73ufezeZWdfwcxIZ2v//9mf24tk0RdKgUNunlrs/ezcHDW/n0sHMrAu19ZxEj26TyFphggBoWt/EjIUzcozIzKw8ThIZGlK7cU+m3r16s/aN7Rk0KLldVFMDl7b/HJ2ZWW56fJJYuhTGjoUBA2DECJg+vfS68+bBXntt6G00d+7G26//2PXU9qllUN9BDOw7kENGHMKFnziRlSuT7evXwze/CffcU7nPY2ZWST2+TeKDH4QZMzbM81BbC7NnJ43DbVm9OmnkXrJkwzheQ4fC3/+eJJxmC5Yt4OGXHmZo7VD2G3wUdUM3zcv/9m8wcWJZYZuZVYzHbiqiqQkeemjTIS7uv7/9JDF3bpIomnNscxfauXOTHlPNdhq80z8H8WtsbP1Y22/fwQ9gZpaxHn27qVcv6N9/07Kttmq/7uDBm84yt25d23V794ZPfWrjsi23TG45mZlVox6dJCS44orkFlNNTfJzl12S8Zzas+OOyZhKAwYkiWXAAJgwof0JhG68Ea65Bo47Dr7wBXjlFejbtzKfx8ys0np8mwTAgw8mg/pts03yi79fv9LqRcDttycPub3//fCxj/khNzPb/HjSITMzKyrXh+kkjZM0T9J8See3sr2fpF+n2/8maVTBtgvS8nmSPpJ1rKWKgOuug0MPTcZReuSRvCMyM8tGpklCUg3wE+BoYDfgJEktx8U+HVgWEe8FrgC+n9bdDZgA7A6MA36aHi93V10FD954AxeN+TCf3+2jnPfZ2TzxRGl1Gxvhkkvg8MPhs5+Fl1/ONtYOiYDnroG7PgQPnAjL57Zfx8y6pay7wB4AzI+IBQCSbgbGA4UzNowHvpsu3wr8WJLS8psjYg3wd0nz0+O1PY1bF3j1oZ/x45O/xoD+q1i/Ho7c4y5+cvMj7LVX2/NCAJxxBtxySzInRO/eMG1aMrhgKT2qusycS2HOJdC0ChC8Mh2OeQIGjso7MjPrYlnfbhoGvFSwvjAta3WfiGgElgNDSqybi3//wOUM6L8KSHo21fZ7mwOHXtduvXXrkmG/VyVVaWyEt9+GqVOzjLYD5l2RJgiAgKbV8OJNuYZkZvnY7LvASjpTUoOkhiVLlnTJew7eatPG/j337HgHgKrrO9BaQFUXpJl1hayTxCJgRMH68LSs1X0k9Qa2BN4osS4RMTEi6iOivq6u
roKhFzdo/6/QSO2GGHoNYMj+n2u3Xp8+ybMUtWnV5mczPlI1TfKp950LNc2fT1DTH0ZNyDUkM8tH1m0SM4GdJe1I8gt+AtDimWOmAKeQtDV8HLgnIkLSFOBXkv4X2B7YGaiOfkTvO4fevQfCgp9Dn0HU7Pld2GqPkqpOnpw8cDdtWjLh0OWXJ09vV5U9vwP9hsA/boa+Q2DvS2DgTnlHZWY5yPw5CUnHAD8EaoBJEXGJpIuAhoiYIqk/cD2wD7AUmFDQ0P0t4HNAI3BeRNzZ1nv5OQkzs/L5YTozMyvKM9OZmVmHOEmYmVlRThJmZlaUk4SZmRXlJGFmZkV1q95NkpYAL3biEEOB1ysUTlYcY2U4xspwjJWTZ5w7RESrTyN3qyTRWZIainUDqxaOsTIcY2U4xsqp1jh9u8nMzIpykjAzs6KcJDY2Me8ASuAYK8MxVoZjrJyqjNNtEmZmVpSvJMzMrKgekSQkjZM0T9J8See3sr2fpF+n2/8maVTBtgvS8nmSMpv5oYQYvyLpaUlPSLpb0g4F25okPZa+pmQVY4lxnippSUE8ZxRsO0XSc+nrlBxjvKIgvmclvVmwLfNzKWmSpNckPVVkuyRdmcb/hKR9C7Z11TlsL8ZPp7E9KemvkvYu2PZCWv6YpMxG3CwhxjGSlhf8e36nYFub35EujvPrBTE+lX4Ht063dcm5bFNEdOsXyRDlzwM7AX2Bx4HdWuzzBeDqdHkC8Ot0ebd0/37AjulxanKK8QigNl0+qznGdH1lFZ3LU4Eft1J3a2BB+nNwujw4jxhb7H8uyRD2XXYugQ8C+wJPFdl+DHAnIOAg4G9deQ5LjPGQ5vcGjm6OMV1/ARhaBedxDPDHzn5Hso6zxb7Hk8yp06Xnsq1XT7iSOACYHxELImItcDMwvsU+44FfpMu3Ah+WpLT85ohYExF/B+anx+vyGCPi3ohonnh6BslMfV2tlHNZzEeA6RGxNCKWAdOBcVUQ40lAl07gHREPkMydUsx44JeRmAFsJWk7uu4cthtjRPw1jQFy+j6WcB6L6cz3uGxlxtnl38f29IQkMQx4qWB9YVrW6j4R0QgsB4aUWLerYix0Oslfms36K5nne4akj2YQX7NS4zwxvRVxq6TmKWir7lymt+x2BO4pKO6qc9mWYp+hq85huVp+HwOYJmmWpDNziqnZwZIel3SnpN3Tsqo8j5JqSZL+bwuKcz+XWU9fahUm6WSgHji8oHiHiFgkaSfgHklPRsTz+UTIH4CbImKNpH8nuUL7UE6xtGcCcGtENBWUVdO5rHqSjiBJEocVFB+WnsN3A9MlPZP+Nd3VHiX591ypZIbM20mmQa5WxwMPRUThVUfu57InXEksAkYUrA9Py1rdR1JvYEvgjRLrdlWMSDoS+BZwQkSsaS6PiEXpzwXAfSRTwWah3Tgj4o2C2K4F9iu1blfFWGACLS7tu/BctqXYZ+iqc1gSSXuR/BuPj4g3mssLzuFrwG1kc4u2XRGxIiJWpst3AH0kDaXKzmOBtr6P+Z3LPBtEuuJFcrW0gOS2QnMj1e4t9jmbjRuuf5Mu787GDdcLyKbhupQY9yFpbNu5RflgoF+6PBR4jowa4UqMc7uC5Y8BM9LlrYG/p/EOTpe3ziPGdL9dSRoFldO5HEXxBtdj2bjh+pGuPIclxjiSpI3ukBblA4BBBct/BcblFOO2zf++JL9c/5Ge05K+I10VZ7p9S5J2iwF5ncuisXX1G+bxIukt8mz6S/ZbadlFJH+RA/QHbkm/9I8AOxXU/VZabx5wdI4x3gW8CjyWvqak5YcAT6Zf9CeB03M+l5cCc9J47gV2Laj7ufQczwdOyyvGdP27wGUt6nXJuST5a3ExsI7kfvjpwOeBz6fbBfwkjf9JoD6Hc9hejNcCywq+jw1p+U7p+Xs8/R58K8cYzyn4Ls6gIKG19h3JK850n1NJOskU1uuyc9nWy09cm5lZUT2hTcLMzDrIScLMzIpykjAzs6Kc
JMzMrCgnCTMzK8pJwszMinKSsG4nHa58+7zjaEsa43dzet8fd6J+m8NeW/fjJGHd0alAVSeJrKXDy2RhMhmNPGvVyUnCqo6kAZL+lI7e+ZSkT0q6vWD7UZJuk1QjaXK6z5OSvizp4yQDIN6YTtSyhaT9JN2fjqQ5NR12G0n3KZmAqEHSXEn7S/pdOqHPxUVi2z+dZOdxSY9IGpT+df779HjPSbow3XdU4V/ckr7W2tVDy7/uJf0xnTBnk8+Xbn+PpD+nn+cvknZNyydLulrS34D/LuE8H69kkq3Zku6StE1aXidpuqQ5kq6V9GI65hHR8eG5bTPlUWCtGo0DXo6IYwEkbQn8p6S6iFgCnAZMAkYDwyJij3S/rSLiTUnnAF+LiAZJfYCrSAahWyLpk8AlJMNbAKyNiHpJXwJ+TzIg4VLgeUlXRMHAdZL6Ar8GPhkRMyW9C1idbj4A2ANYBcyU9Cfg9U6eh00+X1o+kWRIh+ckHQj8lA0j7Q4nGX6iifY9CBwUEaFkBsH/AL4KXEgy8c2lksaRDCNhPZSThFWjJ4EfSPo+ycxif5F0PXCypJ8DBwOfBQYBO0m6CvgTMK2VY+1C8st7uiRIZiVbXLC9eYrSJ4E5EbEYQNICkpFC3yjYdxdgcUTMhGSU0XRfSCYDeiNd/x3J0Nm3d+IcQDII3UafT9JAkjGmbknfF5IBKJvdUmKCgCSh/Dq9supLMmAgaewfA4iIP0taVqS+9QBOElZ1IuJZJfM6HwNcLOlukgHl/gC8Q/KLsBFYpmRu5Y+QDJj2CTZcITQTyS//g4u8XfOw5usLlpvXy/n/0XIQtAAa2fiWbv8idVvdLyJa+3znAW9GxOgix3q7jJivAv43IqZIGkMy6KHZRtwmYVUn7Zm0KiJuAC4H9o2Il4GXgW8DP0/3Gwr0iojfpuX7pod4i+QqA5LRe+skHZzW6aMNM5SVax6wnaT902MNKmggPkrS1pK2AD4KPEQyau+7JQ2R1A84rshxXwBGS+qlZCa/A4p9vvTq5e+S/jXdR2ki6Ygt2TCPwikF5Q+RJCQkjSUZltx6KF9JWDXaE7hc0nqS4ZXPSstvBOoiYm66Pgz4uaTmP3YuSH9OBq6WtJrk1tTHgSvTto3ewA9Jhl4uiaQ7gDMi4uW0TeOqNBmsBo5Md3uEZNrJ4cANEdGQ1r0o3bYIeKbIWzxEcqvnaWAuyYxqbX2+TwM/k/RtoA/JHM2Pl/p5CnyX5LbVMpIpXHdMy/8TuEnSZ4CHgVdIEi+SbgLGAEMlLQQujIjrOvDetpnwUOG22Uh7AM2utl9Kkk4lmfPhnDLrjIqI72YUVoelVz1NEdGYXoH9rI3bW9bN+UrCNguSZpHcb/9q3rH0ACOB36RXMGuBf8s5HsuRryTMciBpNLBVRNyX0fFPA77UovihiDg7i/ez7stJwszMinLvJjMzK8pJwszMinKSMDOzopwkzMysKCcJMzMr6v8Dn1LqrOxHXscAAAAASUVORK5CYII=\n",
+ "text/plain": [
+ "<Figure size 432x288 with 1 Axes>"
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "# get list of feature pair combinations\n",
+ "feature_pairs = list(itertools.combinations(df_heatmap.columns, 2))\n",
+ "feature_pairs_sample = random.sample(feature_pairs,5) # just sample 5 pairs\n",
+ "\n",
+ "# define colors of points\n",
+ "point_colors = ['blue' for i in range(10)] + ['orange' for i in range(10)] + ['green' for i in range(2)]\n",
+ "\n",
+ "# plot each pair of features\n",
+ "for feature_pair in feature_pairs_sample:\n",
+ " df_heatmap.plot.scatter(x=feature_pair[0], y=feature_pair[1], color=point_colors, subplots=True)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "..."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "You made it! Congratulations on getting all the way to the end (it's ok if you just skipped here too). As a reward, [here](https://youtube.com/playlist?list=PL6Zhl9mK2r0Ja7SKX72rD4dfBNnaQbbwR) is a playlist of interesting data science and machine learning YouTube videos that you might like, considering you just read a Jupyter notebook about using k-means to do unsupervised anomaly detection :)"
+ ]
+ }
+ ],
+ "metadata": {
+ "colab": {
+ "collapsed_sections": [],
+ "name": "Netdata Anomaly Detection Deepdive.ipynb",
+ "provenance": []
+ },
+ "kernelspec": {
+ "display_name": "Python 3 (ipykernel)",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.8.1"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+} \ No newline at end of file