diff options
Diffstat (limited to '')
-rw-r--r-- | ml/Config.cc | 8 | ||||
-rw-r--r-- | ml/Dimension.cc | 8 | ||||
-rw-r--r-- | ml/Dimension.h | 14 | ||||
-rw-r--r-- | ml/Host.cc | 45 | ||||
-rw-r--r-- | ml/Host.h | 1 | ||||
-rw-r--r-- | ml/Query.h | 2 | ||||
-rw-r--r-- | ml/README.md | 51 | ||||
-rw-r--r-- | ml/ml.h | 4 |
8 files changed, 117 insertions, 16 deletions
diff --git a/ml/Config.cc b/ml/Config.cc index 99109e05f..37d0bb29e 100644 --- a/ml/Config.cc +++ b/ml/Config.cc @@ -32,7 +32,7 @@ void Config::readMLConfig(void) { unsigned MinTrainSamples = config_get_number(ConfigSectionML, "minimum num samples to train", 1 * 3600); unsigned TrainEvery = config_get_number(ConfigSectionML, "train every", 1 * 3600); - unsigned DBEngineAnomalyRateEvery = config_get_number(ConfigSectionML, "dbengine anomaly rate every", 60); + unsigned DBEngineAnomalyRateEvery = config_get_number(ConfigSectionML, "dbengine anomaly rate every", 30); unsigned DiffN = config_get_number(ConfigSectionML, "num samples to diff", 1); unsigned SmoothN = config_get_number(ConfigSectionML, "num samples to smooth", 3); @@ -42,7 +42,7 @@ void Config::readMLConfig(void) { unsigned MaxKMeansIters = config_get_number(ConfigSectionML, "maximum number of k-means iterations", 1000); double DimensionAnomalyScoreThreshold = config_get_float(ConfigSectionML, "dimension anomaly score threshold", 0.99); - double HostAnomalyRateThreshold = config_get_float(ConfigSectionML, "host anomaly rate threshold", 0.01); + double HostAnomalyRateThreshold = config_get_float(ConfigSectionML, "host anomaly rate threshold", 0.02); double ADMinWindowSize = config_get_float(ConfigSectionML, "minimum window size", 30); double ADMaxWindowSize = config_get_float(ConfigSectionML, "maximum window size", 600); @@ -58,8 +58,8 @@ void Config::readMLConfig(void) { * Clamp */ - MaxTrainSamples = clamp(MaxTrainSamples, 1 * 3600u, 6 * 3600u); - MinTrainSamples = clamp(MinTrainSamples, 1 * 3600u, 6 * 3600u); + MaxTrainSamples = clamp(MaxTrainSamples, 1 * 3600u, 24 * 3600u); + MinTrainSamples = clamp(MinTrainSamples, 1 * 900u, 6 * 3600u); TrainEvery = clamp(TrainEvery, 1 * 3600u, 6 * 3600u); DBEngineAnomalyRateEvery = clamp(DBEngineAnomalyRateEvery, 1 * 30u, 15 * 60u); diff --git a/ml/Dimension.cc b/ml/Dimension.cc index 290d4c743..3146e45a6 100644 --- a/ml/Dimension.cc +++ b/ml/Dimension.cc @@ -161,8 +161,10 @@ void PredictableDimension::addValue(CalculatedNumber Value, bool Exists) { std::pair<MLResult, bool> PredictableDimension::predict() { unsigned N = Cfg.DiffN + Cfg.SmoothN + Cfg.LagN; - if (CNs.size() != N) + if (CNs.size() != N) { + AnomalyBit = false; return { MLResult::MissingData, AnomalyBit }; + } CalculatedNumber *TmpCNs = new CalculatedNumber[N * (Cfg.LagN + 1)](); std::memcpy(TmpCNs, CNs.data(), N * sizeof(CalculatedNumber)); @@ -172,8 +174,10 @@ std::pair<MLResult, bool> PredictableDimension::predict() { AnomalyScore = computeAnomalyScore(SB); delete[] TmpCNs; - if (AnomalyScore == std::numeric_limits<CalculatedNumber>::quiet_NaN()) + if (AnomalyScore == std::numeric_limits<CalculatedNumber>::quiet_NaN()) { + AnomalyBit = false; return { MLResult::NaN, AnomalyBit }; + } AnomalyBit = AnomalyScore >= (100 * Cfg.DimensionAnomalyScoreThreshold); return { MLResult::Success, AnomalyBit }; diff --git a/ml/Dimension.h b/ml/Dimension.h index 44b348e9b..e4f8bd1c7 100644 --- a/ml/Dimension.h +++ b/ml/Dimension.h @@ -30,6 +30,16 @@ public: return SS.str(); } + bool isActive() const { + if (rrdset_flag_check(RD->rrdset, RRDSET_FLAG_OBSOLETE)) + return false; + + if (rrddim_flag_check(RD, RRDDIM_FLAG_OBSOLETE)) + return false; + + return true; + } + void setAnomalyRateRD(RRDDIM *ARRD) { AnomalyRateRD = ARRD; } RRDDIM *getAnomalyRateRD() const { return AnomalyRateRD; } @@ -38,14 +48,14 @@ public: } virtual ~RrdDimension() { - rrddim_free_custom(AnomalyRateRD->rrdset, AnomalyRateRD, 0); + rrddim_free(AnomalyRateRD->rrdset, AnomalyRateRD); } private: RRDDIM *RD; RRDDIM *AnomalyRateRD; - struct rrddim_volatile::rrddim_query_ops *Ops; + struct rrddim_query_ops *Ops; std::string ID; }; diff --git a/ml/Host.cc b/ml/Host.cc index 3166720cc..f8cba9d64 100644 --- a/ml/Host.cc +++ b/ml/Host.cc @@ -358,6 +358,10 @@ void TrainableHost::trainDimension(Dimension *D, const TimePoint &NowTP) { void TrainableHost::train() { Duration<double> MaxSleepFor = Seconds{10 * updateEvery()}; + worker_register("MLTRAIN"); + worker_register_job_name(0, "dimensions"); + + worker_is_busy(0); while (!netdata_exit) { netdata_thread_testcancel(); netdata_thread_disable_cancelability(); @@ -378,11 +382,23 @@ void TrainableHost::train() { if (RealDuration >= AllottedDuration) continue; + worker_is_idle(); SleepFor = std::min(AllottedDuration - RealDuration, MaxSleepFor); std::this_thread::sleep_for(SleepFor); + worker_is_busy(0); } } +#define WORKER_JOB_DETECT_DIMENSION 0 +#define WORKER_JOB_UPDATE_DETECTION_CHART 1 +#define WORKER_JOB_UPDATE_ANOMALY_RATES 2 +#define WORKER_JOB_UPDATE_CHARTS 3 +#define WORKER_JOB_SAVE_ANOMALY_EVENT 4 + +#if WORKER_UTILIZATION_MAX_JOB_TYPES < 5 +#error WORKER_UTILIZATION_MAX_JOB_TYPES has to be at least 5 +#endif + void DetectableHost::detectOnce() { auto P = BRW.insert(WindowAnomalyRate >= Cfg.HostAnomalyRateThreshold); BitRateWindow::Edge Edge = P.first; @@ -397,6 +413,7 @@ void DetectableHost::detectOnce() { size_t NumAnomalousDimensions = 0; size_t NumNormalDimensions = 0; size_t NumTrainedDimensions = 0; + size_t NumActiveDimensions = 0; bool CollectAnomalyRates = (++AnomalyRateTimer == Cfg.DBEngineAnomalyRateEvery); if (CollectAnomalyRates) @@ -408,8 +425,17 @@ void DetectableHost::detectOnce() { DimsOverThreshold.reserve(DimensionsMap.size()); for (auto &DP : DimensionsMap) { + worker_is_busy(WORKER_JOB_DETECT_DIMENSION); + Dimension *D = DP.second; + if (!D->isActive()) { + D->updateAnomalyBitCounter(AnomalyRateRS, AnomalyRateTimer, false); + continue; + } + + NumActiveDimensions++; + auto P = D->detect(WindowLength, ResetBitCounter); bool IsAnomalous = P.first; double AnomalyScore = P.second; @@ -426,14 +452,15 @@ void DetectableHost::detectOnce() { } if (NumAnomalousDimensions) - WindowAnomalyRate = static_cast<double>(NumAnomalousDimensions) / DimensionsMap.size(); + WindowAnomalyRate = static_cast<double>(NumAnomalousDimensions) / NumActiveDimensions; else WindowAnomalyRate = 0.0; - NumNormalDimensions = DimensionsMap.size() - NumAnomalousDimensions; + NumNormalDimensions = NumActiveDimensions - NumAnomalousDimensions; } if (CollectAnomalyRates) { + worker_is_busy(WORKER_JOB_UPDATE_ANOMALY_RATES); AnomalyRateTimer = 0; rrdset_done(AnomalyRateRS); } @@ -441,7 +468,9 @@ void DetectableHost::detectOnce() { this->NumAnomalousDimensions = NumAnomalousDimensions; this->NumNormalDimensions = NumNormalDimensions; this->NumTrainedDimensions = NumTrainedDimensions; + this->NumActiveDimensions = NumActiveDimensions; + worker_is_busy(WORKER_JOB_UPDATE_CHARTS); updateDimensionsChart(getRH(), NumTrainedDimensions, NumNormalDimensions, NumAnomalousDimensions); updateRateChart(getRH(), WindowAnomalyRate * 10000.0); updateWindowLengthChart(getRH(), WindowLength); @@ -454,6 +483,8 @@ void DetectableHost::detectOnce() { if (!NewAnomalyEvent || (DimsOverThreshold.size() == 0)) return; + worker_is_busy(WORKER_JOB_SAVE_ANOMALY_EVENT); + std::sort(DimsOverThreshold.begin(), DimsOverThreshold.end()); std::reverse(DimsOverThreshold.begin(), DimsOverThreshold.end()); @@ -476,6 +507,13 @@ void DetectableHost::detectOnce() { } void DetectableHost::detect() { + worker_register("MLDETECT"); + worker_register_job_name(WORKER_JOB_DETECT_DIMENSION, "dimensions"); + worker_register_job_name(WORKER_JOB_UPDATE_DETECTION_CHART, "detection chart"); + worker_register_job_name(WORKER_JOB_UPDATE_ANOMALY_RATES, "anomaly rates"); + worker_register_job_name(WORKER_JOB_UPDATE_CHARTS, "charts"); + worker_register_job_name(WORKER_JOB_SAVE_ANOMALY_EVENT, "anomaly event"); + std::this_thread::sleep_for(Seconds{10}); heartbeat_t HB; @@ -483,10 +521,13 @@ void DetectableHost::detect() { while (!netdata_exit) { netdata_thread_testcancel(); + worker_is_idle(); heartbeat_next(&HB, updateEvery() * USEC_PER_SEC); netdata_thread_disable_cancelability(); detectOnce(); + + worker_is_busy(WORKER_JOB_UPDATE_DETECTION_CHART); updateDetectionChart(getRH()); netdata_thread_enable_cancelability(); } @@ -127,6 +127,7 @@ private: size_t NumAnomalousDimensions{0}; size_t NumNormalDimensions{0}; size_t NumTrainedDimensions{0}; + size_t NumActiveDimensions{0}; unsigned AnomalyRateTimer{0}; diff --git a/ml/Query.h b/ml/Query.h index cbaf6c297..8b84bb73e 100644 --- a/ml/Query.h +++ b/ml/Query.h @@ -40,7 +40,7 @@ public: private: RRDDIM *RD; - struct rrddim_volatile::rrddim_query_ops *Ops; + struct rrddim_query_ops *Ops; struct rrddim_query_handle Handle; }; diff --git a/ml/README.md b/ml/README.md index 06979ea15..cb8384a66 100644 --- a/ml/README.md +++ b/ml/README.md @@ -10,7 +10,7 @@ keywords: [machine learning, anomaly detection, Netdata ML] As of [`v1.32.0`](https://github.com/netdata/netdata/releases/tag/v1.32.0), Netdata comes with some ML powered [anomaly detection](https://en.wikipedia.org/wiki/Anomaly_detection) capabilities built into it and available to use out of the box, with minimal configuration required. -🚧 **Note**: This functionality is still under active development and considered experimental. Changes might cause the feature to break. We dogfood it internally and among early adopters within the Netdata community to build the feature. If you would like to get involved and help us with some feedback, email us at analytics-ml-team@netdata.cloud or come join us in the [🤖-ml-powered-monitoring](https://discord.gg/4eRSEUpJnc) channel of the Netdata discord. +🚧 **Note**: This functionality is still under active development and considered experimental. Changes might cause the feature to break. We dogfood it internally and among early adopters within the Netdata community to build the feature. If you would like to get involved and help us with some feedback, email us at analytics-ml-team@netdata.cloud, comment on the [beta launch post](https://community.netdata.cloud/t/anomaly-advisor-beta-launch/2717) in the Netdata community, or come join us in the [🤖-ml-powered-monitoring](https://discord.gg/4eRSEUpJnc) channel of the Netdata discord. Once ML is enabled, Netdata will begin training a model for each dimension. By default this model is a [k-means clustering](https://en.wikipedia.org/wiki/K-means_clustering) model trained on the most recent 4 hours of data. Rather than just using the most recent value of each raw metric, the model works on a preprocessed ["feature vector"](#feature-vector) of recent smoothed and differenced values. This should enable the model to detect a wider range of potentially anomalous patterns in recent observations as opposed to just point anomalies like big spikes or drops. ([This infographic](https://user-images.githubusercontent.com/2178292/144414415-275a3477-5b47-43d6-8959-509eb48ebb20.png) shows some different types of anomalies.) @@ -160,7 +160,7 @@ Below is a list of all the available configuration params and their default valu # maximum num samples to train = 14400 # minimum num samples to train = 3600 # train every = 3600 - # dbengine anomaly rate every = 60 + # dbengine anomaly rate every = 30 # num samples to diff = 1 # num samples to smooth = 3 # num samples to lag = 5 @@ -177,10 +177,55 @@ Below is a list of all the available configuration params and their default valu # charts to skip from training = netdata.* ``` +### Configuration Examples + +If you would like to run ML on a parent instead of at the edge, some configuration options are illustrated below. + +This example assumes 3 child nodes [streaming](https://learn.netdata.cloud/docs/agent/streaming) to 1 parent node and illustrates the main ways you might want to configure running ml for the children on the parent, running ML on the children themselves, or even a mix of approaches. + +![parent_child_options](https://user-images.githubusercontent.com/2178292/164439761-8fb7dddd-c4d8-4329-9f44-9a794937a086.png) + +``` +# parent will run ml for itself and child 1,2. +# child 0 will run its own ml at the edge and just stream its ml charts to parent. +# child 1 will run its own ml at the edge, even though parent will also run ml for it, a bit wasteful potentially to run ml in both places but is possible. +# child 2 will not run ml at the edge, it will be run in the parent only. + +# parent-ml-ml-stress-0 +# run ml on all hosts apart from child-ml-ml-stress-0 +[ml] + enabled = yes + minimum num samples to train = 900 + train every = 900 + charts to skip from training = !* + hosts to skip from training = child-ml-ml-stress-0 + +# child-ml-ml-stress-0 +# run ml on child-ml-ml-stress-0 and stream ml charts to parent +[ml] + enabled = yes + minimum num samples to train = 900 + train every = 900 + stream anomaly detection charts = yes + +# child-ml-ml-stress-1 +# run ml on child-ml-ml-stress-1 and stream ml charts to parent +[ml] + enabled = yes + minimum num samples to train = 900 + train every = 900 + stream anomaly detection charts = yes + +# child-ml-ml-stress-2 +# don't run ml on child-ml-ml-stress-2, it will instead run on parent-ml-ml-stress-0 +[ml] + enabled = no +``` + ### Descriptions (min/max) - `enabled`: `yes` to enable, `no` to disable. -- `maximum num samples to train`: (`3600`/`21600`) This is the maximum amount of time you would like to train each model on. For example, the default of `14400` trains on the preceding 4 hours of data, assuming an `update every` of 1 second. +- `maximum num samples to train`: (`3600`/`86400`) This is the maximum amount of time you would like to train each model on. For example, the default of `14400` trains on the preceding 4 hours of data, assuming an `update every` of 1 second. - `minimum num samples to train`: (`900`/`21600`) This is the minimum amount of data required to be able to train a model. For example, the default of `3600` implies that once at least 1 hour of data is available for training, a model is trained, otherwise it is skipped and checked again at the next training run. - `train every`: (`1800`/`21600`) This is how often each model will be retrained. For example, the default of `3600` means that each model is retrained every hour. Note: The training of all models is spread out across the `train every` period for efficiency, so in reality, it means that each model will be trained in a staggered manner within each `train every` period. - `dbengine anomaly rate every`: (`30`/`900`) This is how often netdata will aggregate all the anomaly bits into a single chart (`anomaly_detection.anomaly_rates`). The aggregation into a single chart allows enabling anomaly rate ranking over _all_ metrics with one API call as opposed to a call per chart. @@ -10,9 +10,9 @@ extern "C" { #include "daemon/common.h" #include "web/api/queries/rrdr.h" -// This is an internal DBEngine function redeclared here so that we can free +// This is a DBEngine function redeclared here so that we can free // the anomaly rate dimension, whenever its backing dimension is freed. -extern void rrddim_free_custom(RRDSET *st, RRDDIM *rd, int db_rotated); +extern void rrddim_free(RRDSET *st, RRDDIM *rd); typedef void* ml_host_t; typedef void* ml_dimension_t; |