summary refs log tree commit diff stats
path: root/ml
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2023-06-14 19:20:36 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2023-06-14 19:20:36 +0000
commit dd24e74edfbafc09eaeb2dde0fda7eb3e1e86d0b (patch)
tree 1e52f4dac2622ab377c7649f218fb49003b4cbb9 /ml
parent Releasing debian version 1.39.1-2. (diff)
download netdata-dd24e74edfbafc09eaeb2dde0fda7eb3e1e86d0b.tar.xz
netdata-dd24e74edfbafc09eaeb2dde0fda7eb3e1e86d0b.zip
Merging upstream version 1.40.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'ml')
-rw-r--r-- ml/Config.cc 8
-rw-r--r-- ml/README.md 14
-rw-r--r-- ml/ml.cc 2
3 files changed, 12 insertions, 12 deletions
diff --git a/ml/Config.cc b/ml/Config.cc
index c5129c49d..c00d2e8ee 100644
--- a/ml/Config.cc
+++ b/ml/Config.cc
@@ -25,10 +25,10 @@ void ml_config_load(ml_config_t *cfg) {
* Read values
*/
- unsigned max_train_samples = config_get_number(config_section_ml, "maximum num samples to train", 4 * 3600);
+ unsigned max_train_samples = config_get_number(config_section_ml, "maximum num samples to train", 6 * 3600);
unsigned min_train_samples = config_get_number(config_section_ml, "minimum num samples to train", 1 * 900);
- unsigned train_every = config_get_number(config_section_ml, "train every", 1 * 3600);
- unsigned num_models_to_use = config_get_number(config_section_ml, "number of models per dimension", 2);
+ unsigned train_every = config_get_number(config_section_ml, "train every", 3 * 3600);
+ unsigned num_models_to_use = config_get_number(config_section_ml, "number of models per dimension", 9);
unsigned diff_n = config_get_number(config_section_ml, "num samples to diff", 1);
unsigned smooth_n = config_get_number(config_section_ml, "num samples to smooth", 3);
@@ -86,7 +86,7 @@ void ml_config_load(ml_config_t *cfg) {
error("invalid min/max train samples found (%u >= %u)", min_train_samples, max_train_samples);
min_train_samples = 1 * 3600;
- max_train_samples = 4 * 3600;
+ max_train_samples = 6 * 3600;
}
/*
diff --git a/ml/README.md b/ml/README.md
index 60f38f22e..06baf509b 100644
--- a/ml/README.md
+++ b/ml/README.md
@@ -127,10 +127,10 @@ Below is a list of all the available configuration params and their default valu
```
[ml]
# enabled = yes
- # maximum num samples to train = 14400
- # minimum num samples to train = 3600
- # train every = 3600
- # number of models per dimension = 2
+ # maximum num samples to train = 21600
+ # minimum num samples to train = 900
+ # train every = 10800
+ # number of models per dimension = 9
# dbengine anomaly rate every = 30
# num samples to diff = 1
# num samples to smooth = 3
@@ -186,10 +186,10 @@ This example assumes 3 child nodes [streaming](https://github.com/netdata/netdat
### Descriptions (min/max)
- `enabled`: `yes` to enable, `no` to disable.
-- `maximum num samples to train`: (`3600`/`86400`) This is the maximum amount of time you would like to train each model on. For example, the default of `14400` trains on the preceding 4 hours of data, assuming an `update every` of 1 second.
+- `maximum num samples to train`: (`3600`/`86400`) This is the maximum amount of time you would like to train each model on. For example, the default of `21600` trains on the preceding 6 hours of data, assuming an `update every` of 1 second.
- `minimum num samples to train`: (`900`/`21600`) This is the minimum amount of data required to be able to train a model. For example, the default of `900` implies that once at least 15 minutes of data is available for training, a model is trained; otherwise, training is skipped and checked again at the next training run.
-- `train every`: (`1800`/`21600`) This is how often each model will be retrained. For example, the default of `3600` means that each model is retrained every hour. Note: The training of all models is spread out across the `train every` period for efficiency, so in reality, it means that each model will be trained in a staggered manner within each `train every` period.
-- `number of models per dimension`: (`1`/`168`) This is the number of trained models that will be used for scoring. For example the default `number of models per dimension = 2` means that the two most recently trained models (covering up to the most recent `maximum num samples to train` of training data) for the dimension will be used to determine the corresponding anomaly bit. Alternatively, if you have `train every = 3600` and `number of models per dimension = 24` this means that netdata will store and use the last 24 trained models for each dimension when determining the anomaly bit, this means that for the latest feature vector in this configuration to be considered anomalous it would need to look anomalous across _all_ the models trained for that dimension in the last 24 hours. As such, increasing `number of models per dimension` may reduce some false positives since it will result in more models (covering a wider time frame of training) being used during scoring.
+- `train every`: (`1800`/`21600`) This is how often each model will be retrained. For example, the default of `10800` means that each model is retrained every 3 hours. Note: The training of all models is spread out across the `train every` period for efficiency, so in reality, it means that each model will be trained in a staggered manner within each `train every` period.
+- `number of models per dimension`: (`1`/`168`) This is the number of trained models that will be used for scoring. For example, the default `number of models per dimension = 9` means that only the 9 most recently trained models for the dimension will be used to determine the corresponding anomaly bit. Under the default settings of `maximum num samples to train = 21600`, `train every = 10800` and `number of models per dimension = 9`, netdata will store and use the last 9 trained models for each dimension when determining the anomaly bit. For the latest feature vector in this configuration to be considered anomalous, it would need to look anomalous across _all_ the models trained for that dimension in the last 9*(10800/3600) ~= 27 hours. As such, increasing `number of models per dimension` may reduce some false positives since it will result in more models (covering a wider time frame of training) being used during scoring.
- `dbengine anomaly rate every`: (`30`/`900`) This is how often netdata will aggregate all the anomaly bits into a single chart (`anomaly_detection.anomaly_rates`). The aggregation into a single chart allows enabling anomaly rate ranking over _all_ metrics with one API call as opposed to a call per chart.
- `num samples to diff`: (`0`/`1`) This is a `0` or `1` to determine if you want the model to operate on differences of the raw data or just the raw data. For example, the default of `1` means that we take differences of the raw values. Using differences is more general and works on dimensions that might naturally have some trends or cycles that are normal behavior and to which we don't want to be too sensitive.
- `num samples to smooth`: (`0`/`5`) This is a small integer that controls the amount of smoothing applied as part of the feature processing used by the model. For example, the default of `3` means that the rolling average of the last 3 values is used. Smoothing like this helps the model be a little more robust to spiky types of dimensions that naturally "jump" up or down as part of their normal behavior.
diff --git a/ml/ml.cc b/ml/ml.cc
index 34f2b93bd..a5f0fa062 100644
--- a/ml/ml.cc
+++ b/ml/ml.cc
@@ -1565,7 +1565,7 @@ void ml_init()
for (size_t idx = 0; idx != Cfg.num_training_threads; idx++) {
ml_training_thread_t *training_thread = &Cfg.training_threads[idx];
- size_t max_elements_needed_for_training = Cfg.max_train_samples * (Cfg.lag_n + 1);
+ size_t max_elements_needed_for_training = (size_t) Cfg.max_train_samples * (size_t) (Cfg.lag_n + 1);
training_thread->training_cns = new calculated_number_t[max_elements_needed_for_training]();
training_thread->scratch_training_cns = new calculated_number_t[max_elements_needed_for_training]();