diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-10-17 09:30:20 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-10-17 09:30:20 +0000 |
commit | 386ccdd61e8256c8b21ee27ee2fc12438fc5ca98 (patch) | |
tree | c9fbcacdb01f029f46133a5ba7ecd610c2bcb041 /ml | |
parent | Adding upstream version 1.42.4. (diff) | |
download | netdata-386ccdd61e8256c8b21ee27ee2fc12438fc5ca98.tar.xz netdata-386ccdd61e8256c8b21ee27ee2fc12438fc5ca98.zip |
Adding upstream version 1.43.0.upstream/1.43.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'ml')
-rw-r--r-- | ml/Config.cc | 7 | ||||
-rw-r--r-- | ml/README.md | 14 | ||||
-rw-r--r-- | ml/ad_charts.cc | 54 | ||||
-rw-r--r-- | ml/ml-private.h | 14 | ||||
-rw-r--r-- | ml/ml.cc | 128 | ||||
-rw-r--r-- | ml/ml.h | 1 |
6 files changed, 201 insertions, 17 deletions
diff --git a/ml/Config.cc b/ml/Config.cc index f733bcf7d..8f2ef894c 100644 --- a/ml/Config.cc +++ b/ml/Config.cc @@ -28,7 +28,9 @@ void ml_config_load(ml_config_t *cfg) { unsigned max_train_samples = config_get_number(config_section_ml, "maximum num samples to train", 6 * 3600); unsigned min_train_samples = config_get_number(config_section_ml, "minimum num samples to train", 1 * 900); unsigned train_every = config_get_number(config_section_ml, "train every", 3 * 3600); - unsigned num_models_to_use = config_get_number(config_section_ml, "number of models per dimension", 9); + + unsigned num_models_to_use = config_get_number(config_section_ml, "number of models per dimension", 18); + unsigned delete_models_older_than = config_get_number(config_section_ml, "delete models older than", 60 * 60 * 24 * 7); unsigned diff_n = config_get_number(config_section_ml, "num samples to diff", 1); unsigned smooth_n = config_get_number(config_section_ml, "num samples to smooth", 3); @@ -58,7 +60,9 @@ void ml_config_load(ml_config_t *cfg) { max_train_samples = clamp<unsigned>(max_train_samples, 1 * 3600, 24 * 3600); min_train_samples = clamp<unsigned>(min_train_samples, 1 * 900, 6 * 3600); train_every = clamp<unsigned>(train_every, 1 * 3600, 6 * 3600); + num_models_to_use = clamp<unsigned>(num_models_to_use, 1, 7 * 24); + delete_models_older_than = clamp<unsigned>(delete_models_older_than, 60 * 60 * 24 * 1, 60 * 60 * 24 * 7); diff_n = clamp(diff_n, 0u, 1u); smooth_n = clamp(smooth_n, 0u, 5u); @@ -100,6 +104,7 @@ void ml_config_load(ml_config_t *cfg) { cfg->train_every = train_every; cfg->num_models_to_use = num_models_to_use; + cfg->delete_models_older_than = delete_models_older_than; cfg->diff_n = diff_n; cfg->smooth_n = smooth_n; diff --git a/ml/README.md b/ml/README.md index 06baf509b..8e0225f32 100644 --- a/ml/README.md +++ b/ml/README.md @@ -130,7 +130,7 @@ Below is a list of all the available configuration params and their default valu # maximum num samples to train = 21600 # minimum num samples to train = 900 # train every = 10800 - # number of models per dimension = 9 + # number of models per dimension = 18 # dbengine anomaly rate every = 30 # num samples to diff = 1 # num samples to smooth = 3 @@ -144,7 +144,8 @@ Below is a list of all the available configuration params and their default valu # hosts to skip from training = !* # charts to skip from training = netdata.* # dimension anomaly rate suppression window = 900 - # dimension anomaly rate suppression threshold = 450 + # dimension anomaly rate suppression threshold = 450 + # delete models older than = 604800 ``` ### Configuration Examples @@ -189,7 +190,7 @@ This example assumes 3 child nodes [streaming](https://github.com/netdata/netdat - `maximum num samples to train`: (`3600`/`86400`) This is the maximum amount of time you would like to train each model on. For example, the default of `21600` trains on the preceding 6 hours of data, assuming an `update every` of 1 second. - `minimum num samples to train`: (`900`/`21600`) This is the minimum amount of data required to be able to train a model. For example, the default of `900` implies that once at least 15 minutes of data is available for training, a model is trained, otherwise it is skipped and checked again at the next training run. - `train every`: (`1800`/`21600`) This is how often each model will be retrained. For example, the default of `10800` means that each model is retrained every 3 hours. Note: The training of all models is spread out across the `train every` period for efficiency, so in reality, it means that each model will be trained in a staggered manner within each `train every` period. -- `number of models per dimension`: (`1`/`168`) This is the number of trained models that will be used for scoring. For example the default `number of models per dimension = 9` means that just the most recently trained 9 models for the dimension will be used to determine the corresponding anomaly bit. This means that under default settings of `maximum num samples to train = 21600`, `train every = 10800` and `number of models per dimension = 9`, netdata will store and use the last 9 trained models for each dimension when determining the anomaly bit. This means that for the latest feature vector in this configuration to be considered anomalous it would need to look anomalous across _all_ the models trained for that dimension in the last 9*(10800/3600) ~= 27 hours. As such, increasing `number of models per dimension` may reduce some false positives since it will result in more models (covering a wider time frame of training) being used during scoring. +- `number of models per dimension`: (`1`/`168`) This is the number of trained models that will be used for scoring. For example the default `number of models per dimension = 18` means that the most recently trained 18 models for the dimension will be used to determine the corresponding anomaly bit. This means that under default settings of `maximum num samples to train = 21600`, `train every = 10800` and `number of models per dimension = 18`, netdata will store and use the last 18 trained models for each dimension when determining the anomaly bit. This means that for the latest feature vector in this configuration to be considered anomalous it would need to look anomalous across _all_ the models trained for that dimension in the last 18*(10800/3600) ~= 54 hours. As such, increasing `number of models per dimension` may reduce some false positives since it will result in more models (covering a wider time frame of training) being used during scoring. - `dbengine anomaly rate every`: (`30`/`900`) This is how often netdata will aggregate all the anomaly bits into a single chart (`anomaly_detection.anomaly_rates`). The aggregation into a single chart allows enabling anomaly rate ranking over _all_ metrics with one API call as opposed to a call per chart. - `num samples to diff`: (`0`/`1`) This is a `0` or `1` to determine if you want the model to operate on differences of the raw data or just the raw data. For example, the default of `1` means that we take differences of the raw values. Using differences is more general and works on dimensions that might naturally tend to have some trends or cycles in them that is normal behavior to which we don't want to be too sensitive. - `num samples to smooth`: (`0`/`5`) This is a small integer that controls the amount of smoothing applied as part of the feature processing used by the model. For example, the default of `3` means that the rolling average of the last 3 values is used. Smoothing like this helps the model be a little more robust to spiky types of dimensions that naturally "jump" up or down as part of their normal behavior. @@ -201,7 +202,8 @@ This example assumes 3 child nodes [streaming](https://github.com/netdata/netdat - `anomaly detection grouping method`: The grouping method used when calculating node level anomaly rate. - `anomaly detection grouping duration`: (`60`/`900`) The duration across which to calculate the node level anomaly rate, the default of `900` means that the node level anomaly rate is calculated across a rolling 5 minute window. - `hosts to skip from training`: This parameter allows you to turn off anomaly detection for any child hosts on a parent host by defining those you would like to skip from training here. For example, a value like `dev-*` skips all hosts on a parent that begin with the "dev-" prefix. The default value of `!*` means "don't skip any". -- `charts to skip from training`: This parameter allows you to exclude certain charts from anomaly detection. By default, only netdata related charts are excluded. This is to avoid the scenario where accessing the netdata dashboard could itself trigger some anomalies if you don't access them regularly. If you want to include charts that are excluded by default, add them in small groups and then measure any impact on performance before adding additional ones. Example: If you want to include system, apps, and user charts:`!system.* !apps.* !user.* *`. +- `charts to skip from training`: This parameter allows you to exclude certain charts from anomaly detection. By default, only netdata related charts are excluded. This is to avoid the scenario where accessing the netdata dashboard could itself trigger some anomalies if you don't access them regularly. If you want to include charts that are excluded by default, add them in small groups and then measure any impact on performance before adding additional ones. Example: If you want to include system, apps, and user charts:`!system.* !apps.* !user.* *`. +- `delete models older than`: (`86400`/`604800`) Delete old models from the database that are unused, by default models will be deleted after 7 days. ## Charts @@ -212,7 +214,7 @@ Once enabled, the "Anomaly Detection" menu and charts will be available on the d In terms of anomaly detection, the most interesting charts would be the `anomaly_detection.dimensions` and `anomaly_detection.anomaly_rate` ones, which hold the `anomalous` and `anomaly_rate` dimensions that show the overall number of dimensions considered anomalous at any time and the corresponding anomaly rate. - `anomaly_detection.dimensions`: Total count of dimensions considered anomalous or normal. -- `anomaly_detection.dimensions`: Percentage of anomalous dimensions. +- `anomaly_detection.anomaly_rate`: Percentage of anomalous dimensions. - `anomaly_detection.anomaly_detection`: Flags (0 or 1) to show when an anomaly event has been triggered by the detector. Below is an example of how these charts may look in the presence of an anomaly event. @@ -267,7 +269,7 @@ The anomaly rate across all dimensions of a node. - We would love to hear any feedback relating to this functionality, please email us at analytics-ml-team@netdata.cloud or come join us in the [🤖-ml-powered-monitoring](https://discord.gg/4eRSEUpJnc) channel of the Netdata discord. - We are working on additional UI/UX based features that build on these core components to make them as useful as possible out of the box. -- Although not yet a core focus of this work, users could leverage the `anomaly_detection` chart dimensions and/or `anomaly-bit` options in defining alarms based on ML driven anomaly detection models. +- Although not yet a core focus of this work, users could leverage the `anomaly_detection` chart dimensions and/or `anomaly-bit` options in defining alerts based on ML driven anomaly detection models. - [This presentation](https://docs.google.com/presentation/d/18zkCvU3nKP-Bw_nQZuXTEa4PIVM6wppH3VUnAauq-RU/edit?usp=sharing) walks through some of the main concepts covered above in a more informal way. - After restart Netdata will wait until `minimum num samples to train` observations of data are available before starting training and prediction. - Netdata uses [dlib](https://github.com/davisking/dlib) under the hood for its core ML features. diff --git a/ml/ad_charts.cc b/ml/ad_charts.cc index ca4dca139..4b70cb43f 100644 --- a/ml/ad_charts.cc +++ b/ml/ad_charts.cc @@ -222,7 +222,7 @@ void ml_update_dimensions_chart(ml_host_t *host, const ml_machine_learning_stats void ml_update_host_and_detection_rate_charts(ml_host_t *host, collected_number AnomalyRate) { /* - * Anomaly rate + * Host anomaly rate */ { if (!host->anomaly_rate_rs) { @@ -259,6 +259,56 @@ void ml_update_host_and_detection_rate_charts(ml_host_t *host, collected_number } /* + * Type anomaly rate + */ + { + if (!host->type_anomaly_rate_rs) { + char id_buf[1024]; + char name_buf[1024]; + + snprintfz(id_buf, 1024, "type_anomaly_rate_on_%s", localhost->machine_guid); + snprintfz(name_buf, 1024, "type_anomaly_rate_on_%s", rrdhost_hostname(localhost)); + + host->type_anomaly_rate_rs = rrdset_create( + host->rh, + "anomaly_detection", // type + id_buf, // id + name_buf, // name + "anomaly_rate", // family + "anomaly_detection.type_anomaly_rate", // ctx + "Percentage of anomalous dimensions by type", // title + "percentage", // units + NETDATA_ML_PLUGIN, // plugin + NETDATA_ML_MODULE_DETECTION, // module + ML_CHART_PRIO_TYPE_ANOMALY_RATE, // priority + localhost->rrd_update_every, // update_every + RRDSET_TYPE_STACKED // chart_type + ); + + rrdset_flag_set(host->type_anomaly_rate_rs, RRDSET_FLAG_ANOMALY_DETECTION); + } + + for (auto &entry : host->type_anomaly_rate) { + ml_type_anomaly_rate_t &type_anomaly_rate = entry.second; + + if (!type_anomaly_rate.rd) + type_anomaly_rate.rd = rrddim_add(host->type_anomaly_rate_rs, string2str(entry.first), NULL, 1, 100, RRD_ALGORITHM_ABSOLUTE); + + double ar = 0.0; + size_t n = type_anomaly_rate.anomalous_dimensions + type_anomaly_rate.normal_dimensions; + if (n) + ar = static_cast<double>(type_anomaly_rate.anomalous_dimensions) / n; + + rrddim_set_by_pointer(host->type_anomaly_rate_rs, type_anomaly_rate.rd, ar * 10000.0); + + type_anomaly_rate.anomalous_dimensions = 0; + type_anomaly_rate.normal_dimensions = 0; + } + + rrdset_done(host->type_anomaly_rate_rs); + } + + /* * Detector Events */ { @@ -277,7 +327,7 @@ void ml_update_host_and_detection_rate_charts(ml_host_t *host, collected_number "anomaly_detection", // family "anomaly_detection.detector_events", // ctx "Anomaly detection events", // title - "percentage", // units + "status", // units NETDATA_ML_PLUGIN, // plugin NETDATA_ML_MODULE_DETECTION, // module ML_CHART_PRIO_DETECTOR_EVENTS, // priority diff --git a/ml/ml-private.h b/ml/ml-private.h index f0e2e7eaf..f373456fa 100644 --- a/ml/ml-private.h +++ b/ml/ml-private.h @@ -8,6 +8,7 @@ #include <vector> #include <queue> +#include <unordered_map> typedef double calculated_number_t; typedef dlib::matrix<calculated_number_t, 6, 1> DSample; @@ -211,6 +212,12 @@ typedef struct { void ml_chart_update_dimension(ml_chart_t *chart, ml_dimension_t *dim, bool is_anomalous); typedef struct { + RRDDIM *rd; + size_t normal_dimensions; + size_t anomalous_dimensions; +} ml_type_anomaly_rate_t; + +typedef struct { RRDHOST *rh; std::atomic<bool> ml_running; @@ -255,6 +262,9 @@ typedef struct { RRDSET *detector_events_rs; RRDDIM *detector_events_above_threshold_rd; RRDDIM *detector_events_new_anomaly_event_rd; + + RRDSET *type_anomaly_rate_rs; + std::unordered_map<STRING *, ml_type_anomaly_rate_t> type_anomaly_rate; } ml_host_t; typedef struct { @@ -291,6 +301,9 @@ typedef struct { RRDDIM *training_results_not_enough_collected_values_rd; RRDDIM *training_results_null_acquired_dimension_rd; RRDDIM *training_results_chart_under_replication_rd; + + size_t num_db_transactions; + size_t num_models_to_prune; } ml_training_thread_t; typedef struct { @@ -301,6 +314,7 @@ typedef struct { unsigned train_every; unsigned num_models_to_use; + unsigned delete_models_older_than; unsigned db_engine_anomaly_rate_every; @@ -9,6 +9,8 @@ #include "ad_charts.h" #include "database/sqlite/sqlite3.h" +#define ML_METADATA_VERSION 2 + #define WORKER_TRAIN_QUEUE_POP 0 #define WORKER_TRAIN_ACQUIRE_DIMENSION 1 #define WORKER_TRAIN_QUERY 2 @@ -436,6 +438,10 @@ const char *db_models_delete = "DELETE FROM models " "WHERE dim_id = @dim_id AND before < @before;"; +const char *db_models_prune = + "DELETE FROM models " + "WHERE after < @after LIMIT @n;"; + static int ml_dimension_add_model(const uuid_t *metric_uuid, const ml_kmeans_t *km) { @@ -563,6 +569,58 @@ bind_fail: return rc; } +static int +ml_prune_old_models(size_t num_models_to_prune) +{ + static __thread sqlite3_stmt *res = NULL; + int rc = 0; + int param = 0; + + if (unlikely(!db)) { + error_report("Database has not been initialized"); + return 1; + } + + if (unlikely(!res)) { + rc = prepare_statement(db, db_models_prune, &res); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to prepare statement to prune models, rc = %d", rc); + return rc; + } + } + + int after = (int) (now_realtime_sec() - Cfg.delete_models_older_than); + + rc = sqlite3_bind_int(res, ++param, after); + if (unlikely(rc != SQLITE_OK)) + goto bind_fail; + + rc = sqlite3_bind_int(res, ++param, num_models_to_prune); + if (unlikely(rc != SQLITE_OK)) + goto bind_fail; + + rc = execute_insert(res); + if (unlikely(rc != SQLITE_DONE)) { + error_report("Failed to prune old models, rc = %d", rc); + return rc; + } + + rc = sqlite3_reset(res); + if (unlikely(rc != SQLITE_OK)) { + error_report("Failed to reset statement when pruning old models, rc = %d", rc); + return rc; + } + + return 0; + +bind_fail: + error_report("Failed to bind parameter %d to prune old models, rc = %d", param, rc); + rc = sqlite3_reset(res); + if (unlikely(rc != SQLITE_OK)) + error_report("Failed to reset statement to prune old models, rc = %d", rc); + return rc; +} + int ml_dimension_load_models(RRDDIM *rd) { ml_dimension_t *dim = (ml_dimension_t *) rd->ml_dimension; if (!dim) @@ -1026,6 +1084,21 @@ ml_host_detect_once(ml_host_t *host) host->mls.num_anomalous_dimensions += chart_mls.num_anomalous_dimensions; host->mls.num_normal_dimensions += chart_mls.num_normal_dimensions; + + STRING *key = rs->parts.type; + auto &um = host->type_anomaly_rate; + auto it = um.find(key); + if (it == um.end()) { + um[key] = ml_type_anomaly_rate_t { + .rd = NULL, + .normal_dimensions = 0, + .anomalous_dimensions = 0 + }; + it = um.find(key); + } + + it->second.anomalous_dimensions += chart_mls.num_anomalous_dimensions; + it->second.normal_dimensions += chart_mls.num_normal_dimensions; } rrdset_foreach_done(rsp); @@ -1039,6 +1112,15 @@ ml_host_detect_once(ml_host_t *host) netdata_mutex_unlock(&host->mutex); } else { host->host_anomaly_rate = 0.0; + + auto &um = host->type_anomaly_rate; + for (auto &entry: um) { + entry.second = ml_type_anomaly_rate_t { + .rd = NULL, + .normal_dimensions = 0, + .anomalous_dimensions = 0 + }; + } } worker_is_busy(WORKER_JOB_DETECTION_DIM_CHART); @@ -1072,7 +1154,7 @@ ml_acquired_dimension_get(char *machine_guid, STRING *chart_id, STRING *dimensio acq_rs = rrdset_find_and_acquire(rh, string2str(chart_id)); if (acq_rs) { RRDSET *rs = rrdset_acquired_to_rrdset(acq_rs); - if (rs && !rrdset_flag_check(rs, RRDSET_FLAG_ARCHIVED | RRDSET_FLAG_OBSOLETE)) { + if (rs && !rrdset_flag_check(rs, RRDSET_FLAG_OBSOLETE)) { acq_rd = rrddim_find_and_acquire(rs, string2str(dimension_id)); if (acq_rd) { RRDDIM *rd = rrddim_acquired_to_rrddim(acq_rd); @@ -1217,6 +1299,7 @@ void ml_host_new(RRDHOST *rh) host->rh = rh; host->mls = ml_machine_learning_stats_t(); host->host_anomaly_rate = 0.0; + host->anomaly_rate_rs = NULL; static std::atomic<size_t> times_called(0); host->training_queue = Cfg.training_threads[times_called++ % Cfg.num_training_threads].training_queue; @@ -1497,9 +1580,12 @@ bool ml_dimension_is_anomalous(RRDDIM *rd, time_t curr_time, double value, bool } static void ml_flush_pending_models(ml_training_thread_t *training_thread) { - int rc = db_execute(db, "BEGIN TRANSACTION;"); int op_no = 1; + // begin transaction + int rc = db_execute(db, "BEGIN TRANSACTION;"); + + // add/delete models if (!rc) { op_no++; @@ -1512,12 +1598,22 @@ static void ml_flush_pending_models(ml_training_thread_t *training_thread) { } } + // prune old models + if (!rc) { + if ((training_thread->num_db_transactions % 64) == 0) { + rc = ml_prune_old_models(training_thread->num_models_to_prune); + if (!rc) + training_thread->num_models_to_prune = 0; + } + } + + // commit transaction if (!rc) { op_no++; rc = db_execute(db, "COMMIT TRANSACTION;"); } - // try to rollback transaction if we got any failures + // rollback transaction on failure if (rc) { netdata_log_error("Trying to rollback ML transaction because it failed with rc=%d, op_no=%d", rc, op_no); op_no++; @@ -1526,6 +1622,13 @@ static void ml_flush_pending_models(ml_training_thread_t *training_thread) { netdata_log_error("ML transaction rollback failed with rc=%d", rc); } + if (!rc) { + training_thread->num_db_transactions++; + training_thread->num_models_to_prune += training_thread->pending_model_info.size(); + } + + vacuum_database(db, "ML", 0, 0); + training_thread->pending_model_info.clear(); } @@ -1676,15 +1779,24 @@ void ml_init() db = NULL; } + // create table if (db) { - char *err = NULL; - int rc = sqlite3_exec(db, db_models_create_table, NULL, NULL, &err); - if (rc != SQLITE_OK) { - error_report("Failed to create models table (%s, %s)", sqlite3_errstr(rc), err ? err : ""); + int target_version = perform_ml_database_migration(db, ML_METADATA_VERSION); + if (configure_sqlite_database(db, target_version)) { + error_report("Failed to setup ML database"); sqlite3_close(db); - sqlite3_free(err); db = NULL; } + else { + char *err = NULL; + int rc = sqlite3_exec(db, db_models_create_table, NULL, NULL, &err); + if (rc != SQLITE_OK) { + error_report("Failed to create models table (%s, %s)", sqlite3_errstr(rc), err ? err : ""); + sqlite3_close(db); + sqlite3_free(err); + db = NULL; + } + } } } @@ -9,6 +9,7 @@ extern "C" { #include "daemon/common.h" #include "web/api/queries/rrdr.h" +#include "database/sqlite/sqlite_db_migration.h" bool ml_capable(); bool ml_enabled(RRDHOST *rh); |