diff options
Diffstat (limited to 'ml/ml.cc')
-rw-r--r-- | ml/ml.cc | 85 |
1 files changed, 67 insertions, 18 deletions
@@ -63,6 +63,8 @@ ml_training_status_to_string(enum ml_training_status ts) return "trained"; case TRAINING_STATUS_UNTRAINED: return "untrained"; + case TRAINING_STATUS_SILENCED: + return "silenced"; default: return "unknown"; } @@ -490,12 +492,16 @@ ml_dimension_add_model(const uuid_t *metric_uuid, const ml_kmeans_t *km) } rc = execute_insert(res); - if (unlikely(rc != SQLITE_DONE)) + if (unlikely(rc != SQLITE_DONE)) { error_report("Failed to store model, rc = %d", rc); + return rc; + } rc = sqlite3_reset(res); - if (unlikely(rc != SQLITE_OK)) + if (unlikely(rc != SQLITE_OK)) { error_report("Failed to reset statement when storing model, rc = %d", rc); + return rc; + } return 0; @@ -504,7 +510,7 @@ bind_fail: rc = sqlite3_reset(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to reset statement to store model, rc = %d", rc); - return 1; + return rc; } static int @@ -523,7 +529,7 @@ ml_dimension_delete_models(const uuid_t *metric_uuid, time_t before) rc = prepare_statement(db, db_models_delete, &res); if (unlikely(rc != SQLITE_OK)) { error_report("Failed to prepare statement to delete models, rc = %d", rc); - return 1; + return rc; } } @@ -536,12 +542,16 @@ ml_dimension_delete_models(const uuid_t *metric_uuid, time_t before) goto bind_fail; rc = execute_insert(res); - if (unlikely(rc != SQLITE_DONE)) + if (unlikely(rc != SQLITE_DONE)) { error_report("Failed to delete models, rc = %d", rc); + return rc; + } rc = sqlite3_reset(res); - if (unlikely(rc != SQLITE_OK)) + if (unlikely(rc != SQLITE_OK)) { error_report("Failed to reset statement when deleting models, rc = %d", rc); + return rc; + } return 0; @@ -550,7 +560,7 @@ bind_fail: rc = sqlite3_reset(res); if (unlikely(rc != SQLITE_OK)) error_report("Failed to reset statement to delete models, rc = %d", rc); - return 1; + return rc; } int ml_dimension_load_models(RRDDIM *rd) { @@ -671,6 +681,8 @@ ml_dimension_train_model(ml_training_thread_t *training_thread, ml_dimension_t * break; } + dim->suppression_anomaly_counter = 0; + dim->suppression_window_counter = 0; dim->tr = training_response; dim->last_training_time = training_response.last_entry_on_response; @@ -727,6 +739,10 @@ ml_dimension_train_model(ml_training_thread_t *training_thread, ml_dimension_t * dim->mt = METRIC_TYPE_CONSTANT; dim->ts = TRAINING_STATUS_TRAINED; + + dim->suppression_anomaly_counter = 0; + dim->suppression_window_counter = 0; + dim->tr = training_response; dim->last_training_time = rrddim_last_entry_s(dim->rd); @@ -763,6 +779,7 @@ ml_dimension_schedule_for_training(ml_dimension_t *dim, time_t curr_time) schedule_for_training = true; dim->ts = TRAINING_STATUS_PENDING_WITHOUT_MODEL; break; + case TRAINING_STATUS_SILENCED: case TRAINING_STATUS_TRAINED: if ((dim->last_training_time + (Cfg.train_every * dim->rd->update_every)) < curr_time) { schedule_for_training = true; @@ -848,6 +865,7 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t switch (dim->ts) { case TRAINING_STATUS_UNTRAINED: case TRAINING_STATUS_PENDING_WITHOUT_MODEL: { + case TRAINING_STATUS_SILENCED: netdata_mutex_unlock(&dim->mutex); return false; } @@ -855,6 +873,8 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t break; } + dim->suppression_window_counter++; + /* * Use the KMeans models to check if the value is anomalous */ @@ -878,6 +898,13 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t sum += 1; } + dim->suppression_anomaly_counter += sum ? 1 : 0; + + if ((dim->suppression_anomaly_counter >= Cfg.suppression_threshold) && + (dim->suppression_window_counter >= Cfg.suppression_window)) { + dim->ts = TRAINING_STATUS_SILENCED; + } + netdata_mutex_unlock(&dim->mutex); global_statistics_ml_models_consulted(models_consulted); @@ -934,6 +961,13 @@ ml_chart_update_dimension(ml_chart_t *chart, ml_dimension_t *dim, bool is_anomal chart->mls.num_anomalous_dimensions += is_anomalous; chart->mls.num_normal_dimensions += !is_anomalous; return; + case TRAINING_STATUS_SILENCED: + chart->mls.num_training_status_silenced++; + chart->mls.num_training_status_trained++; + + chart->mls.num_anomalous_dimensions += is_anomalous; + chart->mls.num_normal_dimensions += !is_anomalous; + return; } return; @@ -987,6 +1021,7 @@ ml_host_detect_once(ml_host_t *host) host->mls.num_training_status_pending_without_model += chart_mls.num_training_status_pending_without_model; host->mls.num_training_status_trained += chart_mls.num_training_status_trained; host->mls.num_training_status_pending_with_model += chart_mls.num_training_status_pending_with_model; + host->mls.num_training_status_silenced += chart_mls.num_training_status_silenced; host->mls.num_anomalous_dimensions += chart_mls.num_anomalous_dimensions; host->mls.num_normal_dimensions += chart_mls.num_normal_dimensions; @@ -1370,23 +1405,37 @@ bool ml_dimension_is_anomalous(RRDDIM *rd, time_t curr_time, double value, bool return is_anomalous; } -static int ml_flush_pending_models(ml_training_thread_t *training_thread) { - (void) db_execute(db, "BEGIN TRANSACTION;"); +static void ml_flush_pending_models(ml_training_thread_t *training_thread) { + int rc = db_execute(db, "BEGIN TRANSACTION;"); + int op_no = 1; - for (const auto &pending_model: training_thread->pending_model_info) { - int rc = ml_dimension_add_model(&pending_model.metric_uuid, &pending_model.kmeans); - if (rc) - return rc; + if (!rc) { + op_no++; - rc = ml_dimension_delete_models(&pending_model.metric_uuid, pending_model.kmeans.before - (Cfg.num_models_to_use * Cfg.train_every)); - if (rc) - return rc; + for (const auto &pending_model: training_thread->pending_model_info) { + if (!rc) + rc = ml_dimension_add_model(&pending_model.metric_uuid, &pending_model.kmeans); + + if (!rc) + rc = ml_dimension_delete_models(&pending_model.metric_uuid, pending_model.kmeans.before - (Cfg.num_models_to_use * Cfg.train_every)); + } + } + + if (!rc) { + op_no++; + rc = db_execute(db, "COMMIT TRANSACTION;"); } - (void) db_execute(db, "COMMIT TRANSACTION;"); + // try to rollback transaction if we got any failures + if (rc) { + error("Trying to rollback ML transaction because it failed with rc=%d, op_no=%d", rc, op_no); + op_no++; + rc = db_execute(db, "ROLLBACK;"); + if (rc) + error("ML transaction rollback failed with rc=%d", rc); + } training_thread->pending_model_info.clear(); - return 0; } static void *ml_train_main(void *arg) { |