summaryrefslogtreecommitdiffstats
path: root/ml/ml.cc
diff options
context:
space:
mode:
Diffstat (limited to 'ml/ml.cc')
-rw-r--r--ml/ml.cc85
1 files changed, 67 insertions, 18 deletions
diff --git a/ml/ml.cc b/ml/ml.cc
index 1f49f4bf1..34f2b93bd 100644
--- a/ml/ml.cc
+++ b/ml/ml.cc
@@ -63,6 +63,8 @@ ml_training_status_to_string(enum ml_training_status ts)
return "trained";
case TRAINING_STATUS_UNTRAINED:
return "untrained";
+ case TRAINING_STATUS_SILENCED:
+ return "silenced";
default:
return "unknown";
}
@@ -490,12 +492,16 @@ ml_dimension_add_model(const uuid_t *metric_uuid, const ml_kmeans_t *km)
}
rc = execute_insert(res);
- if (unlikely(rc != SQLITE_DONE))
+ if (unlikely(rc != SQLITE_DONE)) {
error_report("Failed to store model, rc = %d", rc);
+ return rc;
+ }
rc = sqlite3_reset(res);
- if (unlikely(rc != SQLITE_OK))
+ if (unlikely(rc != SQLITE_OK)) {
error_report("Failed to reset statement when storing model, rc = %d", rc);
+ return rc;
+ }
return 0;
@@ -504,7 +510,7 @@ bind_fail:
rc = sqlite3_reset(res);
if (unlikely(rc != SQLITE_OK))
error_report("Failed to reset statement to store model, rc = %d", rc);
- return 1;
+ return rc;
}
static int
@@ -523,7 +529,7 @@ ml_dimension_delete_models(const uuid_t *metric_uuid, time_t before)
rc = prepare_statement(db, db_models_delete, &res);
if (unlikely(rc != SQLITE_OK)) {
error_report("Failed to prepare statement to delete models, rc = %d", rc);
- return 1;
+ return rc;
}
}
@@ -536,12 +542,16 @@ ml_dimension_delete_models(const uuid_t *metric_uuid, time_t before)
goto bind_fail;
rc = execute_insert(res);
- if (unlikely(rc != SQLITE_DONE))
+ if (unlikely(rc != SQLITE_DONE)) {
error_report("Failed to delete models, rc = %d", rc);
+ return rc;
+ }
rc = sqlite3_reset(res);
- if (unlikely(rc != SQLITE_OK))
+ if (unlikely(rc != SQLITE_OK)) {
error_report("Failed to reset statement when deleting models, rc = %d", rc);
+ return rc;
+ }
return 0;
@@ -550,7 +560,7 @@ bind_fail:
rc = sqlite3_reset(res);
if (unlikely(rc != SQLITE_OK))
error_report("Failed to reset statement to delete models, rc = %d", rc);
- return 1;
+ return rc;
}
int ml_dimension_load_models(RRDDIM *rd) {
@@ -671,6 +681,8 @@ ml_dimension_train_model(ml_training_thread_t *training_thread, ml_dimension_t *
break;
}
+ dim->suppression_anomaly_counter = 0;
+ dim->suppression_window_counter = 0;
dim->tr = training_response;
dim->last_training_time = training_response.last_entry_on_response;
@@ -727,6 +739,10 @@ ml_dimension_train_model(ml_training_thread_t *training_thread, ml_dimension_t *
dim->mt = METRIC_TYPE_CONSTANT;
dim->ts = TRAINING_STATUS_TRAINED;
+
+ dim->suppression_anomaly_counter = 0;
+ dim->suppression_window_counter = 0;
+
dim->tr = training_response;
dim->last_training_time = rrddim_last_entry_s(dim->rd);
@@ -763,6 +779,7 @@ ml_dimension_schedule_for_training(ml_dimension_t *dim, time_t curr_time)
schedule_for_training = true;
dim->ts = TRAINING_STATUS_PENDING_WITHOUT_MODEL;
break;
+ case TRAINING_STATUS_SILENCED:
case TRAINING_STATUS_TRAINED:
if ((dim->last_training_time + (Cfg.train_every * dim->rd->update_every)) < curr_time) {
schedule_for_training = true;
@@ -848,6 +865,7 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t
switch (dim->ts) {
case TRAINING_STATUS_UNTRAINED:
case TRAINING_STATUS_PENDING_WITHOUT_MODEL: {
+ case TRAINING_STATUS_SILENCED:
netdata_mutex_unlock(&dim->mutex);
return false;
}
@@ -855,6 +873,8 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t
break;
}
+ dim->suppression_window_counter++;
+
/*
* Use the KMeans models to check if the value is anomalous
*/
@@ -878,6 +898,13 @@ ml_dimension_predict(ml_dimension_t *dim, time_t curr_time, calculated_number_t
sum += 1;
}
+ dim->suppression_anomaly_counter += sum ? 1 : 0;
+
+ if ((dim->suppression_anomaly_counter >= Cfg.suppression_threshold) &&
+ (dim->suppression_window_counter >= Cfg.suppression_window)) {
+ dim->ts = TRAINING_STATUS_SILENCED;
+ }
+
netdata_mutex_unlock(&dim->mutex);
global_statistics_ml_models_consulted(models_consulted);
@@ -934,6 +961,13 @@ ml_chart_update_dimension(ml_chart_t *chart, ml_dimension_t *dim, bool is_anomal
chart->mls.num_anomalous_dimensions += is_anomalous;
chart->mls.num_normal_dimensions += !is_anomalous;
return;
+ case TRAINING_STATUS_SILENCED:
+ chart->mls.num_training_status_silenced++;
+ chart->mls.num_training_status_trained++;
+
+ chart->mls.num_anomalous_dimensions += is_anomalous;
+ chart->mls.num_normal_dimensions += !is_anomalous;
+ return;
}
return;
@@ -987,6 +1021,7 @@ ml_host_detect_once(ml_host_t *host)
host->mls.num_training_status_pending_without_model += chart_mls.num_training_status_pending_without_model;
host->mls.num_training_status_trained += chart_mls.num_training_status_trained;
host->mls.num_training_status_pending_with_model += chart_mls.num_training_status_pending_with_model;
+ host->mls.num_training_status_silenced += chart_mls.num_training_status_silenced;
host->mls.num_anomalous_dimensions += chart_mls.num_anomalous_dimensions;
host->mls.num_normal_dimensions += chart_mls.num_normal_dimensions;
@@ -1370,23 +1405,37 @@ bool ml_dimension_is_anomalous(RRDDIM *rd, time_t curr_time, double value, bool
return is_anomalous;
}
-static int ml_flush_pending_models(ml_training_thread_t *training_thread) {
- (void) db_execute(db, "BEGIN TRANSACTION;");
+static void ml_flush_pending_models(ml_training_thread_t *training_thread) {
+ int rc = db_execute(db, "BEGIN TRANSACTION;");
+ int op_no = 1;
- for (const auto &pending_model: training_thread->pending_model_info) {
- int rc = ml_dimension_add_model(&pending_model.metric_uuid, &pending_model.kmeans);
- if (rc)
- return rc;
+ if (!rc) {
+ op_no++;
- rc = ml_dimension_delete_models(&pending_model.metric_uuid, pending_model.kmeans.before - (Cfg.num_models_to_use * Cfg.train_every));
- if (rc)
- return rc;
+ for (const auto &pending_model: training_thread->pending_model_info) {
+ if (!rc)
+ rc = ml_dimension_add_model(&pending_model.metric_uuid, &pending_model.kmeans);
+
+ if (!rc)
+ rc = ml_dimension_delete_models(&pending_model.metric_uuid, pending_model.kmeans.before - (Cfg.num_models_to_use * Cfg.train_every));
+ }
+ }
+
+ if (!rc) {
+ op_no++;
+ rc = db_execute(db, "COMMIT TRANSACTION;");
}
- (void) db_execute(db, "COMMIT TRANSACTION;");
+ // try to rollback transaction if we got any failures
+ if (rc) {
+ error("Trying to rollback ML transaction because it failed with rc=%d, op_no=%d", rc, op_no);
+ op_no++;
+ rc = db_execute(db, "ROLLBACK;");
+ if (rc)
+ error("ML transaction rollback failed with rc=%d", rc);
+ }
training_thread->pending_model_info.clear();
- return 0;
}
static void *ml_train_main(void *arg) {