ml/ml-private.h


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335

// SPDX-License-Identifier: GPL-3.0-or-later

#ifndef NETDATA_ML_PRIVATE_H
#define NETDATA_ML_PRIVATE_H

#include "dlib/matrix.h"
#include "ml/ml.h"

#include <atomic>
#include <queue>
#include <string>
#include <vector>

// Floating-point type used for the ML values declared in this header.
using calculated_number_t = double;
// Fixed-size dlib matrix of calculated_number_t with 6 rows and 1 column.
using DSample = dlib::matrix<calculated_number_t, 6, 1>;

/*
 * Features
 */

// Parameters and buffers for feature preprocessing.
// NOTE(review): the roles of the individual fields are inferred from their
// names only - confirm against the code that fills in and consumes this struct.
struct ml_features_t {
    // Preprocessing counts (presumably differencing, smoothing and lag sizes)
    size_t diff_n;
    size_t smooth_n;
    size_t lag_n;

    // Destination buffer and its element count; ownership is not expressed here
    calculated_number_t *dst;
    size_t dst_n;

    // Source buffer and its element count; ownership is not expressed here
    calculated_number_t *src;
    size_t src_n;

    // Refers to a vector that lives outside this struct. Being a reference
    // member, it has to be bound when the struct is created, and it makes the
    // struct neither default-constructible nor copy-assignable.
    std::vector<DSample> &preprocessed_features;
};

/*
 * KMeans
 */

// Data stored for a single KMeans model.
struct ml_kmeans_t {
    // One DSample per cluster center
    std::vector<DSample> cluster_centers;

    // Distance bounds kept with the model
    // (presumably used when scoring new samples - confirm)
    calculated_number_t min_dist;
    calculated_number_t max_dist;

    // Time range associated with the model
    // (presumably the window of data it was trained on - confirm)
    uint32_t after;
    uint32_t before;
};

// Counters grouped by category: machine learning status, metric type,
// training status, and anomalous/normal dimensions.
struct machine_learning_stats_t {
    // Machine learning status counters
    size_t num_machine_learning_status_enabled;
    size_t num_machine_learning_status_disabled_sp;

    // Metric type counters
    size_t num_metric_type_constant;
    size_t num_metric_type_variable;

    // Training status counters
    size_t num_training_status_untrained;
    size_t num_training_status_pending_without_model;
    size_t num_training_status_trained;
    size_t num_training_status_pending_with_model;

    // Anomalous vs. normal dimension counters
    size_t num_anomalous_dimensions;
    size_t num_normal_dimensions;
};

// Name under which the counters are referenced by the other types in this header.
typedef machine_learning_stats_t ml_machine_learning_stats_t;

// Statistics about training: queue counters, time accounting, and one counter
// per ml_training_result value.
struct training_stats_t {
    // Queue counters
    size_t queue_size;
    size_t num_popped_items;

    // Time accounting, stored as usec_t
    usec_t allotted_ut;
    usec_t consumed_ut;
    usec_t remaining_ut;

    // Counters named after the ml_training_result enumerators
    size_t training_result_ok;
    size_t training_result_invalid_query_time_range;
    size_t training_result_not_enough_collected_values;
    size_t training_result_null_acquired_dimension;
    size_t training_result_chart_under_replication;
};

// Name under which the statistics are referenced by the other types in this header.
typedef training_stats_t ml_training_stats_t;

// Classification of a dimension by how its values behave.
enum ml_metric_type {
    // Values never change, so there is nothing to train
    METRIC_TYPE_CONSTANT = 0,

    // Values fluctuate, so a model has to be generated
    METRIC_TYPE_VARIABLE = 1,
};

// Whether training/prediction is switched on.
enum ml_machine_learning_status {
    // Training/prediction is enabled
    MACHINE_LEARNING_STATUS_ENABLED = 0,

    // Disabled: the chart's id matches the configured exclusion pattern
    MACHINE_LEARNING_STATUS_DISABLED_DUE_TO_EXCLUDED_CHART = 1,
};

// Where a dimension stands with respect to having a trained model.
enum ml_training_status {
    // No model exists for this dimension
    TRAINING_STATUS_UNTRAINED = 0,

    // A training request was sent, and no model exists yet
    TRAINING_STATUS_PENDING_WITHOUT_MODEL = 1,

    // A request to update the existing models was sent
    TRAINING_STATUS_PENDING_WITH_MODEL = 2,

    // A valid, up-to-date model is available
    TRAINING_STATUS_TRAINED = 3,
};

// Outcome of a training attempt.
enum ml_training_result {
    // A KMeans model was created
    TRAINING_RESULT_OK = 0,

    // The DB could not be queried with a correct time range
    TRAINING_RESULT_INVALID_QUERY_TIME_RANGE = 1,

    // The DB did not provide enough data to run KMeans
    TRAINING_RESULT_NOT_ENOUGH_COLLECTED_VALUES = 2,

    // The acquired dimension was null
    TRAINING_RESULT_NULL_ACQUIRED_DIMENSION = 3,

    // The chart is under replication
    TRAINING_RESULT_CHART_UNDER_REPLICATION = 4,
};

// A request to train one dimension; this is the element type of the
// training queue's std::queue.
struct ml_training_request_t {
    // Identifies the chart/dimension to train
    char machine_guid[GUID_LEN + 1];
    STRING *chart_id;
    STRING *dimension_id;

    // When the request was created
    time_t request_time;

    // First/last entry of the dimension in the DB
    // at the moment the request was created
    time_t first_entry_on_request;
    time_t last_entry_on_request;
};

// Details recorded about how a training request was handled.
struct ml_training_response_t {
    // Creation time of the request this response belongs to
    time_t request_time;

    // First/last entry of the dimension in the DB when the request was generated
    time_t first_entry_on_request;
    time_t last_entry_on_request;

    // First/last entry of the dimension in the DB when the response was generated
    time_t first_entry_on_response;
    time_t last_entry_on_response;

    // After/before timestamps asked of the DB query
    time_t query_after_t;
    time_t query_before_t;

    // After/before timestamps actually reported by the DB query ops
    time_t db_after_t;
    time_t db_before_t;

    // How many doubles the DB query returned
    size_t collected_values;

    // How many values are handed back to the caller
    size_t total_values;

    // Outcome of the training attempt
    enum ml_training_result result;
};

/*
 * Queue
 */

// Queue of training requests bundled with its synchronization members.
struct ml_queue_t {
    // Pending requests (std::queue, first-in first-out)
    std::queue<ml_training_request_t> internal;

    // Mutex and condition variable stored with the queue
    // (presumably guarding/signalling `internal` - confirm in the queue code)
    netdata_mutex_t mutex;
    pthread_cond_t cond_var;

    // Atomic flag (presumably tells waiters to give up - confirm)
    std::atomic<bool> exit;
};

// ML state attached to a single dimension.
struct ml_dimension_t {
    // The dimension this state refers to
    RRDDIM *rd;

    // Current metric type, training status and machine learning status
    enum ml_metric_type mt;
    enum ml_training_status ts;
    enum ml_machine_learning_status mls;

    // A training response (presumably the latest one - confirm) and a
    // training timestamp
    ml_training_response_t tr;
    time_t last_training_time;

    // Buffer of calculated numbers (presumably recently collected values - confirm)
    std::vector<calculated_number_t> cns;

    // KMeans models held for the dimension, with a mutex stored next to them
    // (presumably protecting the models - confirm)
    std::vector<ml_kmeans_t> km_contexts;
    netdata_mutex_t mutex;
    ml_kmeans_t kmeans;

    // Feature samples
    std::vector<DSample> feature;
};

// ML state attached to a single chart.
struct ml_chart_t {
    // The chart this state refers to
    RRDSET *rs;

    // Counters kept at chart level
    ml_machine_learning_stats_t mls;

    // Mutex stored with the chart state
    // (presumably protecting `mls` - confirm)
    netdata_mutex_t mutex;
};

// Declared here, defined elsewhere. Takes the chart state, the dimension state
// and the dimension's anomaly flag.
// NOTE(review): presumably refreshes the chart's counters for `dim` - confirm
// against the definition.
void ml_chart_update_dimension(ml_chart_t *chart, ml_dimension_t *dim, bool is_anomalous);

// ML state attached to a single host: counters, the host anomaly rate, the
// training queue pointer and the chart/dimension handles listed below.
struct ml_host_t {
    // The host this state refers to
    RRDHOST *rh;

    // Counters kept at host level
    ml_machine_learning_stats_t mls;

    calculated_number_t host_anomaly_rate;

    // Mutex stored with the host state (what it protects is decided by the
    // code using this struct)
    netdata_mutex_t mutex;

    // Training queue used by this host; ownership is not expressed here
    ml_queue_t *training_queue;

    /*
     * Chart (RRDSET) and dimension (RRDDIM) handles kept as
     * bookkeeping for the anomaly detection charts
     */

    // Machine learning status chart
    RRDSET *machine_learning_status_rs;
    RRDDIM *machine_learning_status_enabled_rd;
    RRDDIM *machine_learning_status_disabled_sp_rd;

    // Metric type chart
    RRDSET *metric_type_rs;
    RRDDIM *metric_type_constant_rd;
    RRDDIM *metric_type_variable_rd;

    // Training status chart
    RRDSET *training_status_rs;
    RRDDIM *training_status_untrained_rd;
    RRDDIM *training_status_pending_without_model_rd;
    RRDDIM *training_status_trained_rd;
    RRDDIM *training_status_pending_with_model_rd;

    // Anomalous/normal dimensions chart
    RRDSET *dimensions_rs;
    RRDDIM *dimensions_anomalous_rd;
    RRDDIM *dimensions_normal_rd;

    // Anomaly rate chart
    RRDSET *anomaly_rate_rs;
    RRDDIM *anomaly_rate_rd;

    // Detector events chart
    RRDSET *detector_events_rs;
    RRDDIM *detector_events_above_threshold_rd;
    RRDDIM *detector_events_new_anomaly_event_rd;
};

// Pairs a metric UUID with a KMeans model.
struct ml_model_info_t {
    uuid_t metric_uuid;
    ml_kmeans_t kmeans;
};

typedef struct {
    size_t id;
    netdata_thread_t nd_thread;
    netdata_mutex_t nd_mutex;

    ml_queue_t *training_queue;
    ml_training_stats_t training_stats;

    calculated_number_t *training_cns;
    calculated_number_t *scratch_training_cns;
    std::vector<DSample> training_samples;

    std::vector<ml_model_info_t> pending_model_info;

    RRDSET *queue_stats_rs;
    RRDDIM *queue_stats_queue_size_rd;
    RRDDIM *queue_stats_popped_items_rd;

    RRDSET *training_time_stats_rs;
    RRDDIM *training_time_stats_allotted_rd;
    RRDDIM *training_time_stats_consumed_rd;
    RRDDIM *training_time_stats_remaining_rd;

    RRDSET *training_results_rs;
    RRDDIM *training_results_ok_rd;
    RRDDIM *training_results_invalid_query_time_range_rd;
    RRDDIM *training_results_not_enough_collected_values_rd;
    RRDDIM *training_results_null_acquired_dimension_rd;
    RRDDIM *training_results_chart_under_replication_rd;
} ml_training_thread_t;

// ML configuration together with the thread handles and stop flags kept next
// to it.
// NOTE(review): the group descriptions below are derived from the field names;
// exact semantics live in the code that loads and consumes the configuration.
struct ml_config_t {
    // Master switch for anomaly detection
    bool enable_anomaly_detection;

    // Training sample limits and training interval
    unsigned max_train_samples;
    unsigned min_train_samples;
    unsigned train_every;

    // Number of models to use
    unsigned num_models_to_use;

    unsigned db_engine_anomaly_rate_every;

    // Feature preprocessing counts
    unsigned diff_n;
    unsigned smooth_n;
    unsigned lag_n;

    // KMeans training knobs
    double random_sampling_ratio;
    unsigned max_kmeans_iters;

    // Anomaly score threshold for a dimension
    double dimension_anomaly_score_threshold;

    // Host-level anomaly detection settings
    double host_anomaly_rate_threshold;
    RRDR_TIME_GROUPING anomaly_detection_grouping_method;
    time_t anomaly_detection_query_duration;

    bool stream_anomaly_detection_charts;

    // Hosts to skip: a string and a SIMPLE_PATTERN pointer
    // (presumably the compiled form of the string - confirm)
    std::string hosts_to_skip;
    SIMPLE_PATTERN *sp_host_to_skip;

    // Charts to skip: a string and a SIMPLE_PATTERN pointer
    // (presumably the compiled form of the string - confirm)
    std::string charts_to_skip;
    SIMPLE_PATTERN *sp_charts_to_skip;

    std::vector<uint32_t> random_nums;

    // Detection thread handle and its atomic stop flag
    netdata_thread_t detection_thread;
    std::atomic<bool> detection_stop;

    size_t num_training_threads;
    size_t flush_models_batch_size;

    // Training thread states and their atomic stop flag
    std::vector<ml_training_thread_t> training_threads;
    std::atomic<bool> training_stop;

    bool enable_statistics_charts;
};

// Declared here, defined elsewhere. Takes a pointer to the configuration
// object.
// NOTE(review): presumably fills in `*cfg` from the agent's configuration -
// confirm against the definition.
void ml_config_load(ml_config_t *cfg);

// Global ML configuration object; only declared here, its definition lives in
// another translation unit.
extern ml_config_t Cfg;

#endif /* NETDATA_ML_PRIVATE_H */