summaryrefslogtreecommitdiffstats
path: root/src/health.h
blob: 7028a914b2581e0bb798a5fd1d77732d02021d3a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
#ifndef NETDATA_HEALTH_H
#define NETDATA_HEALTH_H

extern int default_health_enabled;

extern int rrdvar_compare(void *a, void *b);

#define RRDVAR_TYPE_CALCULATED              1
#define RRDVAR_TYPE_TIME_T                  2
#define RRDVAR_TYPE_COLLECTED               3
#define RRDVAR_TYPE_TOTAL                   4
#define RRDVAR_TYPE_INT                     5
#define RRDVAR_TYPE_CALCULATED_ALLOCATED    6


// the variables as stored in the variables indexes
// there are 3 indexes:
// 1. at each chart   (RRDSET.variables_root_index)
// 2. at each context (RRDFAMILY.variables_root_index)
// 3. at each host    (RRDHOST.variables_root_index)
typedef struct rrdvar {
    avl avl;

    char *name;
    uint32_t hash;

    int type;
    void *value;

    time_t last_updated;
} RRDVAR;

// variables linked to charts
// We link variables to point to the values that are already
// calculated / processed by the normal data collection process
// This means, there will be no speed penalty for using
// these variables
typedef struct rrdsetvar {
    char *key_fullid;               // chart type.chart id.variable
    char *key_fullname;             // chart type.chart name.variable
    char *variable;             // variable

    int type;
    void *value;

    uint32_t options;

    RRDVAR *var_local;
    RRDVAR *var_family;
    RRDVAR *var_host;
    RRDVAR *var_family_name;
    RRDVAR *var_host_name;

    struct rrdset *rrdset;

    struct rrdsetvar *next;
} RRDSETVAR;


// variables linked to individual dimensions
// We link variables to point the values that are already
// calculated / processed by the normal data collection process
// This means, there will be no speed penalty for using
// these variables
typedef struct rrddimvar {
    char *prefix;
    char *suffix;

    char *key_id;                   // dimension id
    char *key_name;                 // dimension name
    char *key_contextid;            // context + dimension id
    char *key_contextname;          // context + dimension name
    char *key_fullidid;             // chart type.chart id + dimension id
    char *key_fullidname;           // chart type.chart id + dimension name
    char *key_fullnameid;           // chart type.chart name + dimension id
    char *key_fullnamename;         // chart type.chart name + dimension name

    int type;
    void *value;

    uint32_t options;

    RRDVAR *var_local_id;
    RRDVAR *var_local_name;

    RRDVAR *var_family_id;
    RRDVAR *var_family_name;
    RRDVAR *var_family_contextid;
    RRDVAR *var_family_contextname;

    RRDVAR *var_host_chartidid;
    RRDVAR *var_host_chartidname;
    RRDVAR *var_host_chartnameid;
    RRDVAR *var_host_chartnamename;

    struct rrddim *rrddim;

    struct rrddimvar *next;
} RRDDIMVAR;

// calculated variables (defined in health configuration)
// These aggregate time-series data at fixed intervals
// (defined in their update_every member below)
// These increase the overhead of netdata.
//
// These calculations are allocated and linked (->next)
// under RRDHOST.
// Then are also linked to RRDSET (of course only when the
// chart is found, via ->rrdset_next and ->rrdset_prev).
// This double-linked list is maintained sorted at all times
// having as RRDSET.calculations the RRDCALC to be processed
// next.

#define RRDCALC_STATUS_REMOVED       -2
#define RRDCALC_STATUS_UNDEFINED     -1
#define RRDCALC_STATUS_UNINITIALIZED  0
#define RRDCALC_STATUS_CLEAR          1
#define RRDCALC_STATUS_RAISED         2
#define RRDCALC_STATUS_WARNING        3
#define RRDCALC_STATUS_CRITICAL       4

#define RRDCALC_FLAG_DB_ERROR              0x00000001
#define RRDCALC_FLAG_DB_NAN                0x00000002
/* #define RRDCALC_FLAG_DB_STALE           0x00000004 */
#define RRDCALC_FLAG_CALC_ERROR            0x00000008
#define RRDCALC_FLAG_WARN_ERROR            0x00000010
#define RRDCALC_FLAG_CRIT_ERROR            0x00000020
#define RRDCALC_FLAG_RUNNABLE              0x00000040
#define RRDCALC_FLAG_NO_CLEAR_NOTIFICATION 0x80000000

typedef struct rrdcalc {
    uint32_t id;                    // the unique id of this alarm
    uint32_t next_event_id;         // the next event id that will be used for this alarm

    char *name;                     // the name of this alarm
    uint32_t hash;      

    char *exec;                     // the command to execute when this alarm switches state
    char *recipient;                // the recipient of the alarm (the first parameter to exec)

    char *chart;                    // the chart id this should be linked to
    uint32_t hash_chart;

    char *source;                   // the source of this alarm
    char *units;                    // the units of the alarm
    char *info;                     // a short description of the alarm

    int update_every;               // update frequency for the alarm

    // the red and green threshold of this alarm (to be set to the chart)
    calculated_number green;
    calculated_number red;

    // ------------------------------------------------------------------------
    // database lookup settings

    char *dimensions;               // the chart dimensions
    int group;                      // grouping method: average, max, etc.
    int before;                     // ending point in time-series
    int after;                      // starting point in time-series
    uint32_t options;               // calculation options

    // ------------------------------------------------------------------------
    // expressions related to the alarm

    EVAL_EXPRESSION *calculation;   // expression to calculate the value of the alarm
    EVAL_EXPRESSION *warning;       // expression to check the warning condition
    EVAL_EXPRESSION *critical;      // expression to check the critical condition

    // ------------------------------------------------------------------------
    // notification delay settings

    int delay_up_duration;         // duration to delay notifications when alarm raises
    int delay_down_duration;       // duration to delay notifications when alarm lowers
    int delay_max_duration;        // the absolute max delay to apply to this alarm
    float delay_multiplier;        // multiplier for all delays when alarms switch status
                                   // while now < delay_up_to

    // ------------------------------------------------------------------------
    // runtime information

    int status;                     // the current status of the alarm

    calculated_number value;        // the current value of the alarm
    calculated_number old_value;    // the previous value of the alarm

    uint32_t rrdcalc_flags;         // check RRDCALC_FLAG_*

    time_t last_updated;            // the last update timestamp of the alarm
    time_t next_update;             // the next update timestamp of the alarm
    time_t last_status_change;      // the timestamp of the last time this alarm changed status

    time_t db_after;                // the first timestamp evaluated by the db lookup
    time_t db_before;               // the last timestamp evaluated by the db lookup

    time_t delay_up_to_timestamp;   // the timestamp up to which we should delay notifications
    int delay_up_current;           // the current up notification delay duration
    int delay_down_current;         // the current down notification delay duration
    int delay_last;                 // the last delay we used

    // ------------------------------------------------------------------------
    // variables this alarm exposes to the rest of the alarms

    RRDVAR *local;
    RRDVAR *family;
    RRDVAR *hostid;
    RRDVAR *hostname;

    // ------------------------------------------------------------------------
    // the chart this alarm it is linked to

    struct rrdset *rrdset;

    // linking of this alarm on its chart
    struct rrdcalc *rrdset_next;
    struct rrdcalc *rrdset_prev;

    struct rrdcalc *next;
} RRDCALC;

#define RRDCALC_HAS_DB_LOOKUP(rc) ((rc)->after)

// RRDCALCTEMPLATE
// these are to be applied to charts found dynamically
// based on their context.
typedef struct rrdcalctemplate {
    char *name;
    uint32_t hash_name;

    char *exec;
    char *recipient;

    char *context;
    uint32_t hash_context;

    char *family_match;
    SIMPLE_PATTERN *family_pattern;

    char *source;                   // the source of this alarm
    char *units;                    // the units of the alarm
    char *info;                     // a short description of the alarm

    int update_every;               // update frequency for the alarm

    // the red and green threshold of this alarm (to be set to the chart)
    calculated_number green;
    calculated_number red;

    // ------------------------------------------------------------------------
    // database lookup settings

    char *dimensions;               // the chart dimensions
    int group;                      // grouping method: average, max, etc.
    int before;                     // ending point in time-series
    int after;                      // starting point in time-series
    uint32_t options;               // calculation options

    // ------------------------------------------------------------------------
    // notification delay settings

    int delay_up_duration;         // duration to delay notifications when alarm raises
    int delay_down_duration;       // duration to delay notifications when alarm lowers
    int delay_max_duration;        // the absolute max delay to apply to this alarm
    float delay_multiplier;        // multiplier for all delays when alarms switch status

    // ------------------------------------------------------------------------
    // expressions related to the alarm

    EVAL_EXPRESSION *calculation;
    EVAL_EXPRESSION *warning;
    EVAL_EXPRESSION *critical;

    struct rrdcalctemplate *next;
} RRDCALCTEMPLATE;

#define RRDCALCTEMPLATE_HAS_CALCULATION(rt) ((rt)->after)

#define HEALTH_ENTRY_FLAG_PROCESSED             0x00000001
#define HEALTH_ENTRY_FLAG_UPDATED               0x00000002
#define HEALTH_ENTRY_FLAG_EXEC_RUN              0x00000004
#define HEALTH_ENTRY_FLAG_EXEC_FAILED           0x00000008
#define HEALTH_ENTRY_FLAG_SAVED                 0x10000000
#define HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION 0x80000000

typedef struct alarm_entry {
    uint32_t unique_id;
    uint32_t alarm_id;
    uint32_t alarm_event_id;

    time_t when;
    time_t duration;
    time_t non_clear_duration;

    char *name;
    uint32_t hash_name;

    char *chart;
    uint32_t hash_chart;

    char *family;

    char *exec;
    char *recipient;
    time_t exec_run_timestamp;
    int exec_code;

    char *source;
    char *units;
    char *info;

    calculated_number old_value;
    calculated_number new_value;

    char *old_value_string;
    char *new_value_string;

    int old_status;
    int new_status;

    uint32_t flags;

    int delay;
    time_t delay_up_to_timestamp;

    uint32_t updated_by_id;
    uint32_t updates_id;
    
    struct alarm_entry *next;
} ALARM_ENTRY;

typedef struct alarm_log {
    uint32_t next_log_id;
    uint32_t next_alarm_id;
    unsigned int count;
    unsigned int max;
    ALARM_ENTRY *alarms;
    netdata_rwlock_t alarm_log_rwlock;
} ALARM_LOG;

#include "rrd.h"

extern void rrdsetvar_rename_all(RRDSET *st);
extern RRDSETVAR *rrdsetvar_create(RRDSET *st, const char *variable, int type, void *value, uint32_t options);
extern void rrdsetvar_free(RRDSETVAR *rs);

extern void rrddimvar_rename_all(RRDDIM *rd);
extern RRDDIMVAR *rrddimvar_create(RRDDIM *rd, int type, const char *prefix, const char *suffix, void *value, uint32_t options);
extern void rrddimvar_free(RRDDIMVAR *rs);

extern void rrdsetcalc_link_matching(RRDSET *st);
extern void rrdsetcalc_unlink(RRDCALC *rc);
extern void rrdcalctemplate_link_matching(RRDSET *st);
extern RRDCALC *rrdcalc_find(RRDSET *st, const char *name);

extern void health_init(void);
extern void *health_main(void *ptr);

extern void health_reload(void);

extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *rc, calculated_number *result);
extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after);

void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf);

extern RRDVAR *rrdvar_custom_host_variable_create(RRDHOST *host, const char *name);
extern void rrdvar_custom_host_variable_destroy(RRDHOST *host, const char *name);
extern void rrdvar_custom_host_variable_set(RRDVAR *rv, calculated_number value);

extern const char *rrdcalc_status2string(int status);


extern int health_alarm_log_open(RRDHOST *host);
extern void health_alarm_log_close(RRDHOST *host);
extern void health_log_rotate(RRDHOST *host);
extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae);
extern ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename);
extern void health_alarm_log_load(RRDHOST *host);
extern void health_alarm_log(
        RRDHOST *host,
        uint32_t alarm_id,
        uint32_t alarm_event_id,
        time_t when,
        const char *name,
        const char *chart,
        const char *family,
        const char *exec,
        const char *recipient,
        time_t duration,
        calculated_number old_value,
        calculated_number new_value,
        int old_status,
        int new_status,
        const char *source,
        const char *units,
        const char *info,
        int delay,
        uint32_t flags
);

extern void health_readdir(RRDHOST *host, const char *path);
extern char *health_config_dir(void);
extern void health_reload_host(RRDHOST *host);
extern void health_alarm_log_free(RRDHOST *host);

extern void rrdcalc_free(RRDHOST *host, RRDCALC *rc);
extern void rrdcalctemplate_free(RRDHOST *host, RRDCALCTEMPLATE *rt);

#ifdef NETDATA_HEALTH_INTERNALS
#define RRDVAR_MAX_LENGTH 1024

extern int rrdcalc_exists(RRDHOST *host, const char *chart, const char *name, uint32_t hash_chart, uint32_t hash_name);
extern uint32_t rrdcalc_get_unique_id(RRDHOST *host, const char *chart, const char *name, uint32_t *next_event_id);
extern int rrdvar_fix_name(char *variable);

extern RRDCALC *rrdcalc_create(RRDHOST *host, RRDCALCTEMPLATE *rt, const char *chart);
extern void rrdcalc_create_part2(RRDHOST *host, RRDCALC *rc);

extern RRDVAR *rrdvar_create_and_index(const char *scope, avl_tree_lock *tree, const char *name, int type, void *value);
extern void rrdvar_free(RRDHOST *host, avl_tree_lock *tree, RRDVAR *rv);

extern void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae);

#endif // NETDATA_HEALTH_INTERNALS

#endif //NETDATA_HEALTH_H