diff options
Diffstat (limited to '')
-rw-r--r-- | src/health/health_prototypes.c | 685 |
1 files changed, 685 insertions, 0 deletions
diff --git a/src/health/health_prototypes.c b/src/health/health_prototypes.c new file mode 100644 index 000000000..ceefbc222 --- /dev/null +++ b/src/health/health_prototypes.c @@ -0,0 +1,685 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health_internals.h" + +// --------------------------------------------------------------------------------------------------------------------- + +static struct { + ALERT_LOOKUP_DIMS_GROUPING group; + const char *name; +} dims_grouping[] = { + { .group = ALERT_LOOKUP_DIMS_SUM, .name = "sum" }, + { .group = ALERT_LOOKUP_DIMS_MIN, .name = "min" }, + { .group = ALERT_LOOKUP_DIMS_MAX, .name = "max" }, + { .group = ALERT_LOOKUP_DIMS_AVERAGE, .name = "average" }, + { .group = ALERT_LOOKUP_DIMS_MIN2MAX, .name = "min2max" }, + + // terminator + { .group = 0, .name = NULL }, +}; + +ALERT_LOOKUP_DIMS_GROUPING alerts_dims_grouping2id(const char *group) { + if(!group || !*group) + return dims_grouping[0].group; + + for(size_t i = 0; dims_grouping[i].name ;i++) { + if(strcmp(dims_grouping[i].name, group) == 0) + return dims_grouping[i].group; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert lookup dimensions grouping '%s' is not valid", group); + return dims_grouping[0].group; +} + +const char *alerts_dims_grouping_id2group(ALERT_LOOKUP_DIMS_GROUPING grouping) { + for(size_t i = 0; dims_grouping[i].name ;i++) { + if(grouping == dims_grouping[i].group) + return dims_grouping[i].name; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert lookup dimensions grouping %d is not valid", grouping); + return dims_grouping[0].name; +} + +// --------------------------------------------------------------------------------------------------------------------- + +static struct { + ALERT_LOOKUP_DATA_SOURCE source; + const char *name; +} data_sources[] = { + { .source = ALERT_LOOKUP_DATA_SOURCE_SAMPLES, .name = "samples" }, + { .source = ALERT_LOOKUP_DATA_SOURCE_PERCENTAGES, .name = "percentages" }, + { .source = ALERT_LOOKUP_DATA_SOURCE_ANOMALIES, .name = "anomalies" }, + + // terminator + { .source = 0, .name = NULL }, +}; + +ALERT_LOOKUP_DATA_SOURCE alerts_data_sources2id(const char *source) { + if(!source || !*source) + return data_sources[0].source; + + for(size_t i = 0; data_sources[i].name ;i++) { + if(strcmp(data_sources[i].name, source) == 0) + return data_sources[i].source; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert data source '%s' is not valid", source); + return data_sources[0].source; +} + +const char *alerts_data_source_id2source(ALERT_LOOKUP_DATA_SOURCE source) { + for(size_t i = 0; data_sources[i].name ;i++) { + if(source == data_sources[i].source) + return data_sources[i].name; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert data source %d is not valid", source); + return data_sources[0].name; +} + +// --------------------------------------------------------------------------------------------------------------------- + +static struct { + ALERT_LOOKUP_TIME_GROUP_CONDITION condition; + const char *name; +} group_conditions[] = { + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_EQUAL, .name = "=" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_NOT_EQUAL, .name = "!=" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_GREATER, .name = ">" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_GREATER_EQUAL, .name = ">=" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_LESS, .name = "<" }, + { .condition = ALERT_LOOKUP_TIME_GROUP_CONDITION_LESS_EQUAL, .name = "<=" }, + + // terminator + { .condition = 0, .name = NULL }, +}; + +ALERT_LOOKUP_TIME_GROUP_CONDITION alerts_group_condition2id(const char *source) { + if(!source || !*source) + return group_conditions[0].condition; + + for(size_t i = 0; group_conditions[i].name ;i++) { + if(strcmp(group_conditions[i].name, source) == 0) + return group_conditions[i].condition; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert data source '%s' is not valid", source); + return group_conditions[0].condition; +} + +const char *alerts_group_conditions_id2txt(ALERT_LOOKUP_TIME_GROUP_CONDITION source) { + for(size_t i = 0; group_conditions[i].name ;i++) { + if(source == group_conditions[i].condition) + return group_conditions[i].name; + } + + nd_log(NDLS_DAEMON, NDLP_WARNING, "Alert data source %d is not valid", source); + return group_conditions[0].name; +} + +// --------------------------------------------------------------------------------------------------------------------- + +static struct { + const char *name; + uint32_t hash; + ALERT_ACTION_OPTIONS value; +} alert_action_options[] = { + { "no-clear-notification", 0 , ALERT_ACTION_OPTION_NO_CLEAR_NOTIFICATION} + + // terminator + , {NULL, 0, 0} +}; + +inline ALERT_ACTION_OPTIONS alert_action_options_parse_one(const char *o) { + ALERT_ACTION_OPTIONS ret = 0; + + if(!o || !*o) return ret; + + uint32_t hash = simple_hash(o); + int i; + for(i = 0; alert_action_options[i].name ; i++) { + if (unlikely(hash == alert_action_options[i].hash && !strcmp(o, alert_action_options[i].name))) { + ret |= alert_action_options[i].value; + break; + } + } + + return ret; +} + +inline ALERT_ACTION_OPTIONS alert_action_options_parse(char *o) { + ALERT_ACTION_OPTIONS ret = 0; + char *tok; + + while(o && *o && (tok = strsep_skip_consecutive_separators(&o, ", |"))) { + if(!*tok) continue; + ret |= alert_action_options_parse_one(tok); + } + + return ret; +} + +void alert_action_options_to_buffer_json_array(BUFFER *wb, const char *key, ALERT_ACTION_OPTIONS options) { + buffer_json_member_add_array(wb, key); + + RRDR_OPTIONS used = 0; // to prevent adding duplicates + for(int i = 0; alert_action_options[i].name ; i++) { + if (unlikely((alert_action_options[i].value & options) && !(alert_action_options[i].value & used))) { + const char *name = alert_action_options[i].name; + used |= alert_action_options[i].value; + + buffer_json_add_array_item_string(wb, name); + } + } + + buffer_json_array_close(wb); +} + +static void alert_action_options_init(void) { + for(int i = 0; alert_action_options[i].name ; i++) + alert_action_options[i].hash = simple_hash(alert_action_options[i].name); +} + + +// --------------------------------------------------------------------------------------------------------------------- + +static void health_prototype_cleanup_one_unsafe(RRD_ALERT_PROTOTYPE *ap) { + rrd_alert_match_cleanup(&ap->match); + rrd_alert_config_cleanup(&ap->config); +} + +void health_prototype_cleanup(RRD_ALERT_PROTOTYPE *ap) { + spinlock_lock(&ap->_internal.spinlock); + + while(ap->_internal.next) { + RRD_ALERT_PROTOTYPE *t = ap->_internal.next; + DOUBLE_LINKED_LIST_REMOVE_ITEM_UNSAFE(ap->_internal.next, t, _internal.prev, _internal.next); + health_prototype_cleanup_one_unsafe(t); + freez(t); + } + + spinlock_unlock(&ap->_internal.spinlock); + + health_prototype_cleanup_one_unsafe(ap); +} + +void health_prototype_free(RRD_ALERT_PROTOTYPE *ap) { + if(!ap) return; + health_prototype_cleanup(ap); + freez(ap); +} + +void health_prototype_insert_cb(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) { + RRD_ALERT_PROTOTYPE *ap = value; + spinlock_init(&ap->_internal.spinlock); + if(ap->config.source_type != DYNCFG_SOURCE_TYPE_DYNCFG) + ap->_internal.is_on_disk = true; +} + +bool health_prototype_conflict_cb(const DICTIONARY_ITEM *item __maybe_unused, void *old_value, void *new_value, void *data __maybe_unused) { + RRD_ALERT_PROTOTYPE *ap = old_value; + RRD_ALERT_PROTOTYPE *nap = new_value; + + bool replace = nap->config.source_type == DYNCFG_SOURCE_TYPE_DYNCFG; + + if(ap->config.source_type != DYNCFG_SOURCE_TYPE_DYNCFG || nap->config.source_type != DYNCFG_SOURCE_TYPE_DYNCFG) + ap->_internal.is_on_disk = nap->_internal.is_on_disk = true; + + if(!replace) { + if(ap->config.source_type == DYNCFG_SOURCE_TYPE_DYNCFG) { + // the existing is a dyncfg and the new one is read from the config + health_prototype_cleanup(nap); + memset(nap, 0, sizeof(*nap)); + } + else { + // alerts with the same name are appended to the existing one + nap = callocz(1, sizeof(*nap)); + memcpy(nap, new_value, sizeof(*nap)); + + spinlock_lock(&ap->_internal.spinlock); + DOUBLE_LINKED_LIST_APPEND_ITEM_UNSAFE(ap->_internal.next, nap, _internal.prev, _internal.next); + spinlock_unlock(&ap->_internal.spinlock); + + if(nap->_internal.enabled) + ap->_internal.enabled = true; + } + } + else { + // alerts with the same name replace the existing one + spinlock_init(&nap->_internal.spinlock); + nap->_internal.uses = ap->_internal.uses; + + spinlock_lock(&nap->_internal.spinlock); + spinlock_lock(&ap->_internal.spinlock); + SWAP(*ap, *nap); + spinlock_unlock(&ap->_internal.spinlock); + spinlock_unlock(&nap->_internal.spinlock); + + health_prototype_cleanup(nap); + memset(nap, 0, sizeof(*nap)); + } + + return true; +} + +void health_prototype_delete_cb(const DICTIONARY_ITEM *item __maybe_unused, void *value, void *data __maybe_unused) { + RRD_ALERT_PROTOTYPE *ap = value; + health_prototype_cleanup(ap); +} + +void health_init_prototypes(void) { + if(health_globals.prototypes.dict) + return; + + health_globals.prototypes.dict = dictionary_create(DICT_OPTION_DONT_OVERWRITE_VALUE); + dictionary_register_insert_callback(health_globals.prototypes.dict, health_prototype_insert_cb, NULL); + dictionary_register_conflict_callback(health_globals.prototypes.dict, health_prototype_conflict_cb, NULL); + dictionary_register_delete_callback(health_globals.prototypes.dict, health_prototype_delete_cb, NULL); + + alert_action_options_init(); +} + +// --------------------------------------------------------------------------------------------------------------------- + +static inline struct pattern_array *health_config_add_key_to_values(struct pattern_array *pa, const char *input_key, char *value) +{ + char key[HEALTH_CONF_MAX_LINE + 1]; + char data[HEALTH_CONF_MAX_LINE + 1]; + + char *s = value; + size_t i = 0; + + char pair[HEALTH_CONF_MAX_LINE + 1]; + if (input_key) + strncpyz(key, input_key, HEALTH_CONF_MAX_LINE); + else + key[0] = '\0'; + + while(*s) { + if (*s == '=') { + //hold the key + data[i]='\0'; + strncpyz(key, data, HEALTH_CONF_MAX_LINE); + i=0; + } else if (*s == ' ') { + data[i]='\0'; + if (data[0]=='!') + snprintfz(pair, HEALTH_CONF_MAX_LINE, "!%s=%s ", key, data + 1); + else + snprintfz(pair, HEALTH_CONF_MAX_LINE, "%s=%s ", key, data); + + pa = pattern_array_add_key_simple_pattern(pa, key, simple_pattern_create(pair, NULL, SIMPLE_PATTERN_EXACT, true)); + i=0; + } else { + data[i++] = *s; + } + s++; + } + data[i]='\0'; + if (data[0]) { + if (data[0]=='!') + snprintfz(pair, HEALTH_CONF_MAX_LINE, "!%s=%s ", key, data + 1); + else + snprintfz(pair, HEALTH_CONF_MAX_LINE, "%s=%s ", key, data); + + pa = pattern_array_add_key_simple_pattern(pa, key, simple_pattern_create(pair, NULL, SIMPLE_PATTERN_EXACT, true)); + } + + return pa; +} + +static char *simple_pattern_trim_around_equal(const char *src) { + char *store = mallocz(strlen(src) + 1); + + char *dst = store; + while (*src) { + if (*src == '=') { + if (*(dst -1) == ' ') + dst--; + + *dst++ = *src++; + if (*src == ' ') + src++; + } + + *dst++ = *src++; + } + *dst = 0x00; + + return store; +} + +struct pattern_array *trim_and_add_key_to_values(struct pattern_array *pa, const char *key, STRING *input) { + char *tmp = simple_pattern_trim_around_equal(string2str(input)); + pa = health_config_add_key_to_values(pa, key, tmp); + freez(tmp); + return pa; +} + +static void health_prototype_activate_match_patterns(struct rrd_alert_match *am) { + if(am->host_labels) { + pattern_array_free(am->host_labels_pattern); + am->host_labels_pattern = NULL; + am->host_labels_pattern = trim_and_add_key_to_values(am->host_labels_pattern, NULL, am->host_labels); + } + + if(am->chart_labels) { + pattern_array_free(am->chart_labels_pattern); + am->chart_labels_pattern = NULL; + am->chart_labels_pattern = trim_and_add_key_to_values(am->chart_labels_pattern, NULL, am->chart_labels); + } +} + +void health_prototype_hash_id(RRD_ALERT_PROTOTYPE *ap) { + CLEAN_BUFFER *wb = buffer_create(100, NULL); + health_prototype_to_json(wb, ap, true); + UUID uuid = UUID_generate_from_hash(buffer_tostring(wb), buffer_strlen(wb)); + uuid_copy(ap->config.hash_id, uuid.uuid); + + (void) sql_alert_store_config(ap); +} + +bool health_prototype_add(RRD_ALERT_PROTOTYPE *ap) { + if(!ap->match.is_template) { + if(!ap->match.on.chart) { + netdata_log_error( + "HEALTH: alert '%s' does not define a instance (parameter 'on'). Source: %s", + string2str(ap->config.name), string2str(ap->config.source)); + return false; + } + } + else { + if(!ap->match.on.context) { + netdata_log_error( + "HEALTH: alert '%s' does not define a context (parameter 'on'). Source: %s", + string2str(ap->config.name), string2str(ap->config.source)); + return false; + } + } + + if(!ap->config.update_every) { + netdata_log_error( + "HEALTH: alert '%s' has no frequency (parameter 'every'). Source: %s", + string2str(ap->config.name), string2str(ap->config.source)); + return false; + } + + if(!RRDCALC_HAS_DB_LOOKUP(ap) && !ap->config.calculation && !ap->config.warning && !ap->config.critical) { + netdata_log_error( + "HEALTH: alert '%s' is useless (no db lookup, no calculation, no warning and no critical expressions). Source: %s", + string2str(ap->config.name), string2str(ap->config.source)); + return false; + } + + // activate the match patterns in it + bool enabled = false; + for(RRD_ALERT_PROTOTYPE *t = ap; t ;t = t->_internal.next) { + // we need to generate config_hash_id for each instance included + // so, let's break the linked list for this iteration + + RRD_ALERT_PROTOTYPE *prev = t->_internal.prev; + RRD_ALERT_PROTOTYPE *next = t->_internal.next; + t->_internal.prev = t; + t->_internal.next = NULL; + + if(t->match.enabled) + enabled = true; + + if(!t->config.name) + t->config.name = string_dup(ap->config.name); + + health_prototype_hash_id(t); + + health_prototype_activate_match_patterns(&t->match); + + if (!t->config.exec) + t->config.exec = string_dup(health_globals.config.default_exec); + + if (!t->config.recipient) + t->config.recipient = string_dup(health_globals.config.default_recipient); + + // restore the linked list + t->_internal.prev = prev; + t->_internal.next = next; + } + ap->_internal.enabled = enabled; + + // add it to the prototypes + dictionary_set_advanced(health_globals.prototypes.dict, + string2str(ap->config.name), string_strlen(ap->config.name), + ap, sizeof(*ap), + NULL); + + return true; +} + +// --------------------------------------------------------------------------------------------------------------------- + +void health_reload_prototypes(void) { + // remove all dyncfg related to prototypes + health_dyncfg_unregister_all_prototypes(); + + // clear old prototypes from memory + dictionary_flush(health_globals.prototypes.dict); + + // load the prototypes from disk + recursive_config_double_dir_load( + health_user_config_dir(), + health_globals.config.stock_enabled ? health_stock_config_dir() : NULL, + NULL, + health_readfile, + NULL, 0); + + // register all loaded prototypes + health_dyncfg_register_all_prototypes(); +} + +// --------------------------------------------------------------------------------------------------------------------- + +static bool prototype_matches_host(RRDHOST *host, RRD_ALERT_PROTOTYPE *ap) { + if(health_globals.config.enabled_alerts && + !simple_pattern_matches(health_globals.config.enabled_alerts, string2str(ap->config.name))) + return false; + + if (host->rrdlabels && ap->match.host_labels_pattern && + !pattern_array_label_match(ap->match.host_labels_pattern, host->rrdlabels, '=', NULL)) + return false; + + return true; +} + +static bool prototype_matches_rrdset(RRDSET *st, RRD_ALERT_PROTOTYPE *ap) { + // match the chart id + if(!ap->match.is_template && ap->match.on.chart && + ap->match.on.chart != st->id && ap->match.on.chart != st->name) + return false; + + // match the chart context + if(ap->match.is_template && ap->match.on.context && + ap->match.on.context != st->context) + return false; + + if (st->rrdlabels && ap->match.chart_labels_pattern && + !pattern_array_label_match(ap->match.chart_labels_pattern, st->rrdlabels, '=', NULL)) + return false; + + return true; +} + +void health_prototype_copy_match_without_patterns(struct rrd_alert_match *dst, struct rrd_alert_match *src) { + dst->enabled = src->enabled; + dst->is_template = src->is_template; + + if(dst->is_template) + dst->on.context = string_dup(src->on.context); + else + dst->on.chart = string_dup(src->on.chart); + + dst->host_labels = string_dup(src->host_labels); + dst->chart_labels = string_dup(src->chart_labels); +} + +void health_prototype_copy_config(struct rrd_alert_config *dst, struct rrd_alert_config *src) { + uuid_copy(dst->hash_id, src->hash_id); + + dst->name = string_dup(src->name); + + dst->exec = string_dup(src->exec); + dst->recipient = string_dup(src->recipient); + + dst->classification = string_dup(src->classification); + dst->component = string_dup(src->component); + dst->type = string_dup(src->type); + + dst->source_type = src->source_type; + dst->source = string_dup(src->source); + dst->units = string_dup(src->units); + dst->summary = string_dup(src->summary); + dst->info = string_dup(src->info); + + dst->update_every = src->update_every; + + dst->alert_action_options = src->alert_action_options; + + dst->dimensions = string_dup(src->dimensions); + + dst->time_group = src->time_group; + dst->time_group_condition = src->time_group_condition; + dst->time_group_value = src->time_group_value; + dst->dims_group = src->dims_group; + dst->data_source = src->data_source; + dst->before = src->before; + dst->after = src->after; + dst->options = src->options; + + const char *failed_at = NULL; + int error = 0; + + dst->calculation = expression_parse(expression_source(src->calculation), &failed_at, &error); + dst->warning = expression_parse(expression_source(src->warning), &failed_at, &error); + dst->critical = expression_parse(expression_source(src->critical), &failed_at, &error); + + dst->delay_up_duration = src->delay_up_duration; + dst->delay_down_duration = src->delay_down_duration; + dst->delay_max_duration = src->delay_max_duration; + dst->delay_multiplier = src->delay_multiplier; + + dst->has_custom_repeat_config = src->has_custom_repeat_config; + dst->warn_repeat_every = src->warn_repeat_every; + dst->crit_repeat_every = src->crit_repeat_every; +} + +static void health_prototype_apply_to_rrdset(RRDSET *st, RRD_ALERT_PROTOTYPE *ap) { + if(!ap->_internal.enabled) + return; + + spinlock_lock(&ap->_internal.spinlock); + for(RRD_ALERT_PROTOTYPE *t = ap; t ; t = t->_internal.next) { + if(!t->match.enabled) + continue; + + if(!prototype_matches_host(st->rrdhost, t)) + continue; + + if(!prototype_matches_rrdset(st, t)) + continue; + + if(rrdcalc_add_from_prototype(st->rrdhost, st, t)) + ap->_internal.uses++; + } + spinlock_unlock(&ap->_internal.spinlock); +} + +void health_prototype_alerts_for_rrdset_incrementally(RRDSET *st) { + RRD_ALERT_PROTOTYPE *ap; + dfe_start_read(health_globals.prototypes.dict, ap) { + health_prototype_apply_to_rrdset(st, ap); + } + dfe_done(ap); +} + +void health_prototype_reset_alerts_for_rrdset(RRDSET *st) { + rrdcalc_unlink_and_delete_all_rrdset_alerts(st); + health_prototype_alerts_for_rrdset_incrementally(st); +} + +// --------------------------------------------------------------------------------------------------------------------- + +void health_apply_prototype_to_host(RRDHOST *host, RRD_ALERT_PROTOTYPE *ap) { + if(!ap->_internal.enabled) + return; + + if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) + return; + + RRDSET *st; + rrdset_foreach_read(st, host) { + health_prototype_apply_to_rrdset(st, ap); + } + rrdset_foreach_done(st); +} + +void health_prototype_apply_to_all_hosts(RRD_ALERT_PROTOTYPE *ap) { + if(!ap->_internal.enabled) + return; + + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host){ + health_apply_prototype_to_host(host, ap); + } + dfe_done(host); +} + +// --------------------------------------------------------------------------------------------------------------------- + +void health_apply_prototypes_to_host(RRDHOST *host) { + if(unlikely(!host->health.health_enabled) && !rrdhost_flag_check(host, RRDHOST_FLAG_INITIALIZED_HEALTH)) + return; + + // free all running alarms + rrdcalc_delete_all(host); + + // invalidate all previous entries in the alarm log + rw_spinlock_read_lock(&host->health_log.spinlock); + ALARM_ENTRY *t; + for(t = host->health_log.alarms ; t ; t = t->next) { + if(t->new_status != RRDCALC_STATUS_REMOVED) + t->flags |= HEALTH_ENTRY_FLAG_UPDATED; + } + rw_spinlock_read_unlock(&host->health_log.spinlock); + + // apply all the prototypes for the charts of the host + RRDSET *st; + rrdset_foreach_read(st, host) { + health_prototype_reset_alerts_for_rrdset(st); + } + rrdset_foreach_done(st); + +#ifdef ENABLE_ACLK + if (netdata_cloud_enabled) { + struct aclk_sync_cfg_t *wc = host->aclk_config; + if (likely(wc)) { + wc->alert_queue_removed = SEND_REMOVED_AFTER_HEALTH_LOOPS; + } + } +#endif +} + +void health_apply_prototypes_to_all_hosts(void) { + RRDHOST *host; + dfe_start_reentrant(rrdhost_root_index, host){ + health_apply_prototypes_to_host(host); + } + dfe_done(host); +} + +// --------------------------------------------------------------------------------------------------------------------- + +void health_prototype_metadata_foreach(void *data, prototype_metadata_cb_t cb) { + RRD_ALERT_PROTOTYPE *ap; + dfe_start_read(health_globals.prototypes.dict, ap) { + cb(data, ap->config.type, ap->config.component, ap->config.classification, ap->config.recipient); + } + dfe_done(ap); +} |