summary | refs | log | tree | commit | diff | stats
path: root/src/health/health_notifications.c
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r-- src/health/health_notifications.c 569
1 files changed, 569 insertions, 0 deletions
diff --git a/src/health/health_notifications.c b/src/health/health_notifications.c
new file mode 100644
index 000000000..79426f48c
--- /dev/null
+++ b/src/health/health_notifications.c
@@ -0,0 +1,569 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+#include "health_internals.h"
+
+// the queue of executed alarm notifications that haven't been waited for yet
+static struct {
+ ALARM_ENTRY *head; // oldest
+ ALARM_ENTRY *tail; // latest
+} alarm_notifications_in_progress = {NULL, NULL};
+
+struct health_raised_summary { // per-host collection of alerts consulted while notification commands are built
+ RRDHOST *host; // host whose alerts are enumerated by alerts_raised_summary_populate()
+ DICTIONARY *rrdcalc_dict; // set from host->rrdcalc_root_index; items below are acquired from / released to it
+ // growable array of acquired dictionary items, one per collected alert
+ struct {
+ size_t size; // capacity of array, in elements
+ size_t used; // number of elements currently stored in array
+ const DICTIONARY_ITEM **array; // every stored item is released in alerts_raised_summary_free()
+ } active_alerts;
+};
+
+// Collect the result of the notification command spawned for ae.
+// Does nothing unless the entry is flagged HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS.
+// On return the in-progress flag is cleared, HEALTH_ENTRY_FLAG_EXEC_FAILED is
+// set when the exit code is non-zero, and the entry has been unlinked from
+// the in-progress queue.
+void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
+    if (ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS) {
+        spawn_wait_cmd(ae->exec_spawn_serial, &ae->exec_code, &ae->exec_run_timestamp);
+        netdata_log_debug(D_HEALTH, "done executing command - returned with code %d", ae->exec_code);
+
+        // the command is done: drop the in-progress mark, remember failures
+        ae->flags &= ~HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
+        if (ae->exec_code != 0)
+            ae->flags |= HEALTH_ENTRY_FLAG_EXEC_FAILED;
+
+        unlink_alarm_notify_in_progress(ae);
+    }
+}
+
+// Drain the in-progress queue, oldest entry first, by waiting for each
+// pending notification. Stops early as soon as the health service is no
+// longer running.
+void wait_for_all_notifications_to_finish_before_allowing_health_to_be_cleaned_up(void) {
+    ALARM_ENTRY *oldest = alarm_notifications_in_progress.head;
+
+    while (oldest != NULL) {
+        if (unlikely(!service_running(SERVICE_HEALTH)))
+            return;
+
+        health_alarm_wait_for_execution(oldest);
+
+        // waiting unlinks the entry, so re-read the head of the queue
+        oldest = alarm_notifications_in_progress.head;
+    }
+}
+
+// Remove ae from the in-progress queue, repairing the queue ends and the
+// links of its neighbours. The entry's own prev/next pointers are left as
+// they were.
+void unlink_alarm_notify_in_progress(ALARM_ENTRY *ae)
+{
+    struct alarm_entry *before = ae->prev_in_progress;
+    struct alarm_entry *after = ae->next_in_progress;
+
+    // queue ends
+    if (alarm_notifications_in_progress.head == ae)
+        alarm_notifications_in_progress.head = after;
+    if (alarm_notifications_in_progress.tail == ae)
+        alarm_notifications_in_progress.tail = before;
+
+    // neighbours
+    if (before)
+        before->next_in_progress = after;
+    if (after)
+        after->prev_in_progress = before;
+}
+
+// Append ae at the tail of the in-progress queue; it also becomes the head
+// when the queue is empty.
+static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
+{
+    ALARM_ENTRY *last = alarm_notifications_in_progress.tail;
+
+    ae->next_in_progress = NULL;
+    ae->prev_in_progress = last; // NULL when the queue is empty
+
+    if (last)
+        last->next_in_progress = ae;
+
+    if (!alarm_notifications_in_progress.head)
+        alarm_notifications_in_progress.head = ae;
+
+    alarm_notifications_in_progress.tail = ae;
+}
+
+// Run value through sanitize_command_argument_string() using the scratch
+// buffer and, on success, append it to wb as one more single-quoted
+// argument (" '<value>'").
+// Returns false, without appending anything, when sanitization fails.
+static inline bool append_sanitized_argument(BUFFER *wb, char *scratch, size_t scratch_size, const char *value) {
+    if (!sanitize_command_argument_string(scratch, value, scratch_size))
+        return false;
+
+    buffer_sprintf(wb, " '%s'", scratch);
+    return true;
+}
+
+// Build the notification command line in wb: the word "exec" followed by one
+// single-quoted argument per parameter, in exactly the order the parameters
+// are declared. Every string (including the textual form of transition_id)
+// goes through sanitize_command_argument_string(); numbers are formatted
+// directly.
+//
+// Returns true when the whole command was written. Returns false as soon as
+// one string fails sanitization; wb then holds a partial command that the
+// caller must not execute.
+static bool prepare_command(BUFFER *wb,
+                            const char *exec,
+                            const char *recipient,
+                            const char *registry_hostname,
+                            uint32_t unique_id,
+                            uint32_t alarm_id,
+                            uint32_t alarm_event_id,
+                            uint32_t when,
+                            const char *alert_name,
+                            const char *alert_chart_name,
+                            const char *new_status,
+                            const char *old_status,
+                            NETDATA_DOUBLE new_value,
+                            NETDATA_DOUBLE old_value,
+                            const char *alert_source,
+                            uint32_t duration,
+                            uint32_t non_clear_duration,
+                            const char *alert_units,
+                            const char *alert_info,
+                            const char *new_value_string,
+                            const char *old_value_string,
+                            const char *source,
+                            const char *error_msg,
+                            int n_warn,
+                            int n_crit,
+                            const char *warn_alarms,
+                            const char *crit_alarms,
+                            const char *classification,
+                            const char *edit_command,
+                            const char *machine_guid,
+                            nd_uuid_t *transition_id,
+                            const char *summary,
+                            const char *context,
+                            const char *component,
+                            const char *type
+) {
+    char buf[8192];
+    const size_t n = sizeof(buf) - 1;
+
+    buffer_strcat(wb, "exec");
+
+    // the || chains stop at the first failure, so nothing after a rejected
+    // argument is appended
+    if (!append_sanitized_argument(wb, buf, n, exec) ||
+        !append_sanitized_argument(wb, buf, n, recipient) ||
+        !append_sanitized_argument(wb, buf, n, registry_hostname))
+        return false;
+
+    buffer_sprintf(wb, " '%u'", unique_id);
+    buffer_sprintf(wb, " '%u'", alarm_id);
+    buffer_sprintf(wb, " '%u'", alarm_event_id);
+    buffer_sprintf(wb, " '%u'", when);
+
+    if (!append_sanitized_argument(wb, buf, n, alert_name) ||
+        !append_sanitized_argument(wb, buf, n, alert_chart_name) ||
+        !append_sanitized_argument(wb, buf, n, new_status) ||
+        !append_sanitized_argument(wb, buf, n, old_status))
+        return false;
+
+    buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", new_value);
+    buffer_sprintf(wb, " '" NETDATA_DOUBLE_FORMAT_ZERO "'", old_value);
+
+    if (!append_sanitized_argument(wb, buf, n, alert_source))
+        return false;
+
+    buffer_sprintf(wb, " '%u'", duration);
+    buffer_sprintf(wb, " '%u'", non_clear_duration);
+
+    if (!append_sanitized_argument(wb, buf, n, alert_units) ||
+        !append_sanitized_argument(wb, buf, n, alert_info) ||
+        !append_sanitized_argument(wb, buf, n, new_value_string) ||
+        !append_sanitized_argument(wb, buf, n, old_value_string) ||
+        !append_sanitized_argument(wb, buf, n, source) ||
+        !append_sanitized_argument(wb, buf, n, error_msg))
+        return false;
+
+    buffer_sprintf(wb, " '%d'", n_warn);
+    buffer_sprintf(wb, " '%d'", n_crit);
+
+    if (!append_sanitized_argument(wb, buf, n, warn_alarms) ||
+        !append_sanitized_argument(wb, buf, n, crit_alarms) ||
+        !append_sanitized_argument(wb, buf, n, classification) ||
+        !append_sanitized_argument(wb, buf, n, edit_command) ||
+        !append_sanitized_argument(wb, buf, n, machine_guid))
+        return false;
+
+    // the transition id is passed in its lower-case textual form
+    char tr_id[UUID_STR_LEN];
+    uuid_unparse_lower(*transition_id, tr_id);
+
+    return append_sanitized_argument(wb, buf, n, tr_id) &&
+           append_sanitized_argument(wb, buf, n, summary) &&
+           append_sanitized_argument(wb, buf, n, context) &&
+           append_sanitized_argument(wb, buf, n, component) &&
+           append_sanitized_argument(wb, buf, n, type);
+}
+
+// qsort() comparator for arrays of acquired RRDCALC dictionary items.
+// Orders alerts by last_status_change, most recent first.
+// Returns 1 when a is older than b, -1 when a is newer, 0 when equal.
+static inline int compare_raised_alerts(const void *a, const void *b) {
+    const DICTIONARY_ITEM *item1 = *(const DICTIONARY_ITEM *const *)a;
+    const DICTIONARY_ITEM *item2 = *(const DICTIONARY_ITEM *const *)b;
+
+    RRDCALC *rc1 = dictionary_acquired_item_value(item1);
+    RRDCALC *rc2 = dictionary_acquired_item_value(item2);
+
+    // Compare explicitly instead of returning the difference cast to int:
+    // a timestamp difference that does not fit in an int would be truncated
+    // and could report the wrong order.
+    if (rc1->last_status_change < rc2->last_status_change)
+        return 1;
+    if (rc1->last_status_change > rc2->last_status_change)
+        return -1;
+    return 0;
+}
+
+// Store one more alert in hrm->active_alerts, growing the array when it is
+// full. The stored reference is a duplicate obtained with
+// dictionary_acquired_item_dup(), so the summary holds its own reference.
+static void health_raised_summary_add_alert(struct health_raised_summary *hrm, const DICTIONARY_ITEM *item) {
+    if (hrm->active_alerts.used >= hrm->active_alerts.size) {
+        // an empty array starts at 4 slots, afterwards the capacity doubles
+        size_t base = (hrm->active_alerts.size == 0) ? 2 : hrm->active_alerts.size;
+        size_t grown = base * 2;
+
+        hrm->active_alerts.size = grown;
+        hrm->active_alerts.array = reallocz(hrm->active_alerts.array, sizeof(const DICTIONARY_ITEM *) * grown);
+    }
+
+    const DICTIONARY_ITEM *acquired = dictionary_acquired_item_dup(hrm->rrdcalc_dict, item);
+    hrm->active_alerts.array[hrm->active_alerts.used] = acquired;
+    hrm->active_alerts.used++;
+}
+
+// Release every dictionary item held by the summary, then free the item
+// array and the summary itself.
+void alerts_raised_summary_free(struct health_raised_summary *hrm) {
+    size_t idx = 0;
+
+    while (idx < hrm->active_alerts.used) {
+        dictionary_acquired_item_release(hrm->rrdcalc_dict, hrm->active_alerts.array[idx]);
+        idx++;
+    }
+
+    freez(hrm->active_alerts.array);
+    freez(hrm);
+}
+
+// Allocate a zero-initialized summary bound to host and to the host's
+// rrdcalc dictionary. The alert array starts empty; fill it with
+// alerts_raised_summary_populate().
+struct health_raised_summary *alerts_raised_summary_create(RRDHOST *host) {
+    struct health_raised_summary *summary = callocz(1, sizeof(struct health_raised_summary));
+
+    summary->host = host;
+    summary->rrdcalc_dict = host->rrdcalc_root_index;
+
+    return summary;
+}
+
+// Collect the host's alerts into the summary and sort them with
+// compare_raised_alerts(). Alerts without a chart, or whose chart has a zero
+// last_collected_time, are left out.
+void alerts_raised_summary_populate(struct health_raised_summary *hrm) {
+    RRDCALC *rc;
+    foreach_rrdcalc_in_rrdhost_read(hrm->host, rc) {
+        bool has_collected_chart = rc->rrdset && rc->rrdset->last_collected_time.tv_sec;
+        if (unlikely(!has_collected_chart))
+            continue;
+
+        health_raised_summary_add_alert(hrm, rc_dfe.item);
+    }
+    foreach_rrdcalc_in_rrdhost_done(rc);
+
+    // nothing to order with fewer than two entries
+    if (hrm->active_alerts.used <= 1)
+        return;
+
+    qsort(hrm->active_alerts.array, hrm->active_alerts.used, sizeof(const DICTIONARY_ITEM *), compare_raised_alerts);
+}
+
+// Rewrite dst as a comma separated list of "name=last_status_change" pairs,
+// one for every alert in the summary that currently has the given status,
+// leaving out the alert that ae refers to.
+// Returns the number of alerts listed.
+static size_t
+health_raised_summary_entries(struct health_raised_summary *hrm, BUFFER *dst, ALARM_ENTRY *ae, RRDCALC_STATUS status) {
+    size_t listed = 0;
+
+    buffer_flush(dst);
+
+    for (size_t idx = 0; idx < hrm->active_alerts.used; idx++) {
+        RRDCALC *rc = dictionary_acquired_item_value(hrm->active_alerts.array[idx]);
+
+        // wrong status, or the very alert being notified
+        if (rc->status != status || rc->id == ae->alarm_id)
+            continue;
+
+        listed++;
+
+        if (buffer_strlen(dst) != 0)
+            buffer_putc(dst, ',');
+
+        buffer_sprintf(dst, "%s=%" PRId64, string2str(rc->config.name), (int64_t)rc->last_status_change);
+    }
+
+    return listed;
+}
+
+// Find the alert that ae refers to and return the source text of the
+// expression matching its current status: the critical expression when the
+// alert is CRITICAL, the warning expression in every other status.
+// Returns "" when the alert is not part of the summary.
+static const char *health_raised_summary_my_expression_source(struct health_raised_summary *hrm, ALARM_ENTRY *ae) {
+    const size_t total = hrm->active_alerts.used;
+
+    for (size_t idx = 0; idx < total; idx++) {
+        RRDCALC *rc = dictionary_acquired_item_value(hrm->active_alerts.array[idx]);
+
+        if (rc->id == ae->alarm_id) {
+            if (rc->status == RRDCALC_STATUS_CRITICAL)
+                return expression_source(rc->config.critical);
+
+            return expression_source(rc->config.warning);
+        }
+    }
+
+    return "";
+}
+
+static const char *health_raised_summary_my_expression_error(struct health_raised_summary *hrm, ALARM_ENTRY *ae) {
+ for(size_t i = 0; i < hrm->active_alerts.used ;i++) {
+ RRDCALC *rc = dictionary_acquired_item_value(hrm->active_alerts.array[i]);
+ if(rc->id != ae->alarm_id) continue;
+
+ if(rc->status == RRDCALC_STATUS_CRITICAL)
+ return expression_error_msg(rc->config.critical);
+ else
+ return expression_error_msg(rc->config.warning);
+ }
+
+ return "";
+}
+/* Decide whether the alarm log entry ae needs a notification and, if it does,
+ * build the notification command line and queue it for execution.
+ *
+ * host - host the entry belongs to; supplies the default exec and recipient,
+ *        the hostnames used in log lines and the command, and the machine GUID
+ * ae   - the entry to process; it is always flagged HEALTH_ENTRY_FLAG_PROCESSED
+ * hrm  - raised alerts summary, used to list the other WARNING / CRITICAL
+ *        alerts and to look up this alert's expression source and error
+ *
+ * Every early exit (goto done) and the successful spawn path save the entry
+ * with health_alarm_log_save(); the path where the command arguments fail to
+ * format does not.
+ */
+void health_send_notification(RRDHOST *host, ALARM_ENTRY *ae, struct health_raised_summary *hrm) {
+ netdata_log_debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
+ ae->chart?ae_chart_id(ae):"NOCHART", ae_name(ae),
+ ae->new_value,
+ rrdcalc_status2string(ae->old_status),
+ rrdcalc_status2string(ae->new_status)
+ );
+ // from here on the entry counts as processed, whatever the outcome below
+ ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
+
+ if(unlikely(ae->new_status < RRDCALC_STATUS_CLEAR)) {
+ // statuses below CLEAR are internal: never notify for them
+ netdata_log_debug(D_HEALTH, "Health not sending notification for alarm '%s.%s' status %s (internal statuses)", ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ goto done;
+ }
+
+ if(unlikely(ae->new_status <= RRDCALC_STATUS_CLEAR && (ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
+ // the alert is configured not to notify on CLEAR
+
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending notification for alarm '%s.%s' status %s (it has no-clear-notification enabled)",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+
+ // mark it as run, so that we will send the same alarm if it happens again
+ goto done;
+ }
+
+ // find the previous notification for the same alarm
+ // for which we have run the exec script
+ // exception: alarms with HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION set
+ RRDCALC_STATUS last_executed_status = -3; // placeholder until the lookup below fills it; NOTE(review): meaning of -3 is not visible here
+ if(likely(!(ae->flags & HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION))) {
+ int ret = sql_health_get_last_executed_event(host, ae, &last_executed_status);
+
+ if (likely(ret == 1)) {
+ // we have executed this alarm notification in the past
+ if(last_executed_status == ae->new_status && !(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) {
+ // don't send the notification for the same status again (repeating alerts are exempt)
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending again notification for alarm '%s.%s' status %s",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae),
+ rrdcalc_status2string(ae->new_status));
+ goto done;
+ }
+ }
+ else {
+ // we have not executed this alarm notification in the past
+ // so, don't send CLEAR notifications
+ if(unlikely(ae->new_status == RRDCALC_STATUS_CLEAR)) {
+ if((!(ae->flags & HEALTH_ENTRY_RUN_ONCE)) || (ae->flags & HEALTH_ENTRY_RUN_ONCE && ae->old_status < RRDCALC_STATUS_RAISED) ) {
+ netdata_log_debug(D_HEALTH, "Health not sending notification for first initialization of alarm '%s.%s' status %s"
+ , ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ goto done;
+ }
+ }
+ }
+ }
+
+ // Check if alarm notifications are silenced
+ if (ae->flags & HEALTH_ENTRY_FLAG_SILENCED) {
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Health not sending notification for alarm '%s.%s' status %s "
+ "(command API has disabled notifications)",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ goto done;
+ }
+ // all checks passed: a notification will be attempted
+ nd_log(NDLS_DAEMON, NDLP_DEBUG,
+ "[%s]: Sending notification for alarm '%s.%s' status %s.",
+ rrdhost_hostname(host), ae_chart_id(ae), ae_name(ae), rrdcalc_status2string(ae->new_status));
+ // per-entry exec / recipient take precedence over the host defaults
+ const char *exec = (ae->exec) ? ae_exec(ae) : string2str(host->health.health_default_exec);
+ const char *recipient = (ae->recipient) ? ae_recipient(ae) : string2str(host->health.health_default_recipient);
+ // edit_command is heap allocated on both branches and freed with freez() below
+ char *edit_command = ae->source ? health_edit_command_from_source(ae_source(ae)) : strdupz("UNKNOWN=0=UNKNOWN");
+
+ BUFFER *warn_alarms = buffer_create(1024, &netdata_buffers_statistics.buffers_health);
+ BUFFER *crit_alarms = buffer_create(1024, &netdata_buffers_statistics.buffers_health);
+ // lists of the other alerts currently in WARNING / CRITICAL, and how many there are
+ size_t n_warn = health_raised_summary_entries(hrm, warn_alarms, ae, RRDCALC_STATUS_WARNING);
+ size_t n_crit = health_raised_summary_entries(hrm, crit_alarms, ae, RRDCALC_STATUS_CRITICAL);
+ // the arguments below are positional: they follow prepare_command()'s parameter order
+ BUFFER *wb = buffer_create(8192, &netdata_buffers_statistics.buffers_health);
+ bool ok = prepare_command(wb,
+ exec,
+ recipient,
+ rrdhost_registry_hostname(host),
+ ae->unique_id,
+ ae->alarm_id,
+ ae->alarm_event_id,
+ (unsigned long)ae->when,
+ ae_name(ae),
+ ae->chart?ae_chart_id(ae):"NOCHART",
+ rrdcalc_status2string(ae->new_status),
+ rrdcalc_status2string(ae->old_status),
+ ae->new_value,
+ ae->old_value,
+ ae->source?ae_source(ae):"UNKNOWN",
+ (uint32_t)ae->duration,
+ (ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING && ae->new_status >= RRDCALC_STATUS_WARNING) ? (uint32_t)ae->duration : (uint32_t)ae->non_clear_duration,
+ ae_units(ae),
+ ae_info(ae),
+ ae_new_value_string(ae),
+ ae_old_value_string(ae),
+ health_raised_summary_my_expression_source(hrm, ae),
+ health_raised_summary_my_expression_error(hrm, ae),
+ n_warn,
+ n_crit,
+ buffer_tostring(warn_alarms),
+ buffer_tostring(crit_alarms),
+ ae->classification?ae_classification(ae):"Unknown",
+ edit_command,
+ host->machine_guid,
+ &ae->transition_id,
+ host->health.use_summary_for_notifications && ae->summary?ae_summary(ae):ae_name(ae),
+ string2str(ae->chart_context),
+ string2str(ae->component),
+ string2str(ae->type)
+ );
+ // spawn only a completely formatted command
+ const char *command_to_run = buffer_tostring(wb);
+ if (ok) {
+ ae->flags |= HEALTH_ENTRY_FLAG_EXEC_RUN;
+ ae->exec_run_timestamp = now_realtime_sec(); /* will be updated by real time after spawning */
+
+ netdata_log_debug(D_HEALTH, "executing command '%s'", command_to_run);
+ ae->flags |= HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS;
+ ae->exec_spawn_serial = spawn_enq_cmd(command_to_run);
+ enqueue_alarm_notify_in_progress(ae);
+ health_alarm_log_save(host, ae);
+ } else {
+ netdata_log_error("Failed to format command arguments"); // NOTE(review): the entry is not saved on this path - confirm this is intended
+ }
+ // release the temporary buffers on both outcomes
+ buffer_free(warn_alarms);
+ buffer_free(crit_alarms);
+ buffer_free(wb);
+ freez(edit_command);
+
+ return; // the command's result is collected later by health_alarm_wait_for_execution()
+done:
+ health_alarm_log_save(host, ae);
+}
+
+// Look up, in the host's alarm log, the first entry whose alarm_id equals
+// rc->id and copy its global id and transition id to the output parameters.
+// When no entry matches, *global_id is set to 0 and *transitions_id is
+// cleared. The log is walked under the read side of its spinlock.
+//
+// Returns true when an entry was found. Returns false, leaving the outputs
+// untouched, when rc has no chart attached.
+bool health_alarm_log_get_global_id_and_transition_id_for_rrdcalc(RRDCALC *rc, usec_t *global_id, nd_uuid_t *transitions_id) {
+    if (!rc->rrdset)
+        return false;
+
+    RRDHOST *host = rc->rrdset->rrdhost;
+
+    rw_spinlock_read_lock(&host->health_log.spinlock);
+
+    ALARM_ENTRY *match = host->health_log.alarms;
+    while (match != NULL && match->alarm_id != rc->id)
+        match = match->next;
+
+    const bool found = (match != NULL);
+
+    if (found) {
+        *global_id = match->global_id;
+        uuid_copy(*transitions_id, match->transition_id);
+    }
+    else {
+        *global_id = 0;
+        uuid_clear(*transitions_id);
+    }
+
+    rw_spinlock_read_unlock(&host->health_log.spinlock);
+
+    return found;
+}
+
+// Walk the host's alarm log in two passes.
+//
+// Pass 1 (read lock): every entry that is neither PROCESSED nor UPDATED, and
+// whose unique_id is not below host->health_last_processed_id, is handed to
+// health_send_notification() once its delay_up_to_timestamp has passed. The
+// smallest unique_id among those waiting entries (or the head's unique_id
+// when there is none smaller) is stored in host->health_last_processed_id
+// for the next run.
+//
+// Pass 2 (write lock): entries are unlinked and freed when either
+//   - they are not repeating, are UPDATED and SAVED, and have no command
+//     execution in progress, or
+//   - their new status is REMOVED, they are SAVED, and they are more than
+//     86400 seconds old.
+//
+// host - host whose alarm log is processed
+// hrm  - raised alerts summary passed through to health_send_notification()
+void health_alarm_log_process_to_send_notifications(RRDHOST *host, struct health_raised_summary *hrm) {
+    time_t now = now_realtime_sec();
+
+    rw_spinlock_read_lock(&host->health_log.spinlock);
+
+    // Read the list head only while holding the lock: the head entry can be
+    // unlinked and freed by whoever holds the write lock.
+    uint32_t first_waiting = (host->health_log.alarms) ? host->health_log.alarms->unique_id : 0;
+
+    ALARM_ENTRY *ae;
+    for (ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
+        if (unlikely(
+                !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
+                !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
+            )) {
+            if (unlikely(ae->unique_id < first_waiting))
+                first_waiting = ae->unique_id;
+
+            if (likely(now >= ae->delay_up_to_timestamp))
+                health_send_notification(host, ae, hrm);
+        }
+    }
+
+    rw_spinlock_read_unlock(&host->health_log.spinlock);
+
+    // remember this for the next iteration
+    host->health_last_processed_id = first_waiting;
+
+    // delete those that are updated, have no execution in progress, and are not repeating
+    rw_spinlock_write_lock(&host->health_log.spinlock);
+
+    // one timestamp is enough for the whole cleanup pass
+    const time_t cleanup_now = now_realtime_sec();
+
+    ALARM_ENTRY *prev = NULL, *next = NULL;
+    for (ae = host->health_log.alarms; ae; ae = next) {
+        next = ae->next; // set it here, ae may be freed below
+
+        bool superseded =
+            likely(!(ae->flags & HEALTH_ENTRY_FLAG_IS_REPEATING)) &&
+            (ae->flags & HEALTH_ENTRY_FLAG_UPDATED) &&
+            (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
+            !(ae->flags & HEALTH_ENTRY_FLAG_EXEC_IN_PROGRESS);
+
+        bool removed_and_expired =
+            (ae->new_status == RRDCALC_STATUS_REMOVED) &&
+            (ae->flags & HEALTH_ENTRY_FLAG_SAVED) &&
+            (ae->when + 86400 < cleanup_now);
+
+        if (!superseded && !removed_and_expired) {
+            prev = ae;
+            continue;
+        }
+
+        if (host->health_log.alarms == ae) {
+            // removing the head: prev is still NULL here
+            host->health_log.alarms = next;
+        }
+        else {
+            // prev stays where it is - it is needed for the next iteration,
+            // because the following item may have to be removed as well
+            prev->next = next;
+        }
+
+        health_alarm_log_free_one_nochecks_nounlink(ae);
+    }
+
+    rw_spinlock_write_unlock(&host->health_log.spinlock);
+}