summary | refs | log | tree | commit | diff | stats
path: root/health/health.c
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.c')
-rw-r--r-- health/health.c 112
1 files changed, 74 insertions, 38 deletions
diff --git a/health/health.c b/health/health.c
index 3c1e5693..9eb36a9c 100644
--- a/health/health.c
+++ b/health/health.c
@@ -11,6 +11,12 @@ static struct {
ALARM_ENTRY *tail; // latest
} alarm_notifications_in_progress = {NULL, NULL};
+typedef struct active_alerts { // one WARNING/CRITICAL alarm collected in health_alarm_execute() for the notification lists
+ char *name; // pointer copied from rc->name (the string itself is not duplicated)
+ time_t last_status_change; // copied from rc->last_status_change; sort key used by compare_active_alerts()
+ RRDCALC_STATUS status; // copied from rc->status; decides whether the entry goes to the warn or the crit buffer
+} active_alerts_t;
+
static inline void enqueue_alarm_notify_in_progress(ALARM_ENTRY *ae)
{
ae->prev_in_progress = NULL;
@@ -219,10 +225,6 @@ static void health_reload_host(RRDHOST *host) {
* Reload the host configuration for all hosts.
*/
void health_reload(void) {
-#ifdef ENABLE_ACLK
- if (netdata_cloud_setting)
- aclk_single_update_disable();
-#endif
sql_refresh_hashes();
rrd_rdlock();
@@ -234,11 +236,7 @@ void health_reload(void) {
rrd_unlock();
#ifdef ENABLE_ACLK
if (netdata_cloud_setting) {
- aclk_single_update_enable();
- aclk_alarm_reload();
-#ifdef ENABLE_NEW_CLOUD_PROTOCOL
aclk_alert_reloaded = 1;
-#endif
}
#endif
}
@@ -246,13 +244,22 @@ void health_reload(void) {
// ----------------------------------------------------------------------------
// health main thread and friends
-static inline RRDCALC_STATUS rrdcalc_value2status(calculated_number n) {
+static inline RRDCALC_STATUS rrdcalc_value2status(NETDATA_DOUBLE n) { // NaN/Inf -> UNDEFINED, non-zero -> RAISED, zero -> CLEAR
if(isnan(n) || isinf(n)) return RRDCALC_STATUS_UNDEFINED;
if(n) return RRDCALC_STATUS_RAISED;
return RRDCALC_STATUS_CLEAR;
}
#define ALARM_EXEC_COMMAND_LENGTH 8192
+#define ACTIVE_ALARMS_LIST_EXAMINE 500 // capacity of the active_alerts array and cap on how many raised alarms are collected
+#define ACTIVE_ALARMS_LIST 15 // cap on how many collected alarms (after sorting) are written to the warn/crit buffers
+
+static inline int compare_active_alerts(const void * a, const void * b) { // qsort() comparator: most recent last_status_change sorts first
+ const active_alerts_t *active_alerts_a = a;
+ const active_alerts_t *active_alerts_b = b;
+ // compare rather than subtract: a time_t difference narrowed to int can truncate or flip sign
+ return (active_alerts_b->last_status_change > active_alerts_a->last_status_change) - (active_alerts_b->last_status_change < active_alerts_a->last_status_change);
+}
static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
ae->flags |= HEALTH_ENTRY_FLAG_PROCESSED;
@@ -318,31 +325,28 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
RRDCALC *rc;
EVAL_EXPRESSION *expr=NULL;
BUFFER *warn_alarms, *crit_alarms;
+ active_alerts_t *active_alerts = callocz(ACTIVE_ALARMS_LIST_EXAMINE, sizeof(active_alerts_t));
warn_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
crit_alarms = buffer_create(NETDATA_WEB_RESPONSE_INITIAL_SIZE);
- for(rc = host->alarms; rc ; rc = rc->next) {
+ for(rc = host->alarms; rc && (n_warn + n_crit) < ACTIVE_ALARMS_LIST_EXAMINE ; rc = rc->next) {
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
if (unlikely(rc->status == RRDCALC_STATUS_WARNING)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- if (n_warn)
- buffer_strcat(warn_alarms, ",");
- buffer_strcat(warn_alarms, rc->name);
- buffer_strcat(warn_alarms, "=");
- buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
+ active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
+ active_alerts[n_warn+n_crit].status = rc->status;
n_warn++;
} else if (ae->alarm_id == rc->id)
expr = rc->warning;
} else if (unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) {
if (likely(ae->alarm_id != rc->id) || likely(ae->alarm_event_id != rc->next_event_id - 1)) {
- if (n_crit)
- buffer_strcat(crit_alarms, ",");
- buffer_strcat(crit_alarms, rc->name);
- buffer_strcat(crit_alarms, "=");
- buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)rc->last_status_change);
+ active_alerts[n_warn+n_crit].name = rc->name;
+ active_alerts[n_warn+n_crit].last_status_change = rc->last_status_change;
+ active_alerts[n_warn+n_crit].status = rc->status;
n_crit++;
} else if (ae->alarm_id == rc->id)
expr = rc->critical;
@@ -352,9 +356,34 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
}
}
+ if (n_warn+n_crit>1)
+ qsort (active_alerts, n_warn+n_crit, sizeof(active_alerts_t), compare_active_alerts);
+
+ int count_w = 0, count_c = 0;
+ while (count_w + count_c < n_warn + n_crit && count_w + count_c < ACTIVE_ALARMS_LIST) {
+ if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_WARNING) {
+ if (count_w)
+ buffer_strcat(warn_alarms, ",");
+ buffer_strcat(warn_alarms, active_alerts[count_w+count_c].name);
+ buffer_strcat(warn_alarms, "=");
+ buffer_snprintf(warn_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
+ count_w++;
+ }
+ else if (active_alerts[count_w+count_c].status == RRDCALC_STATUS_CRITICAL) {
+ if (count_c)
+ buffer_strcat(crit_alarms, ",");
+ buffer_strcat(crit_alarms, active_alerts[count_w+count_c].name);
+ buffer_strcat(crit_alarms, "=");
+ buffer_snprintf(crit_alarms, 11, "%"PRId64"", (int64_t)active_alerts[count_w+count_c].last_status_change);
+ count_c++;
+ }
+ }
+
char *edit_command = ae->source ? health_edit_command_from_source(ae->source) : strdupz("UNKNOWN=0=UNKNOWN");
- snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" CALCULATED_NUMBER_FORMAT_ZERO "' '" CALCULATED_NUMBER_FORMAT_ZERO "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
+ snprintfz(command_to_run, ALARM_EXEC_COMMAND_LENGTH, "exec %s '%s' '%s' '%u' '%u' '%u' '%lu' '%s' '%s' '%s' '%s' '%s' '" NETDATA_DOUBLE_FORMAT_ZERO
+ "' '" NETDATA_DOUBLE_FORMAT_ZERO
+ "' '%s' '%u' '%u' '%s' '%s' '%s' '%s' '%s' '%s' '%d' '%d' '%s' '%s' '%s' '%s' '%s'",
exec,
recipient,
host->registry_hostname,
@@ -398,6 +427,7 @@ static inline void health_alarm_execute(RRDHOST *host, ALARM_ENTRY *ae) {
freez(edit_command);
buffer_free(warn_alarms);
buffer_free(crit_alarms);
+ freez(active_alerts);
return; //health_alarm_wait_for_execution
done:
@@ -419,7 +449,7 @@ static inline void health_alarm_wait_for_execution(ALARM_ENTRY *ae) {
}
static inline void health_process_notifications(RRDHOST *host, ALARM_ENTRY *ae) {
- debug(D_HEALTH, "Health alarm '%s.%s' = " CALCULATED_NUMBER_FORMAT_AUTO " - changed status from %s to %s",
+ debug(D_HEALTH, "Health alarm '%s.%s' = " NETDATA_DOUBLE_FORMAT_AUTO " - changed status from %s to %s",
ae->chart?ae->chart:"NOCHART", ae->name,
ae->new_value,
rrdcalc_status2string(ae->old_status),
@@ -736,7 +766,7 @@ void *health_main(void *ptr) {
rrdcalc_labels_unlink();
unsigned int loop = 0;
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
unsigned int marked_aclk_reload_loop = 0;
#endif
while(!netdata_exit) {
@@ -765,7 +795,7 @@ void *health_main(void *ptr) {
}
}
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
if (aclk_alert_reloaded && !marked_aclk_reload_loop)
marked_aclk_reload_loop = loop;
#endif
@@ -795,6 +825,11 @@ void *health_main(void *ptr) {
host->health_delay_up_to = 0;
}
+ // wait until cleanup of obsolete charts on children is complete
+ if (host != localhost)
+ if (unlikely(host->trigger_chart_obsoletion_check == 1))
+ continue;
+
if(likely(!host->health_log_fp) && (loop == 1 || loop % cleanup_sql_every_loop == 0))
sql_health_alarm_log_cleanup(host);
@@ -818,7 +853,7 @@ void *health_main(void *ptr) {
worker_is_busy(WORKER_HEALTH_JOB_ALARM_LOG_ENTRY);
time_t now = now_realtime_sec();
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->value, NAN, rc->status, RRDCALC_STATUS_REMOVED, rc->source, rc->units, rc->info, 0, 0);
if (ae) {
@@ -828,7 +863,7 @@ void *health_main(void *ptr) {
rc->last_status_change = now;
rc->last_updated = now;
rc->value = NAN;
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
if (netdata_cloud_setting && likely(!aclk_alert_reloaded))
sql_queue_alarm_to_aclk(host, ae, 1);
#endif
@@ -855,10 +890,12 @@ void *health_main(void *ptr) {
/* time_t old_db_timestamp = rc->db_before; */
int value_is_null = 0;
- int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1, rc->after,
- rc->before, rc->group, 0, rc->options, &rc->db_after,
- &rc->db_before, &value_is_null, 0
- );
+ int ret = rrdset2value_api_v1(rc->rrdset, NULL, &rc->value, rc->dimensions, 1,
+ rc->after, rc->before, rc->group, NULL,
+ 0, rc->options,
+ &rc->db_after,&rc->db_before,
+ NULL, NULL, NULL,
+ &value_is_null, NULL, 0, 0);
if (unlikely(ret != 200)) {
// database lookup failed
@@ -898,8 +935,7 @@ void *health_main(void *ptr) {
} else
rc->rrdcalc_flags &= ~RRDCALC_FLAG_DB_NAN;
- debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value "
- CALCULATED_NUMBER_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
+ debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': database lookup gave value " NETDATA_DOUBLE_FORMAT, host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
rc->value
);
}
@@ -923,7 +959,7 @@ void *health_main(void *ptr) {
rc->rrdcalc_flags &= ~RRDCALC_FLAG_CALC_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': expression '%s' gave value "
- CALCULATED_NUMBER_FORMAT
+ NETDATA_DOUBLE_FORMAT
": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART", rc->name,
rc->calculation->parsed_as, rc->calculation->result,
buffer_tostring(rc->calculation->error_msg), rc->source
@@ -972,7 +1008,7 @@ void *health_main(void *ptr) {
} else {
rc->rrdcalc_flags &= ~RRDCALC_FLAG_WARN_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': warning expression gave value "
- CALCULATED_NUMBER_FORMAT
+ NETDATA_DOUBLE_FORMAT
": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
rc->name, rc->warning->result, buffer_tostring(rc->warning->error_msg), rc->source
);
@@ -998,7 +1034,7 @@ void *health_main(void *ptr) {
} else {
rc->rrdcalc_flags &= ~RRDCALC_FLAG_CRIT_ERROR;
debug(D_HEALTH, "Health on host '%s', alarm '%s.%s': critical expression gave value "
- CALCULATED_NUMBER_FORMAT
+ NETDATA_DOUBLE_FORMAT
": %s (source: %s)", host->hostname, rc->chart ? rc->chart : "NOCHART",
rc->name, rc->critical->result, buffer_tostring(rc->critical->error_msg),
rc->source
@@ -1077,7 +1113,7 @@ void *health_main(void *ptr) {
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
rc->delay_last,
@@ -1129,7 +1165,7 @@ void *health_main(void *ptr) {
rc->last_repeat = now;
if (likely(rc->times_repeat < UINT32_MAX)) rc->times_repeat++;
ALARM_ENTRY *ae = health_create_alarm_entry(
- host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id,
+ host, rc->id, rc->next_event_id++, rc->config_hash_id, now, rc->name, rc->rrdset->id, rc->rrdset->context,
rc->rrdset->family, rc->classification, rc->component, rc->type, rc->exec, rc->recipient, now - rc->last_status_change,
rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
rc->delay_last,
@@ -1178,7 +1214,7 @@ void *health_main(void *ptr) {
health_alarm_wait_for_execution(ae);
}
-#if defined(ENABLE_ACLK) && defined(ENABLE_NEW_CLOUD_PROTOCOL)
+#ifdef ENABLE_ACLK
if (netdata_cloud_setting && unlikely(aclk_alert_reloaded) && loop > (marked_aclk_reload_loop + 2)) {
rrdhost_foreach_read(host) {
if (unlikely(!host->health_enabled))