diff options
Diffstat (limited to '')
-rw-r--r-- | health/health_log.c | 235 |
1 files changed, 235 insertions, 0 deletions
diff --git a/health/health_log.c b/health/health_log.c new file mode 100644 index 00000000..fd124ce8 --- /dev/null +++ b/health/health_log.c @@ -0,0 +1,235 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +#include "health.h" + +// ---------------------------------------------------------------------------- + +inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { + sql_health_alarm_log_save(host, ae); +} + + +void health_log_alert_transition_with_trace(RRDHOST *host, ALARM_ENTRY *ae, int line, const char *file, const char *function) { + ND_LOG_STACK lgs[] = { + ND_LOG_FIELD_UUID(NDF_MESSAGE_ID, &health_alert_transition_msgid), + ND_LOG_FIELD_STR(NDF_NIDL_NODE, host->hostname), + ND_LOG_FIELD_STR(NDF_NIDL_INSTANCE, ae->chart_name), + ND_LOG_FIELD_STR(NDF_NIDL_CONTEXT, ae->chart_context), + ND_LOG_FIELD_U64(NDF_ALERT_ID, ae->alarm_id), + ND_LOG_FIELD_U64(NDF_ALERT_UNIQUE_ID, ae->unique_id), + ND_LOG_FIELD_U64(NDF_ALERT_EVENT_ID, ae->alarm_event_id), + ND_LOG_FIELD_UUID(NDF_ALERT_CONFIG_HASH, &ae->config_hash_id), + ND_LOG_FIELD_UUID(NDF_ALERT_TRANSITION_ID, &ae->transition_id), + ND_LOG_FIELD_STR(NDF_ALERT_NAME, ae->name), + ND_LOG_FIELD_STR(NDF_ALERT_CLASS, ae->classification), + ND_LOG_FIELD_STR(NDF_ALERT_COMPONENT, ae->component), + ND_LOG_FIELD_STR(NDF_ALERT_TYPE, ae->type), + ND_LOG_FIELD_STR(NDF_ALERT_EXEC, ae->exec), + ND_LOG_FIELD_STR(NDF_ALERT_RECIPIENT, ae->recipient), + ND_LOG_FIELD_STR(NDF_ALERT_SOURCE, ae->exec), + ND_LOG_FIELD_STR(NDF_ALERT_UNITS, ae->units), + ND_LOG_FIELD_STR(NDF_ALERT_SUMMARY, ae->summary), + ND_LOG_FIELD_STR(NDF_ALERT_INFO, ae->info), + ND_LOG_FIELD_DBL(NDF_ALERT_VALUE, ae->new_value), + ND_LOG_FIELD_DBL(NDF_ALERT_VALUE_OLD, ae->old_value), + ND_LOG_FIELD_TXT(NDF_ALERT_STATUS, rrdcalc_status2string(ae->new_status)), + ND_LOG_FIELD_TXT(NDF_ALERT_STATUS_OLD, rrdcalc_status2string(ae->old_status)), + ND_LOG_FIELD_I64(NDF_ALERT_DURATION, ae->duration), + ND_LOG_FIELD_I64(NDF_RESPONSE_CODE, ae->exec_code), + ND_LOG_FIELD_U64(NDF_ALERT_NOTIFICATION_REALTIME_USEC, ae->delay_up_to_timestamp * USEC_PER_SEC), + ND_LOG_FIELD_END(), + }; + ND_LOG_STACK_PUSH(lgs); + + errno = 0; + + ND_LOG_FIELD_PRIORITY priority = NDLP_INFO; + + switch(ae->new_status) { + case RRDCALC_STATUS_UNDEFINED: + if(ae->old_status >= RRDCALC_STATUS_CLEAR) + priority = NDLP_NOTICE; + else + priority = NDLP_DEBUG; + break; + + default: + case RRDCALC_STATUS_UNINITIALIZED: + case RRDCALC_STATUS_REMOVED: + priority = NDLP_DEBUG; + break; + + case RRDCALC_STATUS_CLEAR: + priority = NDLP_INFO; + break; + + case RRDCALC_STATUS_WARNING: + if(ae->old_status < RRDCALC_STATUS_WARNING) + priority = NDLP_WARNING; + break; + + case RRDCALC_STATUS_CRITICAL: + if(ae->old_status < RRDCALC_STATUS_CRITICAL) + priority = NDLP_CRIT; + break; + } + + netdata_logger(NDLS_HEALTH, priority, file, function, line, + "ALERT '%s' of instance '%s' on node '%s', transitioned from %s to %s", + string2str(ae->name), string2str(ae->chart), string2str(host->hostname), + rrdcalc_status2string(ae->old_status), rrdcalc_status2string(ae->new_status) + ); +} + +// ---------------------------------------------------------------------------- +// health alarm log management + +inline ALARM_ENTRY* health_create_alarm_entry( + RRDHOST *host, + uint32_t alarm_id, + uint32_t alarm_event_id, + const uuid_t config_hash_id, + time_t when, + STRING *name, + STRING *chart, + STRING *chart_context, + STRING *chart_name, + STRING *class, + STRING *component, + STRING *type, + STRING *exec, + STRING *recipient, + time_t duration, + NETDATA_DOUBLE old_value, + NETDATA_DOUBLE new_value, + RRDCALC_STATUS old_status, + RRDCALC_STATUS new_status, + STRING *source, + STRING *units, + STRING *summary, + STRING *info, + int delay, + HEALTH_ENTRY_FLAGS flags +) { + + if (duration < 0) + duration = 0; + + netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", host->health_log.next_log_id); + + ALARM_ENTRY *ae = callocz(1, sizeof(ALARM_ENTRY)); + ae->name = string_dup(name); + ae->chart = string_dup(chart); + ae->chart_context = string_dup(chart_context); + ae->chart_name = string_dup(chart_name); + + uuid_copy(ae->config_hash_id, *((uuid_t *) config_hash_id)); + + uuid_generate_random(ae->transition_id); + ae->global_id = now_realtime_usec(); + + ae->classification = string_dup(class); + ae->component = string_dup(component); + ae->type = string_dup(type); + ae->exec = string_dup(exec); + ae->recipient = string_dup(recipient); + ae->source = string_dup(source); + ae->units = string_dup(units); + + ae->unique_id = host->health_log.next_log_id++; + ae->alarm_id = alarm_id; + ae->alarm_event_id = alarm_event_id; + ae->when = when; + ae->old_value = old_value; + ae->new_value = new_value; + + char value_string[100 + 1]; + ae->old_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->old_value, ae_units(ae), -1)); + ae->new_value_string = string_strdupz(format_value_and_unit(value_string, 100, ae->new_value, ae_units(ae), -1)); + + ae->summary = string_dup(summary); + ae->info = string_dup(info); + ae->old_status = old_status; + ae->new_status = new_status; + ae->duration = duration; + ae->delay = delay; + ae->delay_up_to_timestamp = when + delay; + ae->flags |= flags; + + ae->last_repeat = 0; + + if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) + ae->non_clear_duration += ae->duration; + + return ae; +} + +inline void health_alarm_log_add_entry( + RRDHOST *host, + ALARM_ENTRY *ae +) { + netdata_log_debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id); + + __atomic_add_fetch(&host->health_transitions, 1, __ATOMIC_RELAXED); + + // link it + rw_spinlock_write_lock(&host->health_log.spinlock); + ae->next = host->health_log.alarms; + host->health_log.alarms = ae; + host->health_log.count++; + rw_spinlock_write_unlock(&host->health_log.spinlock); + + // match previous alarms + rw_spinlock_read_lock(&host->health_log.spinlock); + ALARM_ENTRY *t; + for(t = host->health_log.alarms ; t ; t = t->next) { + if(t != ae && t->alarm_id == ae->alarm_id) { + if(!(t->flags & HEALTH_ENTRY_FLAG_UPDATED) && !t->updated_by_id) { + t->flags |= HEALTH_ENTRY_FLAG_UPDATED; + t->updated_by_id = ae->unique_id; + ae->updates_id = t->unique_id; + + if((t->new_status == RRDCALC_STATUS_WARNING || t->new_status == RRDCALC_STATUS_CRITICAL) && + (t->old_status == RRDCALC_STATUS_WARNING || t->old_status == RRDCALC_STATUS_CRITICAL)) + ae->non_clear_duration += t->non_clear_duration; + + health_alarm_log_save(host, t); + } + + // no need to continue + break; + } + } + rw_spinlock_read_unlock(&host->health_log.spinlock); + + health_alarm_log_save(host, ae); +} + +inline void health_alarm_log_free_one_nochecks_nounlink(ALARM_ENTRY *ae) { + string_freez(ae->name); + string_freez(ae->chart); + string_freez(ae->chart_context); + string_freez(ae->classification); + string_freez(ae->component); + string_freez(ae->type); + string_freez(ae->exec); + string_freez(ae->recipient); + string_freez(ae->source); + string_freez(ae->units); + string_freez(ae->info); + string_freez(ae->old_value_string); + string_freez(ae->new_value_string); + freez(ae); +} + +inline void health_alarm_log_free(RRDHOST *host) { + rw_spinlock_write_lock(&host->health_log.spinlock); + + ALARM_ENTRY *ae; + while((ae = host->health_log.alarms)) { + host->health_log.alarms = ae->next; + health_alarm_log_free_one_nochecks_nounlink(ae); + } + + rw_spinlock_write_unlock(&host->health_log.spinlock); +} |