summary refs log tree commit diff stats
path: root/health/health.c
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.c')
-rw-r--r-- health/health.c 181
1 files changed, 149 insertions, 32 deletions
diff --git a/health/health.c b/health/health.c
index f92a1ba6b..55bd72843 100644
--- a/health/health.c
+++ b/health/health.c
@@ -13,18 +13,74 @@ unsigned int default_health_enabled = 1;
// ----------------------------------------------------------------------------
// health initialization
+/**
+ * User Config directory
+ *
+ * Get the config directory for health and return it.
+ *
+ * @return a pointer to the user config directory
+ */
inline char *health_user_config_dir(void) {
    // Build "<user config dir>/health.d"; it is handed to config_get() as its
    // third argument (presumably the fallback value - confirm against config_get()).
    char default_dir[FILENAME_MAX + 1];
    snprintfz(default_dir, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);

    char *configured_dir = config_get(CONFIG_SECTION_HEALTH, "health configuration directory", default_dir);
    return configured_dir;
}
+/**
+ * Stock Config Directory
+ *
+ * Get the Stock config directory and return it.
+ *
+ * @return a pointer to the stock config directory.
+ */
inline char *health_stock_config_dir(void) {
    // Build "<stock config dir>/health.d"; it is handed to config_get() as its
    // third argument (presumably the fallback value - confirm against config_get()).
    char default_dir[FILENAME_MAX + 1];
    snprintfz(default_dir, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);

    char *configured_dir = config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", default_dir);
    return configured_dir;
}
+/**
+ * Silencers init
+ *
+ * Read the health silencers file (silencers_filename) and pass its contents
+ * to json_parse() with health_silencers_json_read_callback as the callback.
+ *
+ * The file is read only when its size is greater than zero and smaller than
+ * HEALTH_SILENCERS_MAX_FILE_LEN. Every failure is logged and the function
+ * returns without parsing anything.
+ */
+void health_silencers_init(void) {
+    struct stat statbuf;
+    if (!stat(silencers_filename,&statbuf)) {
+        off_t length = statbuf.st_size;
+        // off_t is signed: test for '> 0' so that a negative size is rejected too
+        if (length > 0 && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
+            FILE *fd = fopen(silencers_filename, "r");
+            if (fd) {
+                // length is known to be positive here, so the conversion to size_t is safe
+                size_t bytes = (size_t)length;
+                char *str = mallocz((bytes + 1) * sizeof(char));
+                if(str) {
+                    size_t copied = fread(str, sizeof(char), bytes, fd);
+                    if (copied == bytes) {
+                        // terminate the buffer before handing it to the parser
+                        str[bytes] = 0x00;
+                        json_parse(str, NULL, health_silencers_json_read_callback);
+                        info("Parsed health silencers file %s", silencers_filename);
+                    } else {
+                        error("Cannot read the data from health silencers file %s", silencers_filename);
+                    }
+                    freez(str);
+                }
+                fclose(fd);
+            } else {
+                error("Cannot open the file %s",silencers_filename);
+            }
+        } else {
+            // off_t has no portable printf conversion: widen it to long long and print with %lld
+            error("Health silencers file %s has the size %lld that is out of range[ 1 , %d ]. Aborting read.", silencers_filename, (long long)length, HEALTH_SILENCERS_MAX_FILE_LEN);
+        }
+    } else {
+        error("Cannot open the file %s",silencers_filename);
+    }
+}
+
+/**
+ * Health Init
+ *
+ * Initialize the health thread.
+ */
void health_init(void) {
debug(D_HEALTH, "Health configuration initializing");
@@ -32,11 +88,20 @@ void health_init(void) {
debug(D_HEALTH, "Health is disabled.");
return;
}
+
+ health_silencers_init();
}
// ----------------------------------------------------------------------------
// re-load health configuration
+/**
+ * Reload host
+ *
+ * Reload configuration for a specific host.
+ *
+ * @param host the host whose configuration will be reloaded.
+ */
void health_reload_host(RRDHOST *host) {
if(unlikely(!host->health_enabled))
return;
@@ -84,6 +149,11 @@ void health_reload_host(RRDHOST *host) {
rrdhost_unlock(host);
}
+/**
+ * Reload
+ *
+ * Reload the host configuration for all hosts.
+ */
void health_reload(void) {
rrd_rdlock();
@@ -255,17 +325,18 @@ static inline void health_alarm_log_process(RRDHOST *host) {
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
ALARM_ENTRY *ae;
- for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id ; ae = ae->next) {
- if(unlikely(
- !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
- !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
+ for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
+ if(likely(!alarm_entry_isrepeating(host, ae))) {
+ if(unlikely(
+ !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
+ !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
)) {
+ if(unlikely(ae->unique_id < first_waiting))
+ first_waiting = ae->unique_id;
- if(unlikely(ae->unique_id < first_waiting))
- first_waiting = ae->unique_id;
-
- if(likely(now >= ae->delay_up_to_timestamp))
- health_process_notifications(host, ae);
+ if(likely(now >= ae->delay_up_to_timestamp))
+ health_process_notifications(host, ae);
+ }
}
}
@@ -294,10 +365,12 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *t = ae->next;
- health_alarm_log_free_one_nochecks_nounlink(ae);
+ if(likely(!alarm_entry_isrepeating(host, ae))) {
+ health_alarm_log_free_one_nochecks_nounlink(ae);
+ host->health_log.count--;
+ }
ae = t;
- host->health_log.count--;
}
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
@@ -411,7 +484,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
} else {
debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
- , (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
+ , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
, rc->name
, (rc->rrdset)?rc->rrdset->context:""
, rc->chart
@@ -425,6 +498,16 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
return STYPE_NONE;
}
+/**
+ * Update Disabled Silenced
+ *
+ * Update the variable rrdcalc_flags of the structure RRDCALC according to the values of the host structure
+ *
+ * @param host structure that contains information about the host monitored.
+ * @param rc structure with information about the alarm
+ *
+ * @return It returns 1 in case rrdcalc_flags is DISABLED, or 0 otherwise
+ */
int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
// Clear the flags
@@ -454,6 +537,15 @@ int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
return 0;
}
+/**
+ * Health Main
+ *
+ * The main thread of the health system. In this function all the alarms will be processed.
+ *
+ * @param ptr is a pointer to the netdata_static_thread structure.
+ *
+ * @return It always returns NULL
+ */
void *health_main(void *ptr) {
netdata_thread_cleanup_push(health_main_cleanup, ptr);
@@ -464,12 +556,6 @@ void *health_main(void *ptr) {
time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
unsigned int loop = 0;
-
- silencers = mallocz(sizeof(SILENCERS));
- silencers->all_alarms=0;
- silencers->stype=STYPE_NONE;
- silencers->silencers=NULL;
-
while(!netdata_exit) {
loop++;
debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
@@ -756,20 +842,22 @@ void *health_main(void *ptr) {
rc->delay_last = delay;
rc->delay_up_to_timestamp = now + delay;
- health_alarm_log(
- host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
-
- );
-
- rc->last_status_change = now;
- rc->status = status;
+ if(likely(!rrdcalc_isrepeating(rc))) {
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
+ rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ )
+ );
+ health_alarm_log(host, ae);
+ }
+ rc->last_status_change = now;
+ rc->old_status = rc->status;
+ rc->status = status;
}
rc->last_updated = now;
@@ -779,6 +867,35 @@ void *health_main(void *ptr) {
next_run = rc->next_update;
}
+ // process repeating alarms
+ RRDCALC *rc;
+ for(rc = host->alarms; rc ; rc = rc->next) {
+ int repeat_every = 0;
+ if(unlikely(rrdcalc_isrepeating(rc))) {
+ if(unlikely(rc->status == RRDCALC_STATUS_WARNING))
+ repeat_every = rc->warn_repeat_every;
+ else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL))
+ repeat_every = rc->crit_repeat_every;
+ }
+ if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
+ rc->last_repeat = now;
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
+ rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ )
+ );
+ ae->last_repeat = rc->last_repeat;
+ health_process_notifications(host, ae);
+ debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
+ health_alarm_log_free_one_nochecks_nounlink(ae);
+ }
+ }
+
rrdhost_unlock(host);
}