diff options
Diffstat (limited to 'health/health.c')
-rw-r--r-- | health/health.c | 181 |
1 files changed, 149 insertions, 32 deletions
diff --git a/health/health.c b/health/health.c index f92a1ba6..55bd7284 100644 --- a/health/health.c +++ b/health/health.c @@ -13,18 +13,74 @@ unsigned int default_health_enabled = 1; // ---------------------------------------------------------------------------- // health initialization +/** + * User Config directory + * + * Get the config directory for health and return it. + * + * @return a pointer to the user config directory + */ inline char *health_user_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir); return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer); } +/** + * Stock Config Directory + * + * Get the Stock config directory and return it. + * + * @return a pointer to the stock config directory. + */ inline char *health_stock_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir); return config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", buffer); } +/** + * Silencers init + * + * Function used to initialize the silencer structure. + */ +void health_silencers_init(void) { + struct stat statbuf; + if (!stat(silencers_filename,&statbuf)) { + off_t length = statbuf.st_size; + if (length && length < HEALTH_SILENCERS_MAX_FILE_LEN) { + FILE *fd = fopen(silencers_filename, "r"); + if (fd) { + char *str = mallocz((length+1)* sizeof(char)); + if(str) { + size_t copied; + copied = fread(str, sizeof(char), length, fd); + if (copied == (length* sizeof(char))) { + str[length] = 0x00; + json_parse(str, NULL, health_silencers_json_read_callback); + info("Parsed health silencers file %s", silencers_filename); + } else { + error("Cannot read the data from health silencers file %s", silencers_filename); + } + freez(str); + } + fclose(fd); + } else { + error("Cannot open the file %s",silencers_filename); + } + } else { + error("Health silencers file %s has the size %ld that is out of range[ 1 , %d ]. Aborting read.", silencers_filename, length, HEALTH_SILENCERS_MAX_FILE_LEN); + } + } else { + error("Cannot open the file %s",silencers_filename); + } +} + +/** + * Health Init + * + * Initialize the health thread. + */ void health_init(void) { debug(D_HEALTH, "Health configuration initializing"); @@ -32,11 +88,20 @@ void health_init(void) { debug(D_HEALTH, "Health is disabled."); return; } + + health_silencers_init(); } // ---------------------------------------------------------------------------- // re-load health configuration +/** + * Reload host + * + * Reload configuration for a specific host. + * + * @param host the structure of the host that the function will reload the configuration. + */ void health_reload_host(RRDHOST *host) { if(unlikely(!host->health_enabled)) return; @@ -84,6 +149,11 @@ void health_reload_host(RRDHOST *host) { rrdhost_unlock(host); } +/** + * Reload + * + * Reload the host configuration for all hosts. + */ void health_reload(void) { rrd_rdlock(); @@ -255,17 +325,18 @@ static inline void health_alarm_log_process(RRDHOST *host) { netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *ae; - for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id ; ae = ae->next) { - if(unlikely( - !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && - !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) + for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { + if(likely(!alarm_entry_isrepeating(host, ae))) { + if(unlikely( + !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && + !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) )) { + if(unlikely(ae->unique_id < first_waiting)) + first_waiting = ae->unique_id; - if(unlikely(ae->unique_id < first_waiting)) - first_waiting = ae->unique_id; - - if(likely(now >= ae->delay_up_to_timestamp)) - health_process_notifications(host, ae); + if(likely(now >= ae->delay_up_to_timestamp)) + health_process_notifications(host, ae); + } } } @@ -294,10 +365,12 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *t = ae->next; - health_alarm_log_free_one_nochecks_nounlink(ae); + if(likely(!alarm_entry_isrepeating(host, ae))) { + health_alarm_log_free_one_nochecks_nounlink(ae); + host->health_log.count--; + } ae = t; - host->health_log.count--; } netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); @@ -411,7 +484,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name); } else { debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" - , (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" + , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" , rc->name , (rc->rrdset)?rc->rrdset->context:"" , rc->chart @@ -425,6 +498,16 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { return STYPE_NONE; } +/** + * Update Disabled Silenced + * + * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure + * + * @param host structure that contains information about the host monitored. + * @param rc structure with information about the alarm + * + * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise + */ int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { uint32_t rrdcalc_flags_old = rc->rrdcalc_flags; // Clear the flags @@ -454,6 +537,15 @@ int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { return 0; } +/** + * Health Main + * + * The main thread of the health system. In this function all the alarms will be processed. + * + * @param ptr is a pointer to the netdata_static_thread structure. + * + * @return It always returns NULL + */ void *health_main(void *ptr) { netdata_thread_cleanup_push(health_main_cleanup, ptr); @@ -464,12 +556,6 @@ void *health_main(void *ptr) { time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); unsigned int loop = 0; - - silencers = mallocz(sizeof(SILENCERS)); - silencers->all_alarms=0; - silencers->stype=STYPE_NONE; - silencers->silencers=NULL; - while(!netdata_exit) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); @@ -756,20 +842,22 @@ void *health_main(void *ptr) { rc->delay_last = delay; rc->delay_up_to_timestamp = now + delay; - health_alarm_log( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, - rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - - ); - - rc->last_status_change = now; - rc->status = status; + if(likely(!rrdcalc_isrepeating(rc))) { + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + health_alarm_log(host, ae); + } + rc->last_status_change = now; + rc->old_status = rc->status; + rc->status = status; } rc->last_updated = now; @@ -779,6 +867,35 @@ void *health_main(void *ptr) { next_run = rc->next_update; } + // process repeating alarms + RRDCALC *rc; + for(rc = host->alarms; rc ; rc = rc->next) { + int repeat_every = 0; + if(unlikely(rrdcalc_isrepeating(rc))) { + if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) + repeat_every = rc->warn_repeat_every; + else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) + repeat_every = rc->crit_repeat_every; + } + if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + rc->last_repeat = now; + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + ae->last_repeat = rc->last_repeat; + health_process_notifications(host, ae); + debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + health_alarm_log_free_one_nochecks_nounlink(ae); + } + } + rrdhost_unlock(host); } |