diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2019-07-08 20:14:42 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2019-07-08 20:14:42 +0000 |
commit | 4f88e1a9be89a257fd6ed3045703db6e900027ee (patch) | |
tree | 518eb3c3aa1dce9ea281d02e0fd3cc01a9e7913f /health | |
parent | Adding upstream version 1.15.0. (diff) | |
download | netdata-upstream/1.16.0.tar.xz netdata-upstream/1.16.0.zip |
Adding upstream version 1.16.0.upstream/1.16.0
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health')
-rw-r--r-- | health/Makefile.am | 6 | ||||
-rw-r--r-- | health/README.md | 22 | ||||
-rw-r--r-- | health/health.c | 181 | ||||
-rw-r--r-- | health/health.d/dbengine.conf | 26 | ||||
-rw-r--r-- | health/health.d/disks.conf | 4 | ||||
-rw-r--r-- | health/health.d/dnsmasq_dhcp.conf | 12 | ||||
-rw-r--r-- | health/health.d/pihole.conf | 67 | ||||
-rw-r--r-- | health/health.d/processes.conf | 27 | ||||
-rw-r--r-- | health/health.d/ram.conf | 2 | ||||
-rw-r--r-- | health/health.d/riakkv.conf | 80 | ||||
-rw-r--r-- | health/health.d/wmi.conf | 130 | ||||
-rw-r--r-- | health/health.d/x509check.conf | 14 | ||||
-rw-r--r-- | health/health.h | 46 | ||||
-rw-r--r-- | health/health_config.c | 138 | ||||
-rw-r--r-- | health/health_json.c | 4 | ||||
-rw-r--r-- | health/health_log.c | 57 | ||||
-rw-r--r-- | health/notifications/README.md | 3 | ||||
-rwxr-xr-x | health/notifications/alarm-notify.sh.in | 9 | ||||
-rw-r--r-- | health/notifications/custom/README.md | 84 | ||||
-rw-r--r-- | health/notifications/email/README.md | 2 |
20 files changed, 731 insertions, 183 deletions
diff --git a/health/Makefile.am b/health/Makefile.am index 62a4c6d33..5310bd8aa 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -35,6 +35,7 @@ dist_healthconfig_DATA = \ health.d/cpu.conf \ health.d/couchdb.conf \ health.d/disks.conf \ + health.d/dnsmasq_dhcp.conf \ health.d/dockerd.conf \ health.d/elasticsearch.conf \ health.d/entropy.conf \ @@ -62,13 +63,16 @@ dist_healthconfig_DATA = \ health.d/netfilter.conf \ health.d/nginx.conf \ health.d/nginx_plus.conf \ + health.d/pihole.conf \ health.d/phpfpm.conf \ health.d/portcheck.conf \ health.d/postgres.conf \ + health.d/processes.conf \ health.d/qos.conf \ health.d/ram.conf \ health.d/redis.conf \ health.d/retroshare.conf \ + health.d/riakkv.conf \ health.d/softnet.conf \ health.d/squid.conf \ health.d/stiebeleltron.conf \ @@ -81,6 +85,8 @@ dist_healthconfig_DATA = \ health.d/udp_errors.conf \ health.d/varnish.conf \ health.d/web_log.conf \ + health.d/wmi.conf \ health.d/x509check.conf \ health.d/zfs.conf \ + health.d/dbengine.conf \ $(NULL) diff --git a/health/README.md b/health/README.md index 54f6a3e1f..81cc043d0 100644 --- a/health/README.md +++ b/health/README.md @@ -11,7 +11,6 @@ packet dropped). Netdata also supports alarm **templates**, so that an alarm can be attached to all the charts of the same context (i.e. all network interfaces, or all disks, or all mysql servers, etc.). - Each alarm can execute a single query to the database using statistical algorithms against past data, but alarms can be combined. So, if you need 2 queries in the database, you can combine 2 alarms together (both will run a query to the database, and the results can be combined). @@ -342,6 +341,24 @@ delay: [[[up U] [down D] multiplier M] max X] their matching one) and a delay is in place. - All are reset to their defaults when the alarm switches state without a delay in place. +--- + +#### Alarm line `repeat` + +Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration. + +Format: + +``` +repeat: [off] [warning DURATION] [critical DURATION] +``` + +* `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings has been enabled in health configuration. +* `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating notification for WARNING mode. +* `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating notification for CRITICAL mode. + +--- + #### Alarm line `option` The only possible value for the `option` line is @@ -567,12 +584,15 @@ template: disk_full_percent every: 1m warn: $this > 80 crit: $this > 95 + repeat: warning 120s critical 10s ``` `$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard. So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage. +This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also repeats notifications every 2 minutes if the alarm goes into WARNING mode. + ### Example 3 Predict if any disk will run out of space in the near future. diff --git a/health/health.c b/health/health.c index f92a1ba6b..55bd72843 100644 --- a/health/health.c +++ b/health/health.c @@ -13,18 +13,74 @@ unsigned int default_health_enabled = 1; // ---------------------------------------------------------------------------- // health initialization +/** + * User Config directory + * + * Get the config directory for health and return it. + * + * @return a pointer to the user config directory + */ inline char *health_user_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir); return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer); } +/** + * Stock Config Directory + * + * Get the Stock config directory and return it. + * + * @return a pointer to the stock config directory. + */ inline char *health_stock_config_dir(void) { char buffer[FILENAME_MAX + 1]; snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir); return config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", buffer); } +/** + * Silencers init + * + * Function used to initialize the silencer structure. + */ +void health_silencers_init(void) { + struct stat statbuf; + if (!stat(silencers_filename,&statbuf)) { + off_t length = statbuf.st_size; + if (length && length < HEALTH_SILENCERS_MAX_FILE_LEN) { + FILE *fd = fopen(silencers_filename, "r"); + if (fd) { + char *str = mallocz((length+1)* sizeof(char)); + if(str) { + size_t copied; + copied = fread(str, sizeof(char), length, fd); + if (copied == (length* sizeof(char))) { + str[length] = 0x00; + json_parse(str, NULL, health_silencers_json_read_callback); + info("Parsed health silencers file %s", silencers_filename); + } else { + error("Cannot read the data from health silencers file %s", silencers_filename); + } + freez(str); + } + fclose(fd); + } else { + error("Cannot open the file %s",silencers_filename); + } + } else { + error("Health silencers file %s has the size %ld that is out of range[ 1 , %d ]. Aborting read.", silencers_filename, length, HEALTH_SILENCERS_MAX_FILE_LEN); + } + } else { + error("Cannot open the file %s",silencers_filename); + } +} + +/** + * Health Init + * + * Initialize the health thread. + */ void health_init(void) { debug(D_HEALTH, "Health configuration initializing"); @@ -32,11 +88,20 @@ void health_init(void) { debug(D_HEALTH, "Health is disabled."); return; } + + health_silencers_init(); } // ---------------------------------------------------------------------------- // re-load health configuration +/** + * Reload host + * + * Reload configuration for a specific host. + * + * @param host the structure of the host that the function will reload the configuration. + */ void health_reload_host(RRDHOST *host) { if(unlikely(!host->health_enabled)) return; @@ -84,6 +149,11 @@ void health_reload_host(RRDHOST *host) { rrdhost_unlock(host); } +/** + * Reload + * + * Reload the host configuration for all hosts. + */ void health_reload(void) { rrd_rdlock(); @@ -255,17 +325,18 @@ static inline void health_alarm_log_process(RRDHOST *host) { netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); ALARM_ENTRY *ae; - for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id ; ae = ae->next) { - if(unlikely( - !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && - !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) + for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) { + if(likely(!alarm_entry_isrepeating(host, ae))) { + if(unlikely( + !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) && + !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED) )) { + if(unlikely(ae->unique_id < first_waiting)) + first_waiting = ae->unique_id; - if(unlikely(ae->unique_id < first_waiting)) - first_waiting = ae->unique_id; - - if(likely(now >= ae->delay_up_to_timestamp)) - health_process_notifications(host, ae); + if(likely(now >= ae->delay_up_to_timestamp)) + health_process_notifications(host, ae); + } } } @@ -294,10 +365,12 @@ static inline void health_alarm_log_process(RRDHOST *host) { ALARM_ENTRY *t = ae->next; - health_alarm_log_free_one_nochecks_nounlink(ae); + if(likely(!alarm_entry_isrepeating(host, ae))) { + health_alarm_log_free_one_nochecks_nounlink(ae); + host->health_log.count--; + } ae = t; - host->health_log.count--; } netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock); @@ -411,7 +484,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name); } else { debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s" - , (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" + , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced" , rc->name , (rc->rrdset)?rc->rrdset->context:"" , rc->chart @@ -425,6 +498,16 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) { return STYPE_NONE; } +/** + * Update Disabled Silenced + * + * Update the variable rrdcalc_flags of the structure RRDCALC according with the values of the host structure + * + * @param host structure that contains information about the host monitored. + * @param rc structure with information about the alarm + * + * @return It returns 1 case rrdcalc_flags is DISABLED or 0 otherwise + */ int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { uint32_t rrdcalc_flags_old = rc->rrdcalc_flags; // Clear the flags @@ -454,6 +537,15 @@ int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) { return 0; } +/** + * Health Main + * + * The main thread of the health system. In this function all the alarms will be processed. + * + * @param ptr is a pointer to the netdata_static_thread structure. + * + * @return It always returns NULL + */ void *health_main(void *ptr) { netdata_thread_cleanup_push(health_main_cleanup, ptr); @@ -464,12 +556,6 @@ void *health_main(void *ptr) { time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60); unsigned int loop = 0; - - silencers = mallocz(sizeof(SILENCERS)); - silencers->all_alarms=0; - silencers->stype=STYPE_NONE; - silencers->silencers=NULL; - while(!netdata_exit) { loop++; debug(D_HEALTH, "Health monitoring iteration no %u started", loop); @@ -756,20 +842,22 @@ void *health_main(void *ptr) { rc->delay_last = delay; rc->delay_up_to_timestamp = now + delay; - health_alarm_log( - host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, - rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, - rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, - rc->delay_last, - ( - ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | - ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) - ) - - ); - - rc->last_status_change = now; - rc->status = status; + if(likely(!rrdcalc_isrepeating(rc))) { + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + health_alarm_log(host, ae); + } + rc->last_status_change = now; + rc->old_status = rc->status; + rc->status = status; } rc->last_updated = now; @@ -779,6 +867,35 @@ void *health_main(void *ptr) { next_run = rc->next_update; } + // process repeating alarms + RRDCALC *rc; + for(rc = host->alarms; rc ; rc = rc->next) { + int repeat_every = 0; + if(unlikely(rrdcalc_isrepeating(rc))) { + if(unlikely(rc->status == RRDCALC_STATUS_WARNING)) + repeat_every = rc->warn_repeat_every; + else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL)) + repeat_every = rc->crit_repeat_every; + } + if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { + rc->last_repeat = now; + ALARM_ENTRY *ae = health_create_alarm_entry( + host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id, + rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change, + rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info, + rc->delay_last, + ( + ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) | + ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0) + ) + ); + ae->last_repeat = rc->last_repeat; + health_process_notifications(host, ae); + debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id); + health_alarm_log_free_one_nochecks_nounlink(ae); + } + } + rrdhost_unlock(host); } diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf new file mode 100644 index 000000000..7a623ba2b --- /dev/null +++ b/health/health.d/dbengine.conf @@ -0,0 +1,26 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 10min_dbengine_global_fs_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of FS errors + units: errors + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + to: sysadmin + + alarm: 10min_dbengine_global_io_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of I/O errors + units: errors + every: 10s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of IO errors dbengine came across the last 10 minutes (out of space, bad disk etc) + to: sysadmin diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 26f85848a..9c194ced2 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -13,7 +13,7 @@ template: disk_space_usage on: disk.space os: linux freebsd hosts: * -families: * +families: !/dev !/dev/* !/run !/run/* * calc: $used * 100 / ($avail + $used) units: % every: 1m @@ -27,7 +27,7 @@ template: disk_inode_usage on: disk.inodes os: linux freebsd hosts: * -families: * +families: !/dev !/dev/* !/run !/run/* * calc: $used * 100 / ($avail + $used) units: % every: 1m diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf new file mode 100644 index 000000000..b7eb4e0a3 --- /dev/null +++ b/health/health.d/dnsmasq_dhcp.conf @@ -0,0 +1,12 @@ + # dhcp-range utilization + + template: dnsmasq_dhcp_dhcp_range_utilization + on: dnsmasq_dhcp.dhcp_range_utilization + every: 10s + units: % + calc: $used + warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) + crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + delay: down 5m + info: dhcp-range utilization above threshold! + to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf new file mode 100644 index 000000000..4a1217239 --- /dev/null +++ b/health/health.d/pihole.conf @@ -0,0 +1,67 @@ + + # Make sure Pi-hole is responding. + +template: pihole_last_collected_secs + on: pihole.dns_queries_total + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + # Blocked DNS queries. + + template: pihole_blocked_queries + on: pihole.dns_queries_percentage + every: 10s + units: % + calc: $blocked + warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) + crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) ) + delay: up 2m down 5m + info: percentage of blocked dns queries for the last 24 hour + to: sysadmin + + + # Blocklist last update time. + # Default update interval is a week. + + template: pihole_blocklist_last_update + on: pihole.blocklist_last_update + every: 10s + units: seconds + calc: $ago + warn: $this > 60 * 60 * 24 * 8 + crit: $this > 60 * 60 * 24 * 8 * 2 + info: blocklist last update time + to: sysadmin + + + # Gravity file check (gravity.list). + + template: pihole_blocklist_gravity_file + on: pihole.blocklist_last_update + every: 10s + units: boolean + calc: $file_exists + crit: $this != 1 + delay: up 2m down 5m + info: gravity file existence + to: sysadmin + + + # Pi-hole's ability to block unwanted domains. + # Should be enabled. The whole point of Pi-hole! + + template: pihole_status + on: pihole.unwanted_domains_blocking_status + every: 10s + units: boolean + calc: $enabled + warn: $this != 1 + delay: up 2m down 5m + info: unwanted domains blocking status + to: sysadmin diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf new file mode 100644 index 000000000..d96998fdf --- /dev/null +++ b/health/health.d/processes.conf @@ -0,0 +1,27 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: active_processes_limit_freebsd + on: system.active_processes + os: freebsd + hosts: * + calc: $active + units: processes + every: 5s + warn: $this > (($status >= $WARNING) ? (75000) : (80000)) + crit: $this > (($status == $CRITICAL) ? (85000) : (90000)) + delay: down 5m multiplier 1.5 max 1h + info: the number of active processes + to: sysadmin + + alarm: active_processes_limit + on: system.active_processes + os: linux + hosts: * + calc: $active + units: processes + every: 5s + warn: $this > (($status >= $WARNING) ? (25000) : (26000)) + crit: $this > (($status == $CRITICAL) ? (28000) : (30000)) + delay: down 5m multiplier 1.5 max 1h + info: number of active processes + to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 93883f73b..4e41bb496 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -27,7 +27,7 @@ on: mem.available os: linux hosts: * - calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) units: % every: 10s warn: $this < (($status >= $WARNING) ? (15) : (10)) diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf new file mode 100644 index 000000000..745302778 --- /dev/null +++ b/health/health.d/riakkv.conf @@ -0,0 +1,80 @@ +# Ensure that Riak is running. template: riak_last_collected_secs +template: riak_last_collected_secs + on: riak.kv.throughput + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + +# Warn if a list keys operation is running. +template: riak_list_keys_active + on: riak.core.fsm_active + calc: $list_fsm_active + units: state machines + every: 10s + warn: $list_fsm_active > 0 + info: number of currently running list keys finite state machines + to: dba + + +## Timing healthchecks +# KV GET +template: 1h_kv_get_mean_latency + on: riak.kv.latency.get + calc: $node_get_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: mean average KV GET latency over the last hour + +template: riak_kv_get_slow + on: riak.kv.latency.get + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($1h_kv_get_mean_latency * 2) ) + crit: ($this > ($1h_kv_get_mean_latency * 3) ) + info: average KV GET time over the last 3 minutes, compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + +# KV PUT +template: 1h_kv_put_mean_latency + on: riak.kv.latency.put + calc: $node_put_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: mean average KV PUT latency over the last hour + +template: riak_kv_put_slow + on: riak.kv.latency.put + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($1h_kv_put_mean_latency * 2) ) + crit: ($this > ($1h_kv_put_mean_latency * 3) ) + info: average KV PUT time over the last 3 minutes, compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + + +## VM healthchecks + +# Default Erlang VM process limit: 262144 +# On systems observed, this is < 2000, but may grow depending on load. +template: riak_vm_high_process_count + on: riak.vm + calc: $sys_process_count + units: processes + every: 10s + warn: $this > 10000 + crit: $this > 100000 + info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144) + to: dba diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf new file mode 100644 index 000000000..0441fc1f3 --- /dev/null +++ b/health/health.d/wmi.conf @@ -0,0 +1,130 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +## Availability + +template: wmi_last_collected_secs + on: cpu.collector_duration + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +## CPU + +template: wmi_10min_cpu_usage + on: wmi.cpu_utilization_total + os: linux + hosts: * + lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: cpu utilization for the last 10 minutes + to: sysadmin + + +## Memory + +template: wmi_ram_in_use + on: wmi.memory_utilization + os: linux + hosts: * + calc: ($used) * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used RAM + to: sysadmin + +template: wmi_swap_in_use + on: wmi.memory_swap_utilization + os: linux + hosts: * + calc: ($used) * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used Swap + to: sysadmin + + +## Network + +template: inbound_packets_discarded + on: wmi.net_discarded + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound discarded packets in the last 10 minutes + to: sysadmin + +template: outbound_packets_discarded + on: wmi.net_discarded + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound discarded packets in the last 10 minutes + to: sysadmin + +template: inbound_packets_errors + on: wmi.net_errors + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound errors in the last 10 minutes + to: sysadmin + +template: outbound_packets_errors + on: wmi.net_errors + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound errors in the last 10 minutes + to: sysadmin + + +## Disk + +template: wmi_disk_in_use + on: wmi.logical_disk_utilization + os: linux + hosts: * + calc: ($used) * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used disk space + to: sysadmin diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index dc0e6c695..a56f48fc3 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -1,4 +1,18 @@ +# make sure x509check is running + +template: x509check_last_collected_secs + on: x509check.time_until_expiration + calc: $now - $last_collected_t + units: seconds ago + every: 60s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + template: x509check_days_until_expiration on: x509check.time_until_expiration calc: $expiry diff --git a/health/health.h b/health/health.h index 1511f3648..6920d12de 100644 --- a/health/health.h +++ b/health/health.h @@ -35,16 +35,7 @@ extern unsigned int default_health_enabled; #define HEALTH_LISTEN_BACKLOG 4096 #endif -#define HEALTH_ALARM_KEY "alarm" -#define HEALTH_TEMPLATE_KEY "template" #define HEALTH_ON_KEY "on" -#define HEALTH_CONTEXT_KEY "context" -#define HEALTH_CHART_KEY "chart" -#define HEALTH_HOST_KEY "hosts" -#define HEALTH_OS_KEY "os" -#define HEALTH_FAMILIES_KEY "families" -#define HEALTH_LOOKUP_KEY "lookup" -#define HEALTH_CALC_KEY "calc" #define HEALTH_EVERY_KEY "every" #define HEALTH_GREEN_KEY "green" #define HEALTH_RED_KEY "red" @@ -57,38 +48,9 @@ extern unsigned int default_health_enabled; #define HEALTH_DELAY_KEY "delay" #define HEALTH_OPTIONS_KEY "options" -typedef struct silencer { - char *alarms; - SIMPLE_PATTERN *alarms_pattern; +#define HEALTH_SILENCERS_MAX_FILE_LEN 10000 - char *hosts; - SIMPLE_PATTERN *hosts_pattern; - - char *contexts; - SIMPLE_PATTERN *contexts_pattern; - - char *charts; - SIMPLE_PATTERN *charts_pattern; - - char *families; - SIMPLE_PATTERN *families_pattern; - - struct silencer *next; -} SILENCER; - -typedef enum silence_type { - STYPE_NONE, - STYPE_DISABLE_ALARMS, - STYPE_SILENCE_NOTIFICATIONS -} SILENCE_TYPE; - -typedef struct silencers { - int all_alarms; - SILENCE_TYPE stype; - SILENCER *silencers; -} SILENCERS; - -SILENCERS *silencers; +char *silencers_filename; extern void health_init(void); extern void *health_main(void *ptr); @@ -108,7 +70,7 @@ extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae); extern ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename); extern void health_alarm_log_load(RRDHOST *host); -extern void health_alarm_log( +extern ALARM_ENTRY* health_create_alarm_entry( RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, @@ -129,6 +91,8 @@ extern void health_alarm_log( int delay, uint32_t flags); +extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae); + extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath); extern char *health_user_config_dir(void); extern char *health_stock_config_dir(void); diff --git a/health/health_config.c b/health/health_config.c index 35fde90bc..0d6e77a9e 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -23,6 +23,7 @@ #define HEALTH_INFO_KEY "info" #define HEALTH_DELAY_KEY "delay" #define HEALTH_OPTIONS_KEY "options" +#define HEALTH_REPEAT_KEY "repeat" static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { if(!rc->chart) { @@ -45,7 +46,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id); - debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", + debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", rc->chart?rc->chart:"NOCHART", rc->name, rc->id, @@ -66,10 +67,12 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) { rc->delay_up_duration, rc->delay_down_duration, rc->delay_max_duration, - rc->delay_multiplier + rc->delay_multiplier, + rc->warn_repeat_every, + rc->crit_repeat_every ); - rrdcalc_create_part2(host, rc); + rrdcalc_add_to_host(host, rc); return 1; } @@ -100,7 +103,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL } } - debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f", + debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u", rt->name, (rt->context)?rt->context:"NONE", (rt->exec)?rt->exec:"DEFAULT", @@ -120,7 +123,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL rt->delay_up_duration, rt->delay_down_duration, rt->delay_max_duration, - rt->delay_multiplier + rt->delay_multiplier, + rt->warn_repeat_every, + rt->crit_repeat_every ); if(likely(last)) { @@ -134,48 +139,6 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL return 1; } -static inline int health_parse_duration(char *string, int *result) { - // make sure it is a number - if(!*string || !(isdigit(*string) || *string == '+' || *string == '-')) { - *result = 0; - return 0; - } - - char *e = NULL; - calculated_number n = str2ld(string, &e); - if(e && *e) { - switch (*e) { - case 'Y': - *result = (int) (n * 86400 * 365); - break; - case 'M': - *result = (int) (n * 86400 * 30); - break; - case 'w': - *result = (int) (n * 86400 * 7); - break; - case 'd': - *result = (int) (n * 86400); - break; - case 'h': - *result = (int) (n * 3600); - break; - case 'm': - *result = (int) (n * 60); - break; - - default: - case 's': - *result = (int) (n); - break; - } - } - else - *result = (int)(n); - - return 1; -} - static inline int health_parse_delay( size_t line, const char *filename, char *string, int *delay_up_duration, @@ -202,14 +165,14 @@ static inline int health_parse_delay( while(*s && isspace(*s)) *s++ = '\0'; if(!strcasecmp(key, "up")) { - if (!health_parse_duration(value, delay_up_duration)) { + if (!config_parse_duration(value, delay_up_duration)) { error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", line, filename, value, key); } else given_up = 1; } else if(!strcasecmp(key, "down")) { - if (!health_parse_duration(value, delay_down_duration)) { + if (!config_parse_duration(value, delay_down_duration)) { error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", line, filename, value, key); } @@ -224,7 +187,7 @@ static inline int health_parse_delay( else given_multiplier = 1; } else if(!strcasecmp(key, "max")) { - if (!health_parse_duration(value, delay_max_duration)) { + if (!config_parse_duration(value, delay_max_duration)) { error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", line, filename, value, key); } @@ -285,6 +248,50 @@ static inline uint32_t health_parse_options(const char *s) { return options; } +static inline int health_parse_repeat( + size_t line, + const char *file, + char *string, + uint32_t *warn_repeat_every, + uint32_t *crit_repeat_every +) { + + char *s = string; + while(*s) { + char *key = s; + + while(*s && !isspace(*s)) s++; + while(*s && isspace(*s)) *s++ = '\0'; + + if(!*key) break; + + char *value = s; + while(*s && !isspace(*s)) s++; + while(*s && isspace(*s)) *s++ = '\0'; + + if(!strcasecmp(key, "off")) { + *warn_repeat_every = 0; + *crit_repeat_every = 0; + return 1; + } + if(!strcasecmp(key, "warning")) { + if (!config_parse_duration(value, (int*)warn_repeat_every)) { + error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); + } + } + else if(!strcasecmp(key, "critical")) { + if (!config_parse_duration(value, (int*)crit_repeat_every)) { + error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword", + line, file, value, key); + } + } + } + + return 1; +} + + static inline int health_parse_db_lookup( size_t line, const char *filename, char *string, RRDR_GROUPING *group_method, int *after, int *before, int *every, @@ -322,7 +329,7 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; - if(!health_parse_duration(key, after)) { + if(!config_parse_duration(key, after)) { error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method", line, filename, key); return 0; @@ -343,7 +350,7 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; - if (!health_parse_duration(value, before)) { + if (!config_parse_duration(value, before)) { error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", line, filename, value, key); } @@ -353,7 +360,7 @@ static inline int health_parse_db_lookup( while(*s && !isspace(*s)) s++; while(*s && isspace(*s)) *s++ = '\0'; - if (!health_parse_duration(value, every)) { + if (!config_parse_duration(value, every)) { error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword", line, filename, value, key); } @@ -430,7 +437,8 @@ static int health_readfile(const char *filename, void *data) { hash_info = 0, hash_recipient = 0, hash_delay = 0, - hash_options = 0; + hash_options = 0, + hash_repeat = 0; char buffer[HEALTH_CONF_MAX_LINE + 1]; @@ -454,6 +462,7 @@ static int health_readfile(const char *filename, void *data) { hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY); hash_delay = simple_uhash(HEALTH_DELAY_KEY); hash_options = simple_uhash(HEALTH_OPTIONS_KEY); + hash_repeat = simple_uhash(HEALTH_REPEAT_KEY); } FILE *fp = fopen(filename, "r"); @@ -481,7 +490,7 @@ static int health_readfile(const char *filename, void *data) { if(append < HEALTH_CONF_MAX_LINE) continue; else { - error("Health configuration has too long muli-line at line %zu of file '%s'.", line, filename); + error("Health configuration has too long multi-line at line %zu of file '%s'.", line, filename); } } append = 0; @@ -532,6 +541,9 @@ static int health_readfile(const char *filename, void *data) { rc->value = NAN; rc->old_value = NAN; rc->delay_multiplier = 1.0; + rc->old_status = RRDCALC_STATUS_UNINITIALIZED; + rc->warn_repeat_every = host->health_default_warn_repeat_every; + rc->crit_repeat_every = host->health_default_crit_repeat_every; if(rrdvar_fix_name(rc->name)) error("Health configuration renamed alarm '%s' to '%s'", value, rc->name); @@ -556,6 +568,8 @@ static int health_readfile(const char *filename, void *data) { rt->green = NAN; rt->red = NAN; rt->delay_multiplier = 1.0; + rt->warn_repeat_every = host->health_default_warn_repeat_every; + rt->crit_repeat_every = host->health_default_crit_repeat_every; if(rrdvar_fix_name(rt->name)) error("Health configuration renamed template '%s' to '%s'", value, rt->name); @@ -612,7 +626,7 @@ static int health_readfile(const char *filename, void *data) { &rc->options, &rc->dimensions); } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - if(!health_parse_duration(value, &rc->update_every)) + if(!config_parse_duration(value, &rc->update_every)) error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.", line, filename, rc->name, key, value); } @@ -707,6 +721,11 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { rc->options |= health_parse_options(value); } + else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ + health_parse_repeat(line, filename, value, + &rc->warn_repeat_every, + &rc->crit_repeat_every); + } else { error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.", line, filename, rc->name, key); @@ -736,7 +755,7 @@ static int health_readfile(const char *filename, void *data) { &rt->update_every, &rt->options, &rt->dimensions); } else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) { - if(!health_parse_duration(value, &rt->update_every)) + if(!config_parse_duration(value, &rt->update_every)) error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.", line, filename, rt->name, key, value); } @@ -831,6 +850,11 @@ static int health_readfile(const char *filename, void *data) { else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) { rt->options |= health_parse_options(value); } + else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){ + health_parse_repeat(line, filename, value, + &rt->warn_repeat_every, + &rt->crit_repeat_every); + } else { error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.", line, filename, rt->name, key); diff --git a/health/health_json.c b/health/health_json.c index 781132447..e923b05c6 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -140,6 +140,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC "\t\t\t\"delay_multiplier\": %f,\n" "\t\t\t\"delay\": %d,\n" "\t\t\t\"delay_up_to_timestamp\": %lu,\n" + "\t\t\t\"warn_repeat_every\": \"%u\",\n" + "\t\t\t\"crit_repeat_every\": \"%u\",\n" "\t\t\t\"value_string\": \"%s\",\n" , rc->chart, rc->name , (unsigned long)rc->id @@ -165,6 +167,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC , rc->delay_multiplier , rc->delay_last , (unsigned long)rc->delay_up_to_timestamp + , rc->warn_repeat_every + , rc->crit_repeat_every , value_string ); diff --git a/health/health_log.c b/health/health_log.c index 009e42673..c91cde6cb 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -79,6 +79,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { "\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s" "\t%d\t%d\t%d\t%d" "\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO + "\t%016lx" "\n" , (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A' , host->hostname @@ -112,6 +113,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) { , ae->new_value , ae->old_value + , (uint64_t)ae->last_repeat ) < 0)) error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename); else { @@ -174,10 +176,40 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena continue; } + // Check if we got last_repeat field + time_t last_repeat = 0; + if(entries > 27) { + char* alarm_name = pointers[13]; + last_repeat = (time_t)strtoul(pointers[27], NULL, 16); + + RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); + if (!rc) { + for(rc = host->alarms; rc ; rc = rc->next) { + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc); + if(rdcmp != rc) { + error("Cannot insert the alarm index ID using log %s", rc->name); + } + } + + rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); + } + + if(unlikely(rc)) { + if (rrdcalc_isrepeating(rc)) { + rc->last_repeat = last_repeat; + // We iterate through repeating alarm entries only to + // find the latest last_repeat timestamp. Otherwise, + // there is no need to keep them in memory. + continue; + } + } + } + if(unlikely(*pointers[0] == 'A')) { // make sure it is properly numbered if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) { - error("HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it.", host->hostname, line, filename, unique_id); + error( "HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it." + , host->hostname, line, filename, unique_id); errored++; continue; } @@ -186,11 +218,11 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena } else if(unlikely(*pointers[0] == 'U')) { // find the original - for(ae = host->health_log.alarms; ae; ae = ae->next) { + for(ae = host->health_log.alarms; ae ; ae = ae->next) { if(unlikely(unique_id == ae->unique_id)) { if(unlikely(*pointers[0] == 'A')) { error("HEALTH [%s]: line %zu of file '%s' adds duplicate alarm log entry %u. Using the later." - , host->hostname, line, filename, unique_id); + , host->hostname, line, filename, unique_id); *pointers[0] = 'U'; duplicate++; } @@ -270,6 +302,8 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena ae->new_value = str2l(pointers[25]); ae->old_value = str2l(pointers[26]); + ae->last_repeat = last_repeat; + char value_string[100 + 1]; freez(ae->old_value_string); freez(ae->new_value_string); @@ -339,7 +373,7 @@ inline void health_alarm_log_load(RRDHOST *host) { // ---------------------------------------------------------------------------- // health alarm log management -inline void health_alarm_log( +inline ALARM_ENTRY* health_create_alarm_entry( RRDHOST *host, uint32_t alarm_id, uint32_t alarm_event_id, @@ -398,9 +432,24 @@ inline void health_alarm_log( ae->delay_up_to_timestamp = when + delay; ae->flags |= flags; + ae->last_repeat = 0; + if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL) ae->non_clear_duration += ae->duration; + return ae; +} + +inline void health_alarm_log( + RRDHOST *host, + ALARM_ENTRY *ae +) { + debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id); + + if(unlikely(alarm_entry_isrepeating(host, ae))) { + error("Repeating alarms cannot be added to host's alarm log entries. It seems somewhere in the logic, API is being misused. Alarm id: %u", ae->alarm_id); + return; + } // link it netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock); ae->next = host->health_log.alarms; diff --git a/health/notifications/README.md b/health/notifications/README.md index 5b7b43406..8c7ab66f7 100644 --- a/health/notifications/README.md +++ b/health/notifications/README.md @@ -58,6 +58,9 @@ export NETDATA_ALARM_NOTIFY_DEBUG=1 # send test alarms to any role /usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE" ``` + +Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`). You can always find the location of the alarm-notify.sh script in `netdata.conf`. + If you need to dig even deeper, you can trace the execution with `bash -x`. Note that in test mode, alarm-notify.sh calls itself with many more arguments. So first do ```sh bash -x /usr/libexec/netdata/plugins.d/alarm-notify.sh test diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index ff4b3f3dc..852718bc9 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -189,6 +189,7 @@ fi [ -z "${NETDATA_STOCK_CONFIG_DIR}" ] && NETDATA_STOCK_CONFIG_DIR="@libconfigdir_POST@" [ -z "${NETDATA_CACHE_DIR}" ] && NETDATA_CACHE_DIR="@cachedir_POST@" [ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io" +[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://netdata.cloud" # ----------------------------------------------------------------------------- # parse command line parameters @@ -681,7 +682,7 @@ date=$(date --date=@${when} "${date_format}" 2>/dev/null) # ---------------------------------------------------------------------------- # prepare some extra headers if we've been asked to thread e-mails if [ "${SEND_EMAIL}" == "YES" ] && [ "${EMAIL_THREADING}" != "NO" ]; then - email_thread_headers="In-Reply-To: <${chart}-${name}@${host}>\\nReferences: <${chart}-${name}@${host}>" + email_thread_headers="In-Reply-To: <${chart}-${name}@${host}>\\r\\nReferences: <${chart}-${name}@${host}>" else email_thread_headers= fi @@ -1790,7 +1791,7 @@ if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then NETDATA_REGISTRY_UNIQUE_ID="$(cat "@registrydir_POST@/netdata.public.unique.id")" fi fi - if [ ! -z "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then + if [ -n "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then GOTOCLOUD=1 fi fi @@ -1798,7 +1799,7 @@ fi if [ ${GOTOCLOUD} -eq 0 ]; then goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}" else - goto_url="https://netdata.cloud/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" + goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}" fi # the severity of the alarm @@ -1953,7 +1954,7 @@ send_pushbullet "${PUSHBULLET_ACCESS_TOKEN}" "${PUSHBULLET_SOURCE_DEVICE}" "${to Severity: ${severity}\\n Chart: ${chart}\\n Family: ${family}\\n -$(date -d @${when})\\n +${date}\\n The source of this alarm is line ${src}" SENT_PUSHBULLET=$? diff --git a/health/notifications/custom/README.md b/health/notifications/custom/README.md index 627dd9d48..eeaad8a60 100644 --- a/health/notifications/custom/README.md +++ b/health/notifications/custom/README.md @@ -1,11 +1,13 @@ # Custom -Netdata allows you to send custom notifications, to any endpoint you choose. -To configure custom notifications, you will need to define the `custom_sender()` function in `health_alarm_notify.conf` -You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples. +Netdata allows you to send custom notifications to any endpoint you choose. + +To configure custom notifications, you will need to customize `health_alarm_notify.conf`. You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples of how to modify the `custom_sender()` function in `health_alarm_notify.conf`. Ensure you follow the instructions of changing any configuration file to [persist your configuration](../../../docs/configuration-guide.md#persist-my-configuration). + As with other notifications, you will also need to define the recipient list in `DEFAULT_RECIPIENT_CUSTOM` and/or the `role_recipients_custom` array. -The following is a sample `custom_sender` function to send an SMS via an imaginary HTTPS endpoint to the SMS gateway: +The following is a sample `custom_sender` function in `health_alarm_notify.conf`, to send an SMS via an imaginary HTTPS endpoint to the SMS gateway: + ``` custom_sender() { # example human readable SMS @@ -37,45 +39,45 @@ The following is a sample `custom_sender` function to send an SMS via an imagina Variables available to the custom_sender: - - ${to_custom} the list of recipients for the alarm - - ${host} the host generated this event - - ${url_host} same as ${host} but URL encoded - - ${unique_id} the unique id of this event - - ${alarm_id} the unique id of the alarm that generated this event - - ${event_id} the incremental id of the event, for this alarm id - - ${when} the timestamp this event occurred - - ${name} the name of the alarm, as given in netdata health.d entries - - ${url_name} same as ${name} but URL encoded - - ${chart} the name of the chart (type.id) - - ${url_chart} same as ${chart} but URL encoded - - ${family} the family of the chart - - ${url_family} same as ${family} but URL encoded - - ${status} the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - - ${old_status} the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL - - ${value} the current value of the alarm - - ${old_value} the previous value of the alarm - - ${src} the line number and file the alarm has been configured - - ${duration} the duration in seconds of the previous alarm state - - ${duration_txt} same as ${duration} for humans - - ${non_clear_duration} the total duration in seconds this is/was non-clear - - ${non_clear_duration_txt} same as ${non_clear_duration} for humans - - ${units} the units of the value - - ${info} a short description of the alarm - - ${value_string} friendly value (with units) - - ${old_value_string} friendly old value (with units) - - ${image} the URL of an image to represent the status of the alarm - - ${color} a color in #AABBCC format for the alarm - - ${goto_url} the URL the user can click to see the netdata dashboard - - ${calc_expression} the expression evaluated to provide the value for the alarm - - ${calc_param_values} the value of the variables in the evaluated expression - - ${total_warnings} the total number of alarms in WARNING state on the host - - ${total_critical} the total number of alarms in CRITICAL state on the host + - `${to_custom}` the list of recipients for the alarm + - `${host}` the host generated this event + - `${url_host}` same as `${host}` but URL encoded + - `${unique_id}` the unique id of this event + - `${alarm_id}` the unique id of the alarm that generated this event + - `${event_id}` the incremental id of the event, for this alarm id + - `${when}` the timestamp this event occurred + - `${name}` the name of the alarm, as given in netdata health.d entries + - `${url_name}` same as `${name}` but URL encoded + - `${chart}` the name of the chart (type.id) + - `${url_chart}` same as `${chart}` but URL encoded + - `${family}` the family of the chart + - `${url_family}` same as `${family}` but URL encoded + - `${status}` the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL + - `${old_status}` the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL + - `${value}` the current value of the alarm + - `${old_value}` the previous value of the alarm + - `${src}` the line number and file the alarm has been configured + - `${duration}` the duration in seconds of the previous alarm state + - `${duration_txt}` same as `${duration}` for humans + - `${non_clear_duration}` the total duration in seconds this is/was non-clear + - `${non_clear_duration_txt}` same as `${non_clear_duration}` for humans + - `${units}` the units of the value + - `${info}` a short description of the alarm + - `${value_string}` friendly value (with units) + - `${old_value_string}` friendly old value (with units) + - `${image}` the URL of an image to represent the status of the alarm + - `${color}` a color in #AABBCC format for the alarm + - `${goto_url}` the URL the user can click to see the netdata dashboard + - `${calc_expression}` the expression evaluated to provide the value for the alarm + - `${calc_param_values}` the value of the variables in the evaluated expression + - `${total_warnings}` the total number of alarms in WARNING state on the host + - `${total_critical}` the total number of alarms in CRITICAL state on the host The following are more human friendly: - - ${alarm} like "name = value units" - - ${status_message} like "needs attention", "recovered", "is critical" - - ${severity} like "Escalated to CRITICAL", "Recovered from WARNING" - - ${raised_for} like "(alarm was raised for 10 minutes)" + - `${alarm}` like "name = value units" + - `${status_message}` like "needs attention", "recovered", "is critical" + - `${severity}` like "Escalated to CRITICAL", "Recovered from WARNING" + - `${raised_for}` like "(alarm was raised for 10 minutes)" [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fcustom%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md index 163839b6b..84a9e0ce7 100644 --- a/health/notifications/email/README.md +++ b/health/notifications/email/README.md @@ -30,4 +30,6 @@ sudo su -s /bin/bash netdata Where `[ROLE]` is the role you want to test. The default (if you don't give a `[ROLE]`) is `sysadmin`. +Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`). You can always find the location of the alarm-notify.sh script in `netdata.conf`. + [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Femail%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]() |