summaryrefslogtreecommitdiffstats
path: root/health
diff options
context:
space:
mode:
Diffstat (limited to 'health')
-rw-r--r--health/Makefile.am6
-rw-r--r--health/README.md22
-rw-r--r--health/health.c181
-rw-r--r--health/health.d/dbengine.conf26
-rw-r--r--health/health.d/disks.conf4
-rw-r--r--health/health.d/dnsmasq_dhcp.conf12
-rw-r--r--health/health.d/pihole.conf67
-rw-r--r--health/health.d/processes.conf27
-rw-r--r--health/health.d/ram.conf2
-rw-r--r--health/health.d/riakkv.conf80
-rw-r--r--health/health.d/wmi.conf130
-rw-r--r--health/health.d/x509check.conf14
-rw-r--r--health/health.h46
-rw-r--r--health/health_config.c138
-rw-r--r--health/health_json.c4
-rw-r--r--health/health_log.c57
-rw-r--r--health/notifications/README.md3
-rwxr-xr-xhealth/notifications/alarm-notify.sh.in9
-rw-r--r--health/notifications/custom/README.md84
-rw-r--r--health/notifications/email/README.md2
20 files changed, 731 insertions, 183 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index 62a4c6d3..5310bd8a 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -35,6 +35,7 @@ dist_healthconfig_DATA = \
health.d/cpu.conf \
health.d/couchdb.conf \
health.d/disks.conf \
+ health.d/dnsmasq_dhcp.conf \
health.d/dockerd.conf \
health.d/elasticsearch.conf \
health.d/entropy.conf \
@@ -62,13 +63,16 @@ dist_healthconfig_DATA = \
health.d/netfilter.conf \
health.d/nginx.conf \
health.d/nginx_plus.conf \
+ health.d/pihole.conf \
health.d/phpfpm.conf \
health.d/portcheck.conf \
health.d/postgres.conf \
+ health.d/processes.conf \
health.d/qos.conf \
health.d/ram.conf \
health.d/redis.conf \
health.d/retroshare.conf \
+ health.d/riakkv.conf \
health.d/softnet.conf \
health.d/squid.conf \
health.d/stiebeleltron.conf \
@@ -81,6 +85,8 @@ dist_healthconfig_DATA = \
health.d/udp_errors.conf \
health.d/varnish.conf \
health.d/web_log.conf \
+ health.d/wmi.conf \
health.d/x509check.conf \
health.d/zfs.conf \
+ health.d/dbengine.conf \
$(NULL)
diff --git a/health/README.md b/health/README.md
index 54f6a3e1..81cc043d 100644
--- a/health/README.md
+++ b/health/README.md
@@ -11,7 +11,6 @@ packet dropped).
Netdata also supports alarm **templates**, so that an alarm can be attached to all the charts of the same context (i.e. all network interfaces, or all disks, or all mysql servers, etc.).
-
Each alarm can execute a single query to the database using statistical algorithms against past data,
but alarms can be combined. So, if you need 2 queries in the database, you can combine
2 alarms together (both will run a query to the database, and the results can be combined).
@@ -342,6 +341,24 @@ delay: [[[up U] [down D] multiplier M] max X]
their matching one) and a delay is in place.
- All are reset to their defaults when the alarm switches state without a delay in place.
+---
+
+#### Alarm line `repeat`
+
+Defines the interval between repeating notifications for the alarms in CRITICAL or WARNING mode. This will override the default interval settings inherited from health settings in `netdata.conf`. The default settings for repeating notifications are `default repeat warning = DURATION` and `default repeat critical = DURATION` which can be found in health stock configuration.
+
+Format:
+
+```
+repeat: [off] [warning DURATION] [critical DURATION]
+```
+
+* `off`: Turns off the repeating feature for the current alarm. This is effective when the default repeat settings have been enabled in the health configuration.
+* `warning DURATION`: Defines the interval when the alarm is in WARNING state. Use `0s` to turn off the repeating notification for WARNING mode.
+* `critical DURATION`: Defines the interval when the alarm is in CRITICAL state. Use `0s` to turn off the repeating notification for CRITICAL mode.
+
+---
+
#### Alarm line `option`
The only possible value for the `option` line is
@@ -567,12 +584,15 @@ template: disk_full_percent
every: 1m
warn: $this > 80
crit: $this > 95
+ repeat: warning 120s critical 10s
```
`$used` and `$avail` are the `used` and `avail` chart dimensions as shown on the dashboard.
So, the `calc` line finds the percentage of used space. `$this` resolves to this percentage.
+This is a repeating alarm and if the alarm becomes CRITICAL it repeats the notifications every 10 seconds. It also repeats notifications every 2 minutes if the alarm goes into WARNING mode.
+
### Example 3
Predict if any disk will run out of space in the near future.
diff --git a/health/health.c b/health/health.c
index f92a1ba6..55bd7284 100644
--- a/health/health.c
+++ b/health/health.c
@@ -13,18 +13,74 @@ unsigned int default_health_enabled = 1;
// ----------------------------------------------------------------------------
// health initialization
+/**
+ * User Config directory
+ *
+ * Get the config directory for health and return it.
+ *
+ * @return a pointer to the user config directory
+ */
inline char *health_user_config_dir(void) {
char buffer[FILENAME_MAX + 1];
snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_user_config_dir);
return config_get(CONFIG_SECTION_HEALTH, "health configuration directory", buffer);
}
+/**
+ * Stock Config Directory
+ *
+ * Get the Stock config directory and return it.
+ *
+ * @return a pointer to the stock config directory.
+ */
inline char *health_stock_config_dir(void) {
char buffer[FILENAME_MAX + 1];
snprintfz(buffer, FILENAME_MAX, "%s/health.d", netdata_configured_stock_config_dir);
return config_get(CONFIG_SECTION_HEALTH, "stock health configuration directory", buffer);
}
+/**
+ * Silencers init
+ *
+ * Function used to initialize the silencer structure.
+ */
+void health_silencers_init(void) {
+ struct stat statbuf;
+ if (!stat(silencers_filename,&statbuf)) {
+ off_t length = statbuf.st_size;
+ if (length && length < HEALTH_SILENCERS_MAX_FILE_LEN) {
+ FILE *fd = fopen(silencers_filename, "r");
+ if (fd) {
+ char *str = mallocz((length+1)* sizeof(char));
+ if(str) {
+ size_t copied;
+ copied = fread(str, sizeof(char), length, fd);
+ if (copied == (length* sizeof(char))) {
+ str[length] = 0x00;
+ json_parse(str, NULL, health_silencers_json_read_callback);
+ info("Parsed health silencers file %s", silencers_filename);
+ } else {
+ error("Cannot read the data from health silencers file %s", silencers_filename);
+ }
+ freez(str);
+ }
+ fclose(fd);
+ } else {
+ error("Cannot open the file %s",silencers_filename);
+ }
+ } else {
+ error("Health silencers file %s has the size %ld that is out of range[ 1 , %d ]. Aborting read.", silencers_filename, length, HEALTH_SILENCERS_MAX_FILE_LEN);
+ }
+ } else {
+ error("Cannot open the file %s",silencers_filename);
+ }
+}
+
+/**
+ * Health Init
+ *
+ * Initialize the health thread.
+ */
void health_init(void) {
debug(D_HEALTH, "Health configuration initializing");
@@ -32,11 +88,20 @@ void health_init(void) {
debug(D_HEALTH, "Health is disabled.");
return;
}
+
+ health_silencers_init();
}
// ----------------------------------------------------------------------------
// re-load health configuration
+/**
+ * Reload host
+ *
+ * Reload configuration for a specific host.
+ *
+ * @param host the host structure whose configuration will be reloaded.
+ */
void health_reload_host(RRDHOST *host) {
if(unlikely(!host->health_enabled))
return;
@@ -84,6 +149,11 @@ void health_reload_host(RRDHOST *host) {
rrdhost_unlock(host);
}
+/**
+ * Reload
+ *
+ * Reload the host configuration for all hosts.
+ */
void health_reload(void) {
rrd_rdlock();
@@ -255,17 +325,18 @@ static inline void health_alarm_log_process(RRDHOST *host) {
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
ALARM_ENTRY *ae;
- for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id ; ae = ae->next) {
- if(unlikely(
- !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
- !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
+ for(ae = host->health_log.alarms; ae && ae->unique_id >= host->health_last_processed_id; ae = ae->next) {
+ if(likely(!alarm_entry_isrepeating(host, ae))) {
+ if(unlikely(
+ !(ae->flags & HEALTH_ENTRY_FLAG_PROCESSED) &&
+ !(ae->flags & HEALTH_ENTRY_FLAG_UPDATED)
)) {
+ if(unlikely(ae->unique_id < first_waiting))
+ first_waiting = ae->unique_id;
- if(unlikely(ae->unique_id < first_waiting))
- first_waiting = ae->unique_id;
-
- if(likely(now >= ae->delay_up_to_timestamp))
- health_process_notifications(host, ae);
+ if(likely(now >= ae->delay_up_to_timestamp))
+ health_process_notifications(host, ae);
+ }
}
}
@@ -294,10 +365,12 @@ static inline void health_alarm_log_process(RRDHOST *host) {
ALARM_ENTRY *t = ae->next;
- health_alarm_log_free_one_nochecks_nounlink(ae);
+ if(likely(!alarm_entry_isrepeating(host, ae))) {
+ health_alarm_log_free_one_nochecks_nounlink(ae);
+ host->health_log.count--;
+ }
ae = t;
- host->health_log.count--;
}
netdata_rwlock_unlock(&host->health_log.alarm_log_rwlock);
@@ -411,7 +484,7 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
debug(D_HEALTH, "Alarm %s matched a silence entry, but no SILENCE or DISABLE command was issued via the command API. The match has no effect.", rc->name);
} else {
debug(D_HEALTH, "Alarm %s via the command API - name:%s context:%s chart:%s host:%s family:%s"
- , (silencers->stype==STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
+ , (silencers->stype == STYPE_DISABLE_ALARMS)?"Disabled":"Silenced"
, rc->name
, (rc->rrdset)?rc->rrdset->context:""
, rc->chart
@@ -425,6 +498,16 @@ SILENCE_TYPE check_silenced(RRDCALC *rc, char* host, SILENCERS *silencers) {
return STYPE_NONE;
}
+/**
+ * Update Disabled Silenced
+ *
+ * Update the variable rrdcalc_flags of the structure RRDCALC according to the values of the host structure
+ *
+ * @param host structure that contains information about the host monitored.
+ * @param rc structure with information about the alarm
+ *
+ * @return It returns 1 if rrdcalc_flags is DISABLED, or 0 otherwise
+ */
int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
uint32_t rrdcalc_flags_old = rc->rrdcalc_flags;
// Clear the flags
@@ -454,6 +537,15 @@ int update_disabled_silenced(RRDHOST *host, RRDCALC *rc) {
return 0;
}
+/**
+ * Health Main
+ *
+ * The main thread of the health system. In this function all the alarms will be processed.
+ *
+ * @param ptr is a pointer to the netdata_static_thread structure.
+ *
+ * @return It always returns NULL
+ */
void *health_main(void *ptr) {
netdata_thread_cleanup_push(health_main_cleanup, ptr);
@@ -464,12 +556,6 @@ void *health_main(void *ptr) {
time_t hibernation_delay = config_get_number(CONFIG_SECTION_HEALTH, "postpone alarms during hibernation for seconds", 60);
unsigned int loop = 0;
-
- silencers = mallocz(sizeof(SILENCERS));
- silencers->all_alarms=0;
- silencers->stype=STYPE_NONE;
- silencers->silencers=NULL;
-
while(!netdata_exit) {
loop++;
debug(D_HEALTH, "Health monitoring iteration no %u started", loop);
@@ -756,20 +842,22 @@ void *health_main(void *ptr) {
rc->delay_last = delay;
rc->delay_up_to_timestamp = now + delay;
- health_alarm_log(
- host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
- rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
- rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
- rc->delay_last,
- (
- ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
- ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
- )
-
- );
-
- rc->last_status_change = now;
- rc->status = status;
+ if(likely(!rrdcalc_isrepeating(rc))) {
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
+ rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->old_value, rc->value, rc->status, status, rc->source, rc->units, rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ )
+ );
+ health_alarm_log(host, ae);
+ }
+ rc->last_status_change = now;
+ rc->old_status = rc->status;
+ rc->status = status;
}
rc->last_updated = now;
@@ -779,6 +867,35 @@ void *health_main(void *ptr) {
next_run = rc->next_update;
}
+ // process repeating alarms
+ RRDCALC *rc;
+ for(rc = host->alarms; rc ; rc = rc->next) {
+ int repeat_every = 0;
+ if(unlikely(rrdcalc_isrepeating(rc))) {
+ if(unlikely(rc->status == RRDCALC_STATUS_WARNING))
+ repeat_every = rc->warn_repeat_every;
+ else if(unlikely(rc->status == RRDCALC_STATUS_CRITICAL))
+ repeat_every = rc->crit_repeat_every;
+ }
+ if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
+ rc->last_repeat = now;
+ ALARM_ENTRY *ae = health_create_alarm_entry(
+ host, rc->id, rc->next_event_id++, now, rc->name, rc->rrdset->id,
+ rc->rrdset->family, rc->exec, rc->recipient, now - rc->last_status_change,
+ rc->old_value, rc->value, rc->old_status, rc->status, rc->source, rc->units, rc->info,
+ rc->delay_last,
+ (
+ ((rc->options & RRDCALC_FLAG_NO_CLEAR_NOTIFICATION)? HEALTH_ENTRY_FLAG_NO_CLEAR_NOTIFICATION : 0) |
+ ((rc->rrdcalc_flags & RRDCALC_FLAG_SILENCED)? HEALTH_ENTRY_FLAG_SILENCED : 0)
+ )
+ );
+ ae->last_repeat = rc->last_repeat;
+ health_process_notifications(host, ae);
+ debug(D_HEALTH, "Notification sent for the repeating alarm %u.", ae->alarm_id);
+ health_alarm_log_free_one_nochecks_nounlink(ae);
+ }
+ }
+
rrdhost_unlock(host);
}
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
new file mode 100644
index 00000000..7a623ba2
--- /dev/null
+++ b/health/health.d/dbengine.conf
@@ -0,0 +1,26 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: 10min_dbengine_global_fs_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+ lookup: sum -10m unaligned of FS errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_io_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+ lookup: sum -10m unaligned of I/O errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of IO errors dbengine came across the last 10 minutes (out of space, bad disk etc)
+ to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 26f85848..9c194ced 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -13,7 +13,7 @@ template: disk_space_usage
on: disk.space
os: linux freebsd
hosts: *
-families: *
+families: !/dev !/dev/* !/run !/run/* *
calc: $used * 100 / ($avail + $used)
units: %
every: 1m
@@ -27,7 +27,7 @@ template: disk_inode_usage
on: disk.inodes
os: linux freebsd
hosts: *
-families: *
+families: !/dev !/dev/* !/run !/run/* *
calc: $used * 100 / ($avail + $used)
units: %
every: 1m
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
new file mode 100644
index 00000000..b7eb4e0a
--- /dev/null
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -0,0 +1,12 @@
+ # dhcp-range utilization
+
+ template: dnsmasq_dhcp_dhcp_range_utilization
+ on: dnsmasq_dhcp.dhcp_range_utilization
+ every: 10s
+ units: %
+ calc: $used
+ warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
+ crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+ delay: down 5m
+ info: dhcp-range utilization above threshold!
+ to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
new file mode 100644
index 00000000..4a121723
--- /dev/null
+++ b/health/health.d/pihole.conf
@@ -0,0 +1,67 @@
+
+ # Make sure Pi-hole is responding.
+
+template: pihole_last_collected_secs
+ on: pihole.dns_queries_total
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+ # Blocked DNS queries.
+
+ template: pihole_blocked_queries
+ on: pihole.dns_queries_percentage
+ every: 10s
+ units: %
+ calc: $blocked
+ warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+ crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+ delay: up 2m down 5m
+ info: percentage of blocked dns queries for the last 24 hour
+ to: sysadmin
+
+
+ # Blocklist last update time.
+ # Default update interval is a week.
+
+ template: pihole_blocklist_last_update
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: seconds
+ calc: $ago
+ warn: $this > 60 * 60 * 24 * 8
+ crit: $this > 60 * 60 * 24 * 8 * 2
+ info: blocklist last update time
+ to: sysadmin
+
+
+ # Gravity file check (gravity.list).
+
+ template: pihole_blocklist_gravity_file
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: boolean
+ calc: $file_exists
+ crit: $this != 1
+ delay: up 2m down 5m
+ info: gravity file existence
+ to: sysadmin
+
+
+ # Pi-hole's ability to block unwanted domains.
+ # Should be enabled. The whole point of Pi-hole!
+
+ template: pihole_status
+ on: pihole.unwanted_domains_blocking_status
+ every: 10s
+ units: boolean
+ calc: $enabled
+ warn: $this != 1
+ delay: up 2m down 5m
+ info: unwanted domains blocking status
+ to: sysadmin
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
new file mode 100644
index 00000000..d96998fd
--- /dev/null
+++ b/health/health.d/processes.conf
@@ -0,0 +1,27 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: active_processes_limit_freebsd
+ on: system.active_processes
+ os: freebsd
+ hosts: *
+ calc: $active
+ units: processes
+ every: 5s
+ warn: $this > (($status >= $WARNING) ? (75000) : (80000))
+ crit: $this > (($status == $CRITICAL) ? (85000) : (90000))
+ delay: down 5m multiplier 1.5 max 1h
+ info: the number of active processes
+ to: sysadmin
+
+ alarm: active_processes_limit
+ on: system.active_processes
+ os: linux
+ hosts: *
+ calc: $active
+ units: processes
+ every: 5s
+ warn: $this > (($status >= $WARNING) ? (25000) : (26000))
+ crit: $this > (($status == $CRITICAL) ? (28000) : (30000))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of active processes
+ to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 93883f73..4e41bb49 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -27,7 +27,7 @@
on: mem.available
os: linux
hosts: *
- calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
new file mode 100644
index 00000000..74530277
--- /dev/null
+++ b/health/health.d/riakkv.conf
@@ -0,0 +1,80 @@
+# Ensure that Riak is running.
+template: riak_last_collected_secs
+ on: riak.kv.throughput
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
+
+# Warn if a list keys operation is running.
+template: riak_list_keys_active
+ on: riak.core.fsm_active
+ calc: $list_fsm_active
+ units: state machines
+ every: 10s
+ warn: $list_fsm_active > 0
+ info: number of currently running list keys finite state machines
+ to: dba
+
+
+## Timing healthchecks
+# KV GET
+template: 1h_kv_get_mean_latency
+ on: riak.kv.latency.get
+ calc: $node_get_fsm_time_mean
+ lookup: average -1h unaligned of time
+ every: 30s
+ units: ms
+ info: mean average KV GET latency over the last hour
+
+template: riak_kv_get_slow
+ on: riak.kv.latency.get
+ calc: $mean
+ lookup: average -3m unaligned of time
+ units: ms
+ every: 10s
+ warn: ($this > ($1h_kv_get_mean_latency * 2) )
+ crit: ($this > ($1h_kv_get_mean_latency * 3) )
+ info: average KV GET time over the last 3 minutes, compared to the average over the last hour
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+# KV PUT
+template: 1h_kv_put_mean_latency
+ on: riak.kv.latency.put
+ calc: $node_put_fsm_time_mean
+ lookup: average -1h unaligned of time
+ every: 30s
+ units: ms
+ info: mean average KV PUT latency over the last hour
+
+template: riak_kv_put_slow
+ on: riak.kv.latency.put
+ calc: $mean
+ lookup: average -3m unaligned of time
+ units: ms
+ every: 10s
+ warn: ($this > ($1h_kv_put_mean_latency * 2) )
+ crit: ($this > ($1h_kv_put_mean_latency * 3) )
+ info: average KV PUT time over the last 3 minutes, compared to the average over the last hour
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+
+## VM healthchecks
+
+# Default Erlang VM process limit: 262144
+# On systems observed, this is < 2000, but may grow depending on load.
+template: riak_vm_high_process_count
+ on: riak.vm
+ calc: $sys_process_count
+ units: processes
+ every: 10s
+ warn: $this > 10000
+ crit: $this > 100000
+ info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144)
+ to: dba
diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf
new file mode 100644
index 00000000..0441fc1f
--- /dev/null
+++ b/health/health.d/wmi.conf
@@ -0,0 +1,130 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+## Availability
+
+template: wmi_last_collected_secs
+ on: cpu.collector_duration
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+## CPU
+
+template: wmi_10min_cpu_usage
+ on: wmi.cpu_utilization_total
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: cpu utilization for the last 10 minutes
+ to: sysadmin
+
+
+## Memory
+
+template: wmi_ram_in_use
+ on: wmi.memory_utilization
+ os: linux
+ hosts: *
+ calc: ($used) * 100 / ($used + $available)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used RAM
+ to: sysadmin
+
+template: wmi_swap_in_use
+ on: wmi.memory_swap_utilization
+ os: linux
+ hosts: *
+ calc: ($used) * 100 / ($used + $available)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used Swap
+ to: sysadmin
+
+
+## Network
+
+template: inbound_packets_discarded
+ on: wmi.net_discarded
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of inbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound discarded packets in the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_discarded
+ on: wmi.net_discarded
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of outbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound discarded packets in the last 10 minutes
+ to: sysadmin
+
+template: inbound_packets_errors
+ on: wmi.net_errors
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of inbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound errors in the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_errors
+ on: wmi.net_errors
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of outbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound errors in the last 10 minutes
+ to: sysadmin
+
+
+## Disk
+
+template: wmi_disk_in_use
+ on: wmi.logical_disk_utilization
+ os: linux
+ hosts: *
+ calc: ($used) * 100 / ($used + $free)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used disk space
+ to: sysadmin
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
index dc0e6c69..a56f48fc 100644
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@@ -1,4 +1,18 @@
+# make sure x509check is running
+
+template: x509check_last_collected_secs
+ on: x509check.time_until_expiration
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 60s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+
template: x509check_days_until_expiration
on: x509check.time_until_expiration
calc: $expiry
diff --git a/health/health.h b/health/health.h
index 1511f364..6920d12d 100644
--- a/health/health.h
+++ b/health/health.h
@@ -35,16 +35,7 @@ extern unsigned int default_health_enabled;
#define HEALTH_LISTEN_BACKLOG 4096
#endif
-#define HEALTH_ALARM_KEY "alarm"
-#define HEALTH_TEMPLATE_KEY "template"
#define HEALTH_ON_KEY "on"
-#define HEALTH_CONTEXT_KEY "context"
-#define HEALTH_CHART_KEY "chart"
-#define HEALTH_HOST_KEY "hosts"
-#define HEALTH_OS_KEY "os"
-#define HEALTH_FAMILIES_KEY "families"
-#define HEALTH_LOOKUP_KEY "lookup"
-#define HEALTH_CALC_KEY "calc"
#define HEALTH_EVERY_KEY "every"
#define HEALTH_GREEN_KEY "green"
#define HEALTH_RED_KEY "red"
@@ -57,38 +48,9 @@ extern unsigned int default_health_enabled;
#define HEALTH_DELAY_KEY "delay"
#define HEALTH_OPTIONS_KEY "options"
-typedef struct silencer {
- char *alarms;
- SIMPLE_PATTERN *alarms_pattern;
+#define HEALTH_SILENCERS_MAX_FILE_LEN 10000
- char *hosts;
- SIMPLE_PATTERN *hosts_pattern;
-
- char *contexts;
- SIMPLE_PATTERN *contexts_pattern;
-
- char *charts;
- SIMPLE_PATTERN *charts_pattern;
-
- char *families;
- SIMPLE_PATTERN *families_pattern;
-
- struct silencer *next;
-} SILENCER;
-
-typedef enum silence_type {
- STYPE_NONE,
- STYPE_DISABLE_ALARMS,
- STYPE_SILENCE_NOTIFICATIONS
-} SILENCE_TYPE;
-
-typedef struct silencers {
- int all_alarms;
- SILENCE_TYPE stype;
- SILENCER *silencers;
-} SILENCERS;
-
-SILENCERS *silencers;
+char *silencers_filename;
extern void health_init(void);
extern void *health_main(void *ptr);
@@ -108,7 +70,7 @@ extern void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae);
extern ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filename);
extern void health_alarm_log_load(RRDHOST *host);
-extern void health_alarm_log(
+extern ALARM_ENTRY* health_create_alarm_entry(
RRDHOST *host,
uint32_t alarm_id,
uint32_t alarm_event_id,
@@ -129,6 +91,8 @@ extern void health_alarm_log(
int delay,
uint32_t flags);
+extern void health_alarm_log(RRDHOST *host, ALARM_ENTRY *ae);
+
extern void health_readdir(RRDHOST *host, const char *user_path, const char *stock_path, const char *subpath);
extern char *health_user_config_dir(void);
extern char *health_stock_config_dir(void);
diff --git a/health/health_config.c b/health/health_config.c
index 35fde90b..0d6e77a9 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -23,6 +23,7 @@
#define HEALTH_INFO_KEY "info"
#define HEALTH_DELAY_KEY "delay"
#define HEALTH_OPTIONS_KEY "options"
+#define HEALTH_REPEAT_KEY "repeat"
static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
if(!rc->chart) {
@@ -45,7 +46,7 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
rc->id = rrdcalc_get_unique_id(host, rc->chart, rc->name, &rc->next_event_id);
- debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
+ debug(D_HEALTH, "Health configuration adding alarm '%s.%s' (%u): exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
rc->chart?rc->chart:"NOCHART",
rc->name,
rc->id,
@@ -66,10 +67,12 @@ static inline int rrdcalc_add_alarm_from_config(RRDHOST *host, RRDCALC *rc) {
rc->delay_up_duration,
rc->delay_down_duration,
rc->delay_max_duration,
- rc->delay_multiplier
+ rc->delay_multiplier,
+ rc->warn_repeat_every,
+ rc->crit_repeat_every
);
- rrdcalc_create_part2(host, rc);
+ rrdcalc_add_to_host(host, rc);
return 1;
}
@@ -100,7 +103,7 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
}
}
- debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f",
+ debug(D_HEALTH, "Health configuration adding template '%s': context '%s', exec '%s', recipient '%s', green " CALCULATED_NUMBER_FORMAT_AUTO ", red " CALCULATED_NUMBER_FORMAT_AUTO ", lookup: group %d, after %d, before %d, options %u, dimensions '%s', update every %d, calculation '%s', warning '%s', critical '%s', source '%s', delay up %d, delay down %d, delay max %d, delay_multiplier %f, warn_repeat_every %u, crit_repeat_every %u",
rt->name,
(rt->context)?rt->context:"NONE",
(rt->exec)?rt->exec:"DEFAULT",
@@ -120,7 +123,9 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
rt->delay_up_duration,
rt->delay_down_duration,
rt->delay_max_duration,
- rt->delay_multiplier
+ rt->delay_multiplier,
+ rt->warn_repeat_every,
+ rt->crit_repeat_every
);
if(likely(last)) {
@@ -134,48 +139,6 @@ static inline int rrdcalctemplate_add_template_from_config(RRDHOST *host, RRDCAL
return 1;
}
-static inline int health_parse_duration(char *string, int *result) {
- // make sure it is a number
- if(!*string || !(isdigit(*string) || *string == '+' || *string == '-')) {
- *result = 0;
- return 0;
- }
-
- char *e = NULL;
- calculated_number n = str2ld(string, &e);
- if(e && *e) {
- switch (*e) {
- case 'Y':
- *result = (int) (n * 86400 * 365);
- break;
- case 'M':
- *result = (int) (n * 86400 * 30);
- break;
- case 'w':
- *result = (int) (n * 86400 * 7);
- break;
- case 'd':
- *result = (int) (n * 86400);
- break;
- case 'h':
- *result = (int) (n * 3600);
- break;
- case 'm':
- *result = (int) (n * 60);
- break;
-
- default:
- case 's':
- *result = (int) (n);
- break;
- }
- }
- else
- *result = (int)(n);
-
- return 1;
-}
-
static inline int health_parse_delay(
size_t line, const char *filename, char *string,
int *delay_up_duration,
@@ -202,14 +165,14 @@ static inline int health_parse_delay(
while(*s && isspace(*s)) *s++ = '\0';
if(!strcasecmp(key, "up")) {
- if (!health_parse_duration(value, delay_up_duration)) {
+ if (!config_parse_duration(value, delay_up_duration)) {
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
line, filename, value, key);
}
else given_up = 1;
}
else if(!strcasecmp(key, "down")) {
- if (!health_parse_duration(value, delay_down_duration)) {
+ if (!config_parse_duration(value, delay_down_duration)) {
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
line, filename, value, key);
}
@@ -224,7 +187,7 @@ static inline int health_parse_delay(
else given_multiplier = 1;
}
else if(!strcasecmp(key, "max")) {
- if (!health_parse_duration(value, delay_max_duration)) {
+ if (!config_parse_duration(value, delay_max_duration)) {
error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
line, filename, value, key);
}
@@ -285,6 +248,50 @@ static inline uint32_t health_parse_options(const char *s) {
return options;
}
+// Parse the value of a 'repeat' line of an alarm or template definition.
+//
+// The value is read as whitespace separated "keyword value" pairs and is
+// tokenized in place (separators are overwritten with NUL bytes):
+//
+//   off                 set both intervals to 0 and stop parsing
+//   warning DURATION    store the parsed DURATION in *warn_repeat_every
+//   critical DURATION   store the parsed DURATION in *crit_repeat_every
+//
+// Any other keyword is skipped together with the token that follows it.
+// A duration that cannot be parsed, or that is negative, is reported with
+// error() and the corresponding interval is left as the caller passed it.
+//
+// line, file          used only to build the error messages
+// string              the value to parse; modified in place
+// warn_repeat_every   in/out: repeat interval for the warning state
+// crit_repeat_every   in/out: repeat interval for the critical state
+//
+// Always returns 1.
+static inline int health_parse_repeat(
+        size_t line,
+        const char *file,
+        char *string,
+        uint32_t *warn_repeat_every,
+        uint32_t *crit_repeat_every
+) {
+
+    char *s = string;
+    while(*s) {
+        char *key = s;
+
+        // <ctype.h> functions require a value representable as unsigned char
+        while(*s && !isspace((unsigned char)*s)) s++;
+        while(*s && isspace((unsigned char)*s)) *s++ = '\0';
+
+        // the token was empty (the text started with whitespace)
+        if(!*key) break;
+
+        char *value = s;
+        while(*s && !isspace((unsigned char)*s)) s++;
+        while(*s && isspace((unsigned char)*s)) *s++ = '\0';
+
+        if(!strcasecmp(key, "off")) {
+            *warn_repeat_every = 0;
+            *crit_repeat_every = 0;
+            return 1;
+        }
+
+        uint32_t *target = NULL;
+        if(!strcasecmp(key, "warning"))
+            target = warn_repeat_every;
+        else if(!strcasecmp(key, "critical"))
+            target = crit_repeat_every;
+
+        if(target) {
+            // Parse into a signed temporary instead of writing through an
+            // (int *) cast of the caller's uint32_t: a negative duration
+            // would otherwise be stored as a huge unsigned interval, and a
+            // rejected value could overwrite the current setting.
+            int seconds = 0;
+            if (!config_parse_duration(value, &seconds) || seconds < 0) {
+                error("Health configuration at line %zu of file '%s': invalid value '%s' for '%s' keyword",
+                      line, file, value, key);
+            }
+            else
+                *target = (uint32_t)seconds;
+        }
+    }
+
+    return 1;
+}
+
+
static inline int health_parse_db_lookup(
size_t line, const char *filename, char *string,
RRDR_GROUPING *group_method, int *after, int *before, int *every,
@@ -322,7 +329,7 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
- if(!health_parse_duration(key, after)) {
+ if(!config_parse_duration(key, after)) {
error("Health configuration at line %zu of file '%s': invalid duration '%s' after group method",
line, filename, key);
return 0;
@@ -343,7 +350,7 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
- if (!health_parse_duration(value, before)) {
+ if (!config_parse_duration(value, before)) {
error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
line, filename, value, key);
}
@@ -353,7 +360,7 @@ static inline int health_parse_db_lookup(
while(*s && !isspace(*s)) s++;
while(*s && isspace(*s)) *s++ = '\0';
- if (!health_parse_duration(value, every)) {
+ if (!config_parse_duration(value, every)) {
error("Health configuration at line %zu of file '%s': invalid duration '%s' for '%s' keyword",
line, filename, value, key);
}
@@ -430,7 +437,8 @@ static int health_readfile(const char *filename, void *data) {
hash_info = 0,
hash_recipient = 0,
hash_delay = 0,
- hash_options = 0;
+ hash_options = 0,
+ hash_repeat = 0;
char buffer[HEALTH_CONF_MAX_LINE + 1];
@@ -454,6 +462,7 @@ static int health_readfile(const char *filename, void *data) {
hash_recipient = simple_hash(HEALTH_RECIPIENT_KEY);
hash_delay = simple_uhash(HEALTH_DELAY_KEY);
hash_options = simple_uhash(HEALTH_OPTIONS_KEY);
+ hash_repeat = simple_uhash(HEALTH_REPEAT_KEY);
}
FILE *fp = fopen(filename, "r");
@@ -481,7 +490,7 @@ static int health_readfile(const char *filename, void *data) {
if(append < HEALTH_CONF_MAX_LINE)
continue;
else {
- error("Health configuration has too long muli-line at line %zu of file '%s'.", line, filename);
+ error("Health configuration has too long multi-line at line %zu of file '%s'.", line, filename);
}
}
append = 0;
@@ -532,6 +541,9 @@ static int health_readfile(const char *filename, void *data) {
rc->value = NAN;
rc->old_value = NAN;
rc->delay_multiplier = 1.0;
+ rc->old_status = RRDCALC_STATUS_UNINITIALIZED;
+ rc->warn_repeat_every = host->health_default_warn_repeat_every;
+ rc->crit_repeat_every = host->health_default_crit_repeat_every;
if(rrdvar_fix_name(rc->name))
error("Health configuration renamed alarm '%s' to '%s'", value, rc->name);
@@ -556,6 +568,8 @@ static int health_readfile(const char *filename, void *data) {
rt->green = NAN;
rt->red = NAN;
rt->delay_multiplier = 1.0;
+ rt->warn_repeat_every = host->health_default_warn_repeat_every;
+ rt->crit_repeat_every = host->health_default_crit_repeat_every;
if(rrdvar_fix_name(rt->name))
error("Health configuration renamed template '%s' to '%s'", value, rt->name);
@@ -612,7 +626,7 @@ static int health_readfile(const char *filename, void *data) {
&rc->options, &rc->dimensions);
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
- if(!health_parse_duration(value, &rc->update_every))
+ if(!config_parse_duration(value, &rc->update_every))
error("Health configuration at line %zu of file '%s' for alarm '%s' at key '%s' cannot parse duration: '%s'.",
line, filename, rc->name, key, value);
}
@@ -707,6 +721,11 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
rc->options |= health_parse_options(value);
}
+ else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
+ health_parse_repeat(line, filename, value,
+ &rc->warn_repeat_every,
+ &rc->crit_repeat_every);
+ }
else {
error("Health configuration at line %zu of file '%s' for alarm '%s' has unknown key '%s'.",
line, filename, rc->name, key);
@@ -736,7 +755,7 @@ static int health_readfile(const char *filename, void *data) {
&rt->update_every, &rt->options, &rt->dimensions);
}
else if(hash == hash_every && !strcasecmp(key, HEALTH_EVERY_KEY)) {
- if(!health_parse_duration(value, &rt->update_every))
+ if(!config_parse_duration(value, &rt->update_every))
error("Health configuration at line %zu of file '%s' for template '%s' at key '%s' cannot parse duration: '%s'.",
line, filename, rt->name, key, value);
}
@@ -831,6 +850,11 @@ static int health_readfile(const char *filename, void *data) {
else if(hash == hash_options && !strcasecmp(key, HEALTH_OPTIONS_KEY)) {
rt->options |= health_parse_options(value);
}
+ else if(hash == hash_repeat && !strcasecmp(key, HEALTH_REPEAT_KEY)){
+ health_parse_repeat(line, filename, value,
+ &rt->warn_repeat_every,
+ &rt->crit_repeat_every);
+ }
else {
error("Health configuration at line %zu of file '%s' for template '%s' has unknown key '%s'.",
line, filename, rt->name, key);
diff --git a/health/health_json.c b/health/health_json.c
index 78113244..e923b05c 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -140,6 +140,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
"\t\t\t\"delay_multiplier\": %f,\n"
"\t\t\t\"delay\": %d,\n"
"\t\t\t\"delay_up_to_timestamp\": %lu,\n"
+ "\t\t\t\"warn_repeat_every\": \"%u\",\n"
+ "\t\t\t\"crit_repeat_every\": \"%u\",\n"
"\t\t\t\"value_string\": \"%s\",\n"
, rc->chart, rc->name
, (unsigned long)rc->id
@@ -165,6 +167,8 @@ static inline void health_rrdcalc2json_nolock(RRDHOST *host, BUFFER *wb, RRDCALC
, rc->delay_multiplier
, rc->delay_last
, (unsigned long)rc->delay_up_to_timestamp
+ , rc->warn_repeat_every
+ , rc->crit_repeat_every
, value_string
);
diff --git a/health/health_log.c b/health/health_log.c
index 009e4267..c91cde6c 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -79,6 +79,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
"\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s"
"\t%d\t%d\t%d\t%d"
"\t" CALCULATED_NUMBER_FORMAT_AUTO "\t" CALCULATED_NUMBER_FORMAT_AUTO
+ "\t%016lx"
"\n"
, (ae->flags & HEALTH_ENTRY_FLAG_SAVED)?'U':'A'
, host->hostname
@@ -112,6 +113,7 @@ inline void health_alarm_log_save(RRDHOST *host, ALARM_ENTRY *ae) {
, ae->new_value
, ae->old_value
+ , (uint64_t)ae->last_repeat
) < 0))
error("HEALTH [%s]: failed to save alarm log entry to '%s'. Health data may be lost in case of abnormal restart.", host->hostname, host->health_log_filename);
else {
@@ -174,10 +176,40 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
continue;
}
+ // Check if we got last_repeat field
+ time_t last_repeat = 0;
+ if(entries > 27) {
+ char* alarm_name = pointers[13];
+ last_repeat = (time_t)strtoul(pointers[27], NULL, 16);
+
+ RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
+ if (!rc) {
+ for(rc = host->alarms; rc ; rc = rc->next) {
+ RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc);
+ if(rdcmp != rc) {
+ error("Cannot insert the alarm index ID using log %s", rc->name);
+ }
+ }
+
+ rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
+ }
+
+ if(unlikely(rc)) {
+ if (rrdcalc_isrepeating(rc)) {
+ rc->last_repeat = last_repeat;
+ // We iterate through repeating alarm entries only to
+ // find the latest last_repeat timestamp. Otherwise,
+ // there is no need to keep them in memory.
+ continue;
+ }
+ }
+ }
+
if(unlikely(*pointers[0] == 'A')) {
// make sure it is properly numbered
if(unlikely(host->health_log.alarms && unique_id < host->health_log.alarms->unique_id)) {
- error("HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it.", host->hostname, line, filename, unique_id);
+ error( "HEALTH [%s]: line %zu of file '%s' has alarm log entry %u in wrong order. Ignoring it."
+ , host->hostname, line, filename, unique_id);
errored++;
continue;
}
@@ -186,11 +218,11 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
}
else if(unlikely(*pointers[0] == 'U')) {
// find the original
- for(ae = host->health_log.alarms; ae; ae = ae->next) {
+ for(ae = host->health_log.alarms; ae ; ae = ae->next) {
if(unlikely(unique_id == ae->unique_id)) {
if(unlikely(*pointers[0] == 'A')) {
error("HEALTH [%s]: line %zu of file '%s' adds duplicate alarm log entry %u. Using the later."
- , host->hostname, line, filename, unique_id);
+ , host->hostname, line, filename, unique_id);
*pointers[0] = 'U';
duplicate++;
}
@@ -270,6 +302,8 @@ inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char *filena
ae->new_value = str2l(pointers[25]);
ae->old_value = str2l(pointers[26]);
+ ae->last_repeat = last_repeat;
+
char value_string[100 + 1];
freez(ae->old_value_string);
freez(ae->new_value_string);
@@ -339,7 +373,7 @@ inline void health_alarm_log_load(RRDHOST *host) {
// ----------------------------------------------------------------------------
// health alarm log management
-inline void health_alarm_log(
+inline ALARM_ENTRY* health_create_alarm_entry(
RRDHOST *host,
uint32_t alarm_id,
uint32_t alarm_event_id,
@@ -398,9 +432,24 @@ inline void health_alarm_log(
ae->delay_up_to_timestamp = when + delay;
ae->flags |= flags;
+ ae->last_repeat = 0;
+
if(ae->old_status == RRDCALC_STATUS_WARNING || ae->old_status == RRDCALC_STATUS_CRITICAL)
ae->non_clear_duration += ae->duration;
+ return ae;
+}
+
+inline void health_alarm_log(
+ RRDHOST *host,
+ ALARM_ENTRY *ae
+) {
+ debug(D_HEALTH, "Health adding alarm log entry with id: %u", ae->unique_id);
+
+ if(unlikely(alarm_entry_isrepeating(host, ae))) {
+ error("Repeating alarms cannot be added to host's alarm log entries. It seems somewhere in the logic, API is being misused. Alarm id: %u", ae->alarm_id);
+ return;
+ }
// link it
netdata_rwlock_wrlock(&host->health_log.alarm_log_rwlock);
ae->next = host->health_log.alarms;
diff --git a/health/notifications/README.md b/health/notifications/README.md
index 5b7b4340..8c7ab66f 100644
--- a/health/notifications/README.md
+++ b/health/notifications/README.md
@@ -58,6 +58,9 @@ export NETDATA_ALARM_NOTIFY_DEBUG=1
# send test alarms to any role
/usr/libexec/netdata/plugins.d/alarm-notify.sh test "ROLE"
```
+
+Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`). You can always find the location of the alarm-notify.sh script in `netdata.conf`.
+
If you need to dig even deeper, you can trace the execution with `bash -x`. Note that in test mode, alarm-notify.sh calls itself with many more arguments. So first do
```sh
bash -x /usr/libexec/netdata/plugins.d/alarm-notify.sh test
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index ff4b3f3d..852718bc 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -189,6 +189,7 @@ fi
[ -z "${NETDATA_STOCK_CONFIG_DIR}" ] && NETDATA_STOCK_CONFIG_DIR="@libconfigdir_POST@"
[ -z "${NETDATA_CACHE_DIR}" ] && NETDATA_CACHE_DIR="@cachedir_POST@"
[ -z "${NETDATA_REGISTRY_URL}" ] && NETDATA_REGISTRY_URL="https://registry.my-netdata.io"
+[ -z "${NETDATA_REGISTRY_CLOUD_BASE_URL}" ] && NETDATA_REGISTRY_CLOUD_BASE_URL="https://netdata.cloud"
# -----------------------------------------------------------------------------
# parse command line parameters
@@ -681,7 +682,7 @@ date=$(date --date=@${when} "${date_format}" 2>/dev/null)
# ----------------------------------------------------------------------------
# prepare some extra headers if we've been asked to thread e-mails
if [ "${SEND_EMAIL}" == "YES" ] && [ "${EMAIL_THREADING}" != "NO" ]; then
- email_thread_headers="In-Reply-To: <${chart}-${name}@${host}>\\nReferences: <${chart}-${name}@${host}>"
+ email_thread_headers="In-Reply-To: <${chart}-${name}@${host}>\\r\\nReferences: <${chart}-${name}@${host}>"
else
email_thread_headers=
fi
@@ -1790,7 +1791,7 @@ if [ "${NETDATA_REGISTRY_URL}" == "https://registry.my-netdata.io" ]; then
NETDATA_REGISTRY_UNIQUE_ID="$(cat "@registrydir_POST@/netdata.public.unique.id")"
fi
fi
- if [ ! -z "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then
+ if [ -n "${NETDATA_REGISTRY_UNIQUE_ID}" ]; then
GOTOCLOUD=1
fi
fi
@@ -1798,7 +1799,7 @@ fi
if [ ${GOTOCLOUD} -eq 0 ]; then
goto_url="${NETDATA_REGISTRY_URL}/goto-host-from-alarm.html?${redirect_params}"
else
- goto_url="https://netdata.cloud/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}"
+ goto_url="${NETDATA_REGISTRY_CLOUD_BASE_URL}/alarms/redirect?agentID=${NETDATA_REGISTRY_UNIQUE_ID}&${redirect_params}"
fi
# the severity of the alarm
@@ -1953,7 +1954,7 @@ send_pushbullet "${PUSHBULLET_ACCESS_TOKEN}" "${PUSHBULLET_SOURCE_DEVICE}" "${to
Severity: ${severity}\\n
Chart: ${chart}\\n
Family: ${family}\\n
-$(date -d @${when})\\n
+${date}\\n
The source of this alarm is line ${src}"
SENT_PUSHBULLET=$?
diff --git a/health/notifications/custom/README.md b/health/notifications/custom/README.md
index 627dd9d4..eeaad8a6 100644
--- a/health/notifications/custom/README.md
+++ b/health/notifications/custom/README.md
@@ -1,11 +1,13 @@
# Custom
-Netdata allows you to send custom notifications, to any endpoint you choose.
-To configure custom notifications, you will need to define the `custom_sender()` function in `health_alarm_notify.conf`
-You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples.
+Netdata allows you to send custom notifications to any endpoint you choose.
+
+To configure custom notifications, you will need to customize `health_alarm_notify.conf`. You can look at the other senders in `/usr/libexec/netdata/plugins.d/alarm-notify.sh` for examples of how to modify the `custom_sender()` function in `health_alarm_notify.conf`. When changing any configuration file, follow the instructions to [persist your configuration](../../../docs/configuration-guide.md#persist-my-configuration).
+
As with other notifications, you will also need to define the recipient list in `DEFAULT_RECIPIENT_CUSTOM` and/or the `role_recipients_custom` array.
-The following is a sample `custom_sender` function to send an SMS via an imaginary HTTPS endpoint to the SMS gateway:
+The following is a sample `custom_sender` function in `health_alarm_notify.conf`, to send an SMS via an imaginary HTTPS endpoint to the SMS gateway:
+
```
custom_sender() {
# example human readable SMS
@@ -37,45 +39,45 @@ The following is a sample `custom_sender` function to send an SMS via an imagina
Variables available to the custom_sender:
- - ${to_custom} the list of recipients for the alarm
- - ${host} the host generated this event
- - ${url_host} same as ${host} but URL encoded
- - ${unique_id} the unique id of this event
- - ${alarm_id} the unique id of the alarm that generated this event
- - ${event_id} the incremental id of the event, for this alarm id
- - ${when} the timestamp this event occurred
- - ${name} the name of the alarm, as given in netdata health.d entries
- - ${url_name} same as ${name} but URL encoded
- - ${chart} the name of the chart (type.id)
- - ${url_chart} same as ${chart} but URL encoded
- - ${family} the family of the chart
- - ${url_family} same as ${family} but URL encoded
- - ${status} the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
- - ${old_status} the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
- - ${value} the current value of the alarm
- - ${old_value} the previous value of the alarm
- - ${src} the line number and file the alarm has been configured
- - ${duration} the duration in seconds of the previous alarm state
- - ${duration_txt} same as ${duration} for humans
- - ${non_clear_duration} the total duration in seconds this is/was non-clear
- - ${non_clear_duration_txt} same as ${non_clear_duration} for humans
- - ${units} the units of the value
- - ${info} a short description of the alarm
- - ${value_string} friendly value (with units)
- - ${old_value_string} friendly old value (with units)
- - ${image} the URL of an image to represent the status of the alarm
- - ${color} a color in #AABBCC format for the alarm
- - ${goto_url} the URL the user can click to see the netdata dashboard
- - ${calc_expression} the expression evaluated to provide the value for the alarm
- - ${calc_param_values} the value of the variables in the evaluated expression
- - ${total_warnings} the total number of alarms in WARNING state on the host
- - ${total_critical} the total number of alarms in CRITICAL state on the host
+ - `${to_custom}` the list of recipients for the alarm
+ - `${host}` the host that generated this event
+ - `${url_host}` same as `${host}` but URL encoded
+ - `${unique_id}` the unique id of this event
+ - `${alarm_id}` the unique id of the alarm that generated this event
+ - `${event_id}` the incremental id of the event, for this alarm id
+ - `${when}` the timestamp this event occurred
+ - `${name}` the name of the alarm, as given in netdata health.d entries
+ - `${url_name}` same as `${name}` but URL encoded
+ - `${chart}` the name of the chart (type.id)
+ - `${url_chart}` same as `${chart}` but URL encoded
+ - `${family}` the family of the chart
+ - `${url_family}` same as `${family}` but URL encoded
+ - `${status}` the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+ - `${old_status}` the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+ - `${value}` the current value of the alarm
+ - `${old_value}` the previous value of the alarm
+ - `${src}` the line number and file in which the alarm has been configured
+ - `${duration}` the duration in seconds of the previous alarm state
+ - `${duration_txt}` same as `${duration}` for humans
+ - `${non_clear_duration}` the total duration in seconds this is/was non-clear
+ - `${non_clear_duration_txt}` same as `${non_clear_duration}` for humans
+ - `${units}` the units of the value
+ - `${info}` a short description of the alarm
+ - `${value_string}` friendly value (with units)
+ - `${old_value_string}` friendly old value (with units)
+ - `${image}` the URL of an image to represent the status of the alarm
+ - `${color}` a color in #AABBCC format for the alarm
+ - `${goto_url}` the URL the user can click to see the netdata dashboard
+ - `${calc_expression}` the expression evaluated to provide the value for the alarm
+ - `${calc_param_values}` the value of the variables in the evaluated expression
+ - `${total_warnings}` the total number of alarms in WARNING state on the host
+ - `${total_critical}` the total number of alarms in CRITICAL state on the host
The following are more human friendly:
- - ${alarm} like "name = value units"
- - ${status_message} like "needs attention", "recovered", "is critical"
- - ${severity} like "Escalated to CRITICAL", "Recovered from WARNING"
- - ${raised_for} like "(alarm was raised for 10 minutes)"
+ - `${alarm}` like "name = value units"
+ - `${status_message}` like "needs attention", "recovered", "is critical"
+ - `${severity}` like "Escalated to CRITICAL", "Recovered from WARNING"
+ - `${raised_for}` like "(alarm was raised for 10 minutes)"
[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Fcustom%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]()
diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md
index 163839b6..84a9e0ce 100644
--- a/health/notifications/email/README.md
+++ b/health/notifications/email/README.md
@@ -30,4 +30,6 @@ sudo su -s /bin/bash netdata
Where `[ROLE]` is the role you want to test. The default (if you don't give a `[ROLE]`) is `sysadmin`.
+Note that in versions before 1.16, the plugins.d directory may be installed in a different location in certain OSs (e.g. under `/usr/lib/netdata`). You can always find the location of the alarm-notify.sh script in `netdata.conf`.
+
[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fhealth%2Fnotifications%2Femail%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]()