summary | refs | log | tree | commit | diff | stats
path: root/health
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2021-03-31 12:58:11 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2021-03-31 12:58:11 +0000
commit f99c4526d94d3e04124c5c48ab4a3da6ca53a458 (patch)
tree a2ed8860030cc49f492b09b3222d593c65619800 /health
parent Adding upstream version 1.29.3. (diff)
download netdata-f99c4526d94d3e04124c5c48ab4a3da6ca53a458.tar.xz
netdata-f99c4526d94d3e04124c5c48ab4a3da6ca53a458.zip
Adding upstream version 1.30.0. [tag: upstream/1.30.0]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health')
-rw-r--r-- health/Makefile.am 1
-rw-r--r-- health/health.c 6
-rw-r--r-- health/health.d/adaptec_raid.conf 16
-rw-r--r-- health/health.d/anomalies.conf 8
-rw-r--r-- health/health.d/apcupsd.conf 8
-rw-r--r-- health/health.d/apps_plugin.conf 15
-rw-r--r-- health/health.d/backend.conf 25
-rw-r--r-- health/health.d/bcache.conf 14
-rw-r--r-- health/health.d/beanstalkd.conf 10
-rw-r--r-- health/health.d/bind_rndc.conf 4
-rw-r--r-- health/health.d/boinc.conf 8
-rw-r--r-- health/health.d/btrfs.conf 9
-rw-r--r-- health/health.d/ceph.conf 13
-rw-r--r-- health/health.d/cgroups.conf 17
-rw-r--r-- health/health.d/cockroachdb.conf 6
-rw-r--r-- health/health.d/cpu.conf 8
-rw-r--r-- health/health.d/dbengine.conf 10
-rw-r--r-- health/health.d/disks.conf 104
-rw-r--r-- health/health.d/dns_query.conf 2
-rw-r--r-- health/health.d/dnsmasq_dhcp.conf 4
-rw-r--r-- health/health.d/dockerd.conf 2
-rw-r--r-- health/health.d/entropy.conf 4
-rw-r--r-- health/health.d/exporting.conf 11
-rw-r--r-- health/health.d/fping.conf 13
-rw-r--r-- health/health.d/gearman.conf 4
-rw-r--r-- health/health.d/haproxy.conf 4
-rw-r--r-- health/health.d/hdfs.conf 10
-rw-r--r-- health/health.d/httpcheck.conf 42
-rw-r--r-- health/health.d/ioping.conf 4
-rw-r--r-- health/health.d/ipc.conf 4
-rw-r--r-- health/health.d/ipfs.conf 2
-rw-r--r-- health/health.d/ipmi.conf 4
-rw-r--r-- health/health.d/isc_dhcpd.conf 20
-rw-r--r-- health/health.d/kubelet.conf 43
-rw-r--r-- health/health.d/linux_power_supply.conf 2
-rw-r--r-- health/health.d/load.conf 20
-rw-r--r-- health/health.d/mdstat.conf 7
-rw-r--r-- health/health.d/megacli.conf 68
-rw-r--r-- health/health.d/memcached.conf 13
-rw-r--r-- health/health.d/memory.conf 4
-rw-r--r-- health/health.d/mysql.conf 26
-rw-r--r-- health/health.d/net.conf 97
-rw-r--r-- health/health.d/netfilter.conf 19
-rw-r--r-- health/health.d/pihole.conf 10
-rw-r--r-- health/health.d/portcheck.conf 12
-rw-r--r-- health/health.d/processes.conf 6
-rw-r--r-- health/health.d/ram.conf 11
-rw-r--r-- health/health.d/redis.conf 4
-rw-r--r-- health/health.d/retroshare.conf 2
-rw-r--r-- health/health.d/riakkv.conf 38
-rw-r--r-- health/health.d/scaleio.conf 4
-rw-r--r-- health/health.d/softnet.conf 13
-rw-r--r-- health/health.d/swap.conf 22
-rw-r--r-- health/health.d/synchronization.conf 12
-rw-r--r-- health/health.d/tcp_conn.conf 4
-rw-r--r-- health/health.d/tcp_listen.conf 9
-rw-r--r-- health/health.d/tcp_mem.conf 4
-rw-r--r-- health/health.d/tcp_orphans.conf 4
-rw-r--r-- health/health.d/tcp_resets.conf 28
-rw-r--r-- health/health.d/udp_errors.conf 29
-rw-r--r-- health/health.d/unbound.conf 4
-rw-r--r-- health/health.d/vcsa.conf 24
-rw-r--r-- health/health.d/vernemq.conf 319
-rw-r--r-- health/health.d/vsphere.conf 62
-rw-r--r-- health/health.d/web_log.conf 50
-rw-r--r-- health/health.d/whoisquery.conf 2
-rw-r--r-- health/health.d/wmi.conf 24
-rw-r--r-- health/health.d/x509check.conf 4
-rw-r--r-- health/health.d/zfs.conf 2
-rw-r--r-- health/health.h 2
-rw-r--r-- health/health_config.c 2
-rw-r--r-- health/health_json.c 19
-rw-r--r-- health/health_log.c 6
-rwxr-xr-x health/notifications/alarm-notify.sh.in 35
-rw-r--r-- health/notifications/email/README.md 2
-rwxr-xr-x health/notifications/health_alarm_notify.conf 4
-rw-r--r-- health/notifications/stackpulse/README.md 5
77 files changed, 682 insertions, 807 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index 399d6df5..0802dc75 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -29,7 +29,6 @@ dist_healthconfig_DATA = \
health.d/anomalies.conf \
health.d/apache.conf \
health.d/apcupsd.conf \
- health.d/apps_plugin.conf \
health.d/backend.conf \
health.d/bcache.conf \
health.d/beanstalkd.conf \
diff --git a/health/health.c b/health/health.c
index b81361e8..0793100a 100644
--- a/health/health.c
+++ b/health/health.c
@@ -966,12 +966,14 @@ void *health_main(void *ptr) {
} else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
- repeat_every = rc->crit_repeat_every;
+ repeat_every = 1;
} else if (rc->old_status == RRDCALC_STATUS_WARNING) {
- repeat_every = rc->warn_repeat_every;
+ repeat_every = 1;
}
}
}
+ } else {
+ continue;
}
if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
index a1301ce8..0753c6e5 100644
--- a/health/health.d/adaptec_raid.conf
+++ b/health/health.d/adaptec_raid.conf
@@ -1,24 +1,24 @@
# logical device status check
-template: adapter_raid_ld_status
- on: adapter_raid.ld_status
- lookup: max -5s
+template: adaptec_raid_ld_status
+ on: adaptec_raid.ld_status
+ lookup: max -10s foreach *
units: bool
every: 10s
crit: $this > 0
delay: down 5m multiplier 1.5 max 1h
- info: at least 1 logical device is failed or degraded
+ info: logical device status is failed or degraded
to: sysadmin
# physical device state check
-template: adapter_raid_pd_state
- on: adapter_raid.pd_state
- lookup: max -5s
+template: adaptec_raid_pd_state
+ on: adaptec_raid.pd_state
+ lookup: max -10s foreach *
units: bool
every: 10s
crit: $this > 0
delay: down 5m multiplier 1.5 max 1h
- info: at least 1 physical device is not in online state
+ info: physical device state is not online
to: sysadmin
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
index a2d248ef..c4c96eaf 100644
--- a/health/health.d/anomalies.conf
+++ b/health/health.d/anomalies.conf
@@ -1,17 +1,17 @@
# raise a warning alarm if an anomaly probability is consistently above 50%
-template: anomaly_probabilities
+template: anomalies_anomaly_probabilities
on: anomalies.probability
lookup: average -2m foreach *
every: 1m
warn: $this > 50
- info: average anomaly probability > 50% for last 2 minutes
+ info: average anomaly probability over the last 2 minutes
# raise a warning alarm if an anomaly flag is consistently firing
-template: anomaly_flags
+template: anomalies_anomaly_flags
on: anomalies.anomaly
lookup: sum -2m foreach *
every: 1m
warn: $this > 10
- info: count of anomalies > 10 for last 2 minutes
+ info: number of anomalies in the last 2 minutes
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 4f86037b..12384fac 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -1,6 +1,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
-template: 10min_ups_load
+template: apcupsd_10min_ups_load
on: apcupsd.load
os: *
hosts: *
@@ -10,12 +10,12 @@ template: 10min_ups_load
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 10m multiplier 1.5 max 1h
- info: average UPS load for the last 10 minutes
+ info: average UPS load over the last 10 minutes
to: sitemgr
# Discussion in https://github.com/netdata/netdata/pull/3928:
# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
-template: ups_charge
+template: apcupsd_ups_charge
on: apcupsd.charge
os: *
hosts: *
@@ -25,7 +25,7 @@ template: ups_charge
warn: $this < 100
crit: $this < (($status == $CRITICAL) ? (60) : (50))
delay: down 10m multiplier 1.5 max 1h
- info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors
+ info: average UPS charge over the last minute
to: sitemgr
template: apcupsd_last_collected_secs
diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf
deleted file mode 100644
index 9a27bc6b..00000000
--- a/health/health.d/apps_plugin.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-# disabled due to https://github.com/netdata/netdata/issues/10327
-#
-# alarm: used_file_descriptors
-# on: apps.files
-# hosts: *
-# calc: $fdperc
-# units: %
-# every: 5s
-# warn: $this > (($status >= $WARNING) ? (75) : (80))
-# crit: $this > (($status == $CRITICAL) ? (85) : (90))
-# delay: down 5m multiplier 1.5 max 1h
-# info: Peak percentage of file descriptors used
-# to: sysadmin
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index e51b8aa5..8089dc94 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -6,7 +6,7 @@
every: 1m
warn: $this > 0
delay: down 5m multiplier 1.5 max 1h
- info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
+ info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
to: sysadmin
# make sure we are sending data to backend
@@ -31,26 +31,3 @@
delay: down 5m multiplier 1.5 max 1h
info: percentage of metrics sent to the backend server
to: dba
-
- alarm: backend_metrics_lost
- on: netdata.backend_metrics
- units: metrics
- calc: abs($lost)
- every: 10s
- crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
- delay: down 5m multiplier 1.5 max 1h
- info: number of metrics lost due to repeating failures to contact the backend server
- to: dba
-
-
-# this chart has been removed from netdata
-# alarm: backend_slow
-# on: netdata.backend_latency
-# units: %
-# calc: $latency * 100 / ($update_every * 1000)
-# every: 10s
-# warn: $this > 50
-# crit: $this > 100
-# delay: down 5m multiplier 1.5 max 1h
-# info: the percentage of time between iterations needed by the backend time to process the data sent by netdata
-# to: dba
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index f0da9ac5..d5fccf4f 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -1,13 +1,14 @@
template: bcache_cache_errors
on: disk.bcache_cache_read_races
- lookup: sum -10m unaligned absolute
+ lookup: sum -1m unaligned absolute
units: errors
every: 1m
warn: $this > 0
- crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) )
- delay: down 1h multiplier 1.5 max 2h
- info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing)
+ delay: up 2m down 1h multiplier 1.5 max 2h
+ info: number of times data was read from the cache, \
+ the bucket was reused and invalidated in the last 10 minutes \
+ (when this occurs the data is reread from the backing device)
to: sysadmin
template: bcache_cache_dirty
@@ -16,7 +17,8 @@ template: bcache_cache_dirty
units: %
every: 1m
warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
- crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+ crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: up 1m down 1h multiplier 1.5 max 2h
- info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small)
+ info: percentage of cache space used for dirty data and metadata \
+ (this usually means your SSD cache is too small)
to: sysadmin
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 30dc2732..0c428ecb 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -1,6 +1,6 @@
# get the number of buried jobs in all queues
-template: server_buried_jobs
+template: beanstalk_server_buried_jobs
on: beanstalk.current_jobs
calc: $buried
units: jobs
@@ -8,12 +8,14 @@ template: server_buried_jobs
warn: $this > 0
crit: $this > 10
delay: up 0 down 5m multiplier 1.2 max 1h
- info: the number of buried jobs aggregated across all tubes
+ info: number of buried jobs across all tubes. \
+ You need to manually kick them so they can be processed. \
+ Presence of buried jobs in a tube does not affect new jobs.
to: sysadmin
# get the number of buried jobs per queue
-#template: tube_buried_jobs
+#template: beanstalk_tube_buried_jobs
# on: beanstalk.jobs
# calc: $buried
# units: jobs
@@ -26,7 +28,7 @@ template: server_buried_jobs
# get the current number of tubes
-#template: number_of_tubes
+#template: beanstalk_number_of_tubes
# on: beanstalk.current_tubes
# calc: $tubes
# every: 10s
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 4145e77c..5cc7a72f 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -1,9 +1,9 @@
- template: bind_rndc_stats_file_size
+template: bind_rndc_stats_file_size
on: bind_rndc.stats_size
units: megabytes
every: 60
calc: $stats_size
warn: $this > 512
crit: $this > 1024
- info: Bind stats file is very large! Consider to create logrotate conf file for it!
+ info: BIND statistics-file size
to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 43c588db..25b7f199 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -12,7 +12,7 @@ families: *
warn: $this > 0
crit: $this > 1
delay: up 1m down 5m multiplier 1.5 max 1h
- info: the total number of compute errors over the past 10 minutes
+ info: average number of compute errors over the last 10 minutes
to: sysadmin
# Warn on lots of upload errors
@@ -27,7 +27,7 @@ families: *
warn: $this > 0
crit: $this > 1
delay: up 1m down 5m multiplier 1.5 max 1h
- info: the average number of failed uploads over the past 10 minutes
+ info: average number of failed uploads over the last 10 minutes
to: sysadmin
# Warn on the task queue being empty
@@ -42,7 +42,7 @@ families: *
warn: $this < 1
crit: $this < 0.1
delay: up 5m down 10m multiplier 1.5 max 1h
- info: the total number of locally available tasks
+ info: average number of total tasks over the last 10 minutes
to: sysadmin
# Warn on no active tasks with a non-empty queue
@@ -58,5 +58,5 @@ families: *
warn: $this < 1
crit: $this < 0.1
delay: up 5m down 10m multiplier 1.5 max 1h
- info: the total number of active tasks
+ info: average number of active tasks over the last 10 minutes
to: sysadmin
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index b27aa544..93ab8748 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -10,7 +10,7 @@ families: *
warn: $this > (($status >= $WARNING) ? (90) : (95))
crit: $this > (($status == $CRITICAL) ? (95) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
- info: the percentage of allocated BTRFS physical disk space
+ info: percentage of allocated BTRFS physical disk space
to: sysadmin
template: btrfs_data
@@ -24,7 +24,7 @@ families: *
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
delay: up 1m down 15m multiplier 1.5 max 1h
- info: the percentage of used BTRFS data space
+ info: utilization of BTRFS data space
to: sysadmin
template: btrfs_metadata
@@ -38,7 +38,7 @@ families: *
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
delay: up 1m down 15m multiplier 1.5 max 1h
- info: the percentage of used BTRFS metadata space
+ info: utilization of BTRFS metadata space
to: sysadmin
template: btrfs_system
@@ -52,6 +52,5 @@ families: *
warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
delay: up 1m down 15m multiplier 1.5 max 1h
- info: the percentage of used BTRFS system space
+ info: utilization of BTRFS system space
to: sysadmin
-
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index de16f7b6..cdbab0f6 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -1,13 +1,12 @@
# low ceph disk available
-template: cluster_space_usage
+template: ceph_cluster_space_usage
on: ceph.general_usage
- calc: $avail * 100 / ($avail + $used)
+ calc: $used * 100 / ($used + $avail)
units: %
- every: 10s
- warn: $this < 10
- crit: $this < 1
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 5m multiplier 1.2 max 1h
- info: ceph disk usage is almost full
+ info: cluster disk space utilization
to: sysadmin
-
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 79ece53f..c0a16f15 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -11,7 +11,7 @@ template: cgroup_10min_cpu_usage
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: cpu utilization for the last 10 minutes
+ info: average cgroup CPU utilization over the last 10 minutes
to: sysadmin
template: cgroup_ram_in_use
@@ -24,18 +24,5 @@ template: cgroup_ram_in_use
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: RAM used by cgroup
- to: sysadmin
-
-template: cgroup_ram_and_swap_in_use
- on: cgroup.mem_usage
- os: linux
- hosts: *
- calc: ($ram + $swap) * 100 / $memory_and_swap_limit
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- info: RAM and Swap used by cgroup
+ info: cgroup memory utilization
to: sysadmin
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
index 8ab2c9d0..47773d04 100644
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@@ -22,7 +22,7 @@ template: cockroachdb_used_storage_capacity
warn: $this > (($status >= $WARNING) ? (80) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: entire disk usage percentage
+ info: storage capacity utilization
to: dba
template: cockroachdb_used_usable_storage_capacity
@@ -33,7 +33,7 @@ template: cockroachdb_used_usable_storage_capacity
warn: $this > (($status >= $WARNING) ? (80) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: usable space usage percentage
+ info: storage usable space utilization
to: dba
# Replication
@@ -67,7 +67,7 @@ template: cockroachdb_open_file_descriptors_limit
every: 10s
warn: $this > 80
delay: down 15m multiplier 1.5 max 1h
- info: open file descriptors usage percentage
+ info: open file descriptors utilization (against softlimit)
to: dba
# SQL
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index fa818985..32c69f8f 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -11,7 +11,7 @@ template: 10min_cpu_usage
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal)
+ info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
to: sysadmin
template: 10min_cpu_iowait
@@ -24,7 +24,7 @@ template: 10min_cpu_iowait
warn: $this > (($status >= $WARNING) ? (20) : (40))
crit: $this > (($status == $CRITICAL) ? (40) : (50))
delay: down 15m multiplier 1.5 max 1h
- info: average CPU wait I/O for the last 10 minutes
+ info: average CPU iowait time over the last 10 minutes
to: sysadmin
template: 20min_steal_cpu
@@ -37,7 +37,7 @@ template: 20min_steal_cpu
warn: $this > (($status >= $WARNING) ? (5) : (10))
crit: $this > (($status == $CRITICAL) ? (20) : (30))
delay: down 1h multiplier 1.5 max 2h
- info: average CPU steal time for the last 20 minutes
+ info: average CPU steal time over the last 20 minutes
to: sysadmin
## FreeBSD
@@ -51,5 +51,5 @@ template: 10min_cpu_usage
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: average cpu utilization for the last 10 minutes (excluding nice)
+ info: average CPU utilization over the last 10 minutes (excluding nice)
to: sysadmin
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index 274673e3..3e51d37e 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -10,7 +10,7 @@ lookup: sum -10m unaligned of fs_errors
every: 10s
crit: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
+ info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
to: sysadmin
alarm: 10min_dbengine_global_io_errors
@@ -22,7 +22,7 @@ lookup: sum -10m unaligned of io_errors
every: 10s
crit: $this > 0
delay: down 1h multiplier 1.5 max 3h
- info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
+ info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
to: sysadmin
alarm: 10min_dbengine_global_flushing_warnings
@@ -34,7 +34,8 @@ lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
every: 10s
warn: $this > 0
delay: down 1h multiplier 1.5 max 3h
- info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks
+ info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
+ Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
to: sysadmin
alarm: 10min_dbengine_global_flushing_errors
@@ -46,5 +47,6 @@ lookup: sum -10m unaligned of flushing_pressure_deletions
every: 10s
crit: $this != 0
delay: down 1h multiplier 1.5 max 3h
- info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks
+ info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
+ Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 9c194ced..d0cd60cf 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -20,7 +20,7 @@ families: !/dev !/dev/* !/run !/run/* *
warn: $this > (($status >= $WARNING ) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
- info: current disk space usage
+ info: disk space utilization
to: sysadmin
template: disk_inode_usage
@@ -34,7 +34,7 @@ families: !/dev !/dev/* !/run !/run/* *
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
- info: current disk inode usage
+ info: disk inode utilization
to: sysadmin
@@ -49,35 +49,35 @@ families: !/dev !/dev/* !/run !/run/* *
# we will use it in the next template to find
# the hours remaining
-template: disk_fill_rate
- on: disk.space
- os: linux freebsd
- hosts: *
-families: *
- lookup: min -10m at -50m unaligned of avail
- calc: ($this - $avail) / (($now - $after) / 3600)
- every: 1m
- units: GB/hour
- info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
+# template: disk_fill_rate
+# on: disk.space
+# os: linux freebsd
+# hosts: *
+# families: *
+# lookup: min -10m at -50m unaligned of avail
+# calc: ($this - $avail) / (($now - $after) / 3600)
+# every: 1m
+# units: GB/hour
+# info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
# calculate the hours remaining
# if the disk continues to fill
# in this rate
-template: out_of_disk_space_time
- on: disk.space
- os: linux freebsd
- hosts: *
-families: *
- calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
- units: hours
- every: 10s
- warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
- crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
- delay: down 15m multiplier 1.2 max 1h
- info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
- to: sysadmin
+# template: out_of_disk_space_time
+# on: disk.space
+# os: linux freebsd
+# hosts: *
+# families: *
+# calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+# units: hours
+# every: 10s
+# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+# delay: down 15m multiplier 1.2 max 1h
+# info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+# to: sysadmin
# -----------------------------------------------------------------------------
@@ -91,34 +91,34 @@ families: *
# we will use it in the next template to find
# the hours remaining
-template: disk_inode_rate
- on: disk.inodes
- os: linux freebsd
- hosts: *
-families: *
- lookup: min -10m at -50m unaligned of avail
- calc: ($this - $avail) / (($now - $after) / 3600)
- every: 1m
- units: inodes/hour
- info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+# template: disk_inode_rate
+# on: disk.inodes
+# os: linux freebsd
+# hosts: *
+# families: *
+# lookup: min -10m at -50m unaligned of avail
+# calc: ($this - $avail) / (($now - $after) / 3600)
+# every: 1m
+# units: inodes/hour
+# info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
# calculate the hours remaining
# if the disk inodes are allocated
# in this rate
-template: out_of_disk_inodes_time
- on: disk.inodes
- os: linux freebsd
- hosts: *
-families: *
- calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
- units: hours
- every: 10s
- warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
- crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
- delay: down 15m multiplier 1.2 max 1h
- info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
- to: sysadmin
+# template: out_of_disk_inodes_time
+# on: disk.inodes
+# os: linux freebsd
+# hosts: *
+# families: *
+# calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+# units: hours
+# every: 10s
+# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+# delay: down 15m multiplier 1.2 max 1h
+# info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+# to: sysadmin
# -----------------------------------------------------------------------------
@@ -141,8 +141,8 @@ families: *
warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
- info: the percentage of time the disk was busy, during the last 10 minutes
- to: sysadmin
+ info: average percentage of time the disk was busy over the last 10 minutes
+ to: silent
# raise an alarm if the disk backlog
@@ -163,5 +163,5 @@ families: *
warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
delay: down 15m multiplier 1.2 max 1h
- info: average of the kernel estimated disk backlog, for the last 10 minutes
- to: sysadmin
+ info: average disk backlog size over the last 10 minutes
+ to: silent
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index 113c950e..64770b98 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -8,5 +8,5 @@ template: dns_query_time_query_time
every: 10s
warn: $this == nan
delay: up 20s down 5m multiplier 1.5 max 1h
- info: query round trip time
+ info: average DNS query round trip time over the last 10 seconds
to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index ecf3b84a..dff1f07d 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -6,7 +6,7 @@ template: dnsmasq_dhcp_dhcp_range_utilization
units: %
calc: $used
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
- crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+ crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: down 5m
- info: dhcp-range utilization above threshold!
+ info: DHCP range utilization
to: sysadmin
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
index 729906cd..122d82b8 100644
--- a/health/health.d/dockerd.conf
+++ b/health/health.d/dockerd.conf
@@ -4,5 +4,5 @@ template: docker_unhealthy_containers
every: 10s
lookup: average -10s
crit: $this > 0
- info: number of unhealthy containers
+ info: average number of unhealthy docker containers over the last 10 seconds
to: sysadmin
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
index 66d44ec1..0be9d45b 100644
--- a/health/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
@@ -7,10 +7,10 @@
on: system.entropy
os: linux
hosts: *
- lookup: min -10m unaligned
+ lookup: min -5m unaligned
units: entries
every: 5m
warn: $this < (($status >= $WARNING) ? (200) : (100))
delay: down 1h multiplier 1.5 max 2h
- info: minimum entries in the random numbers pool in the last 10 minutes
+ info: minimum number of entries in the random numbers pool in the last 5 minutes
to: silent
diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf
index 506cb0cf..735fb5ae 100644
--- a/health/health.d/exporting.conf
+++ b/health/health.d/exporting.conf
@@ -21,14 +21,3 @@ families: *
delay: down 5m multiplier 1.5 max 1h
info: percentage of metrics sent to the external database server
to: dba
-
-template: exporting_metrics_lost
-families: *
- on: exporting_data_size
- units: metrics
- calc: abs($lost)
- every: 10s
- crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
- delay: down 5m multiplier 1.5 max 1h
- info: number of metrics lost due to repeating failures to contact the external database server
- to: dba
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
index 43658fef..92c1525b 100644
--- a/health/health.d/fping.conf
+++ b/health/health.d/fping.conf
@@ -11,18 +11,18 @@ families: *
info: number of seconds since the last successful data collection
to: sysadmin
-template: host_reachable
+template: fping_host_reachable
families: *
on: fping.latency
calc: $average != nan
units: up/down
every: 10s
crit: $this == 0
- info: states if the remote host is reachable
delay: down 30m multiplier 1.5 max 2h
+ info: reachability status of the network host (0: unreachable, 1: reachable)
to: sysadmin
-template: host_latency
+template: fping_host_latency
families: *
on: fping.latency
lookup: average -10s unaligned of average
@@ -32,11 +32,11 @@ families: *
red: 1000
warn: $this > $green OR $max > $red
crit: $this > $red
- info: average round trip delay during the last 10 seconds
delay: down 30m multiplier 1.5 max 2h
+ info: average latency to the network host over the last 10 seconds
to: sysadmin
-template: packet_loss
+template: fping_packet_loss
families: *
on: fping.quality
lookup: average -10m unaligned of returned
@@ -47,7 +47,6 @@ families: *
every: 10s
warn: $this > $green
crit: $this > $red
- info: packet loss percentage
delay: down 30m multiplier 1.5 max 2h
+ info: packet loss ratio to the network host over the last 10 minutes
to: sysadmin
-
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index e3863ae5..d148f7b7 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -18,5 +18,5 @@ template: gearman_workers_queued
warn: $this > 30000
crit: $this > 100000
delay: down 5m multiplier 1.5 max 1h
- info: number of queued jobs
- to: sysadmin \ No newline at end of file
+ info: average number of queued jobs over the last 10 minutes
+ to: sysadmin
diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf
index e49c70d4..9cd07066 100644
--- a/health/health.d/haproxy.conf
+++ b/health/health.d/haproxy.conf
@@ -4,7 +4,7 @@ template: haproxy_backend_server_status
every: 10s
lookup: average -10s
crit: $this > 0
- info: number of failed haproxy backend servers
+ info: average number of failed haproxy backend servers over the last 10 seconds
to: sysadmin
template: haproxy_backend_status
@@ -13,7 +13,7 @@ template: haproxy_backend_status
every: 10s
lookup: average -10s
crit: $this > 0
- info: number of failed haproxy backends
+ info: average number of failed haproxy backends over the last 10 seconds
to: sysadmin
template: haproxy_last_collected
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
index 678faab4..7345df4d 100644
--- a/health/health.d/hdfs.conf
+++ b/health/health.d/hdfs.conf
@@ -23,7 +23,7 @@ template: hdfs_capacity_usage
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (80) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: used capacity
+ info: summary datanodes space capacity utilization
to: sysadmin
@@ -36,7 +36,7 @@ template: hdfs_missing_blocks
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: missing blocks
+ info: number of missing blocks
to: sysadmin
@@ -47,7 +47,7 @@ template: hdfs_stale_nodes
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: stale data nodes
+ info: number of datanodes marked stale due to delayed heartbeat
to: sysadmin
@@ -58,7 +58,7 @@ template: hdfs_dead_nodes
every: 10s
crit: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: dead data nodes
+ info: number of datanodes which are currently dead
to: sysadmin
@@ -71,5 +71,5 @@ template: hdfs_num_failed_volumes
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
- info: failed volumes
+ info: number of failed volumes
to: sysadmin
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index 0ddf35ea..0158f63e 100644
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@@ -11,17 +11,17 @@ families: *
to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: web_service_up
+template: httpcheck_web_service_up
families: *
on: httpcheck.status
lookup: average -1m unaligned percentage of success
calc: ($this < 75) ? (0) : ($this)
every: 5s
units: up/down
- info: at least 75% verified responses during last 60 seconds, ideal for badges
+ info: average ratio of successful HTTP requests over the last minute (at least 75%)
to: silent
-template: web_service_bad_content
+template: httpcheck_web_service_bad_content
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of bad_content
@@ -30,11 +30,11 @@ families: *
warn: $this >= 10 AND $this < 40
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
- info: average of unexpected http response content during the last 5 minutes
+ info: average ratio of HTTP responses with unexpected content over the last 5 minutes
options: no-clear-notification
to: webmaster
-template: web_service_bad_status
+template: httpcheck_web_service_bad_status
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of bad_status
@@ -43,57 +43,57 @@ families: *
warn: $this >= 10 AND $this < 40
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
- info: average of unexpected http status during the last 5 minutes
+ info: average ratio of HTTP responses with unexpected status over the last 5 minutes
options: no-clear-notification
to: webmaster
-template: web_service_timeouts
+template: httpcheck_web_service_timeouts
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of timeout
every: 10s
units: %
- info: average of timeouts during the last 5 minutes
+ info: average ratio of HTTP request timeouts over the last 5 minutes
-template: no_web_service_connections
+template: httpcheck_no_web_service_connections
families: *
on: httpcheck.status
lookup: average -5m unaligned percentage of no_connection
every: 10s
units: %
- info: average of failed requests during the last 5 minutes
+ info: average ratio of failed requests during the last 5 minutes
# combined timeout & no connection alarm
-template: web_service_unreachable
+template: httpcheck_web_service_unreachable
families: *
on: httpcheck.status
- calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
+ calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
units: %
every: 10s
- warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
- crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
+ warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
+ crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
delay: down 5m multiplier 1.5 max 1h
- info: average of failed requests either due to timeouts or no connection during the last 5 minutes
+ info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes
options: no-clear-notification
to: webmaster
-template: 1h_web_service_response_time
+template: httpcheck_1h_web_service_response_time
families: *
on: httpcheck.responsetime
lookup: average -1h unaligned of time
every: 30s
units: ms
- info: average response time over the last hour
+ info: average HTTP response time over the last hour
-template: web_service_slow
+template: httpcheck_web_service_slow
families: *
on: httpcheck.responsetime
lookup: average -3m unaligned of time
units: ms
every: 10s
- warn: ($this > ($1h_web_service_response_time * 2) )
- crit: ($this > ($1h_web_service_response_time * 3) )
- info: average response time over the last 3 minutes, compared to the average over the last hour
+ warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
+ crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
delay: down 5m multiplier 1.5 max 1h
+ info: average HTTP response time over the last 3 minutes, compared to the average over the last hour
options: no-clear-notification
to: webmaster
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 59a5c8ed..fa0196ef 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -1,4 +1,4 @@
-template: disk_latency
+template: ioping_disk_latency
families: *
on: ioping.latency
lookup: average -10s unaligned of average
@@ -8,6 +8,6 @@ families: *
red: 1000
warn: $this > $green OR $max > $red
crit: $this > $red
- info: average round trip delay during the last 10 seconds
delay: down 30m multiplier 1.5 max 2h
+ info: average I/O latency over the last 10 seconds
to: sysadmin
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
index 989d6e91..f4a0f56d 100644
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -11,7 +11,7 @@
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
- info: the percentage of IPC semaphores used
+ info: IPC semaphore utilization
to: sysadmin
alarm: semaphore_arrays_used
@@ -24,5 +24,5 @@
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
- info: the percentage of IPC semaphore arrays used
+ info: IPC semaphore arrays utilization
to: sysadmin
diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf
index 3f77572d..fd53c2c4 100644
--- a/health/health.d/ipfs.conf
+++ b/health/health.d/ipfs.conf
@@ -7,5 +7,5 @@ template: ipfs_datastore_usage
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: ipfs Datastore close to running out of space
+ info: IPFS datastore utilization
to: sysadmin
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index c2558196..563d7a7e 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -6,7 +6,7 @@
warn: $this > 0
crit: $critical > 0
delay: up 5m down 15m multiplier 1.5 max 1h
- info: the number IPMI sensors in non-nominal state
+ info: number of IPMI sensors in non-nominal state
to: sysadmin
alarm: ipmi_events
@@ -16,5 +16,5 @@
every: 10s
warn: $this > 0
delay: up 5m down 15m multiplier 1.5 max 1h
- info: the number of events in the IPMI System Event Log (SEL)
+ info: number of events in the IPMI System Event Log (SEL)
to: sysadmin
diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf
index 8054656f..d1f93969 100644
--- a/health/health.d/isc_dhcpd.conf
+++ b/health/health.d/isc_dhcpd.conf
@@ -1,10 +1,10 @@
- template: isc_dhcpd_leases_size
- on: isc_dhcpd.leases_total
- units: KB
- every: 60
- calc: $leases_size
- warn: $this > 3072
- crit: $this > 6144
- delay: up 2m down 5m
- info: dhcpd.leases file too big! Module can slow down your server.
- to: sysadmin
+# template: isc_dhcpd_leases_size
+# on: isc_dhcpd.leases_total
+# units: KB
+# every: 60
+# calc: $leases_size
+# warn: $this > 3072
+# crit: $this > 6144
+# delay: up 2m down 5m
+# info: dhcpd.leases file too big! Module can slow down your server.
+# to: sysadmin
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf
index d2ef24b5..5eda59b2 100644
--- a/health/health.d/kubelet.conf
+++ b/health/health.d/kubelet.conf
@@ -4,26 +4,26 @@
# True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
- template: node_config_error
+ template: kubelet_node_config_error
on: k8s_kubelet.kubelet_node_config_error
calc: $kubelet_node_config_error
units: bool
every: 10s
warn: $this == 1
delay: down 1m multiplier 1.5 max 2h
- info: the node is experiencing a configuration-related error
+ info: the node is experiencing a configuration-related error (0: false, 1: true)
to: sysadmin
# Failed Token() requests to the alternate token source
- template: token_requests
+ template: kubelet_token_requests
lookup: sum -10s of token_fail_count
on: k8s_kubelet.kubelet_token_requests
units: failed requests
every: 10s
warn: $this > 0
delay: down 1m multiplier 1.5 max 2h
- info: failed token requests to alternate token source
+ info: number of failed Token() requests to the alternate token source
to: sysadmin
# Docker and runtime operation errors
@@ -35,7 +35,7 @@
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (20))
delay: up 30s down 1m multiplier 1.5 max 2h
- info: operations error
+ info: number of Docker or runtime operation errors
to: sysadmin
# -----------------------------------------------------------------------------
@@ -53,63 +53,66 @@
# quantile 0.5
-template: 1m_kubelet_pleg_relist_latency_quantile_05
+template: kubelet_1m_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
units: microseconds
every: 10s
- info: the average value of pleg relisting latency during the last minute (quantile 0.5)
+ info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
-template: 10s_kubelet_pleg_relist_latency_quantile_05
+template: kubelet_10s_pleg_relist_latency_quantile_05
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
- calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_05 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_05))
+ calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(100):(200))
crit: $this > (($status >= $WARNING)?(200):(400))
delay: down 1m multiplier 1.5 max 2h
- info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.5)
+ info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+ compared to the last minute (quantile 0.5)
to: sysadmin
# quantile 0.9
-template: 1m_kubelet_pleg_relist_latency_quantile_09
+template: kubelet_1m_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
units: microseconds
every: 10s
- info: the average value of pleg relisting latency during the last minute (quantile 0.9)
+ info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
-template: 10s_kubelet_pleg_relist_latency_quantile_09
+template: kubelet_10s_pleg_relist_latency_quantile_09
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
- calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_09 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_09))
+ calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(400))
crit: $this > (($status >= $WARNING)?(400):(800))
delay: down 1m multiplier 1.5 max 2h
- info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.9)
+ info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+ compared to the last minute (quantile 0.9)
to: sysadmin
# quantile 0.99
-template: 1m_kubelet_pleg_relist_latency_quantile_099
+template: kubelet_1m_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
units: microseconds
every: 10s
- info: the average value of pleg relisting latency during the last minute (quantile 0.99)
+ info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
-template: 10s_kubelet_pleg_relist_latency_quantile_099
+template: kubelet_10s_pleg_relist_latency_quantile_099
on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
- calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_099 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_099))
+ calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(400):(800))
crit: $this > (($status >= $WARNING)?(800):(1200))
delay: down 1m multiplier 1.5 max 2h
- info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.99)
+ info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+ compared to the last minute (quantile 0.99)
to: sysadmin
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index 38727be2..a27ea072 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -8,5 +8,5 @@ template: linux_power_supply_capacity
warn: $this < 10
crit: $this < 5
delay: up 30s down 5m multiplier 1.2 max 1h
- info: the percentage remaining capacity of the power supply
+ info: percentage of remaining power supply capacity
to: sysadmin
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
index ee0c54b8..ffaea172 100644
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@@ -4,18 +4,19 @@
# Calculate the base trigger point for the load average alarms.
# This is the maximum number of CPU's in the system over the past 1
# minute, with a special case for a single CPU of setting the trigger at 2.
- alarm: load_trigger
+ alarm: load_cpu_number
on: system.load
os: linux
hosts: *
calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
units: cpus
every: 1m
- info: trigger point for load average alarms
+ info: number of active CPU cores in the system
# Send alarms if the load average is unusually high.
# These intentionally _do not_ calculate the average over the sampled
# time period because the values being checked already are averages.
+
alarm: load_average_15
on: system.load
os: linux
@@ -23,10 +24,9 @@
lookup: max -1m unaligned of load15
units: load
every: 1m
- warn: $this > (($status >= $WARNING) ? (1.75 * $load_trigger) : (2 * $load_trigger))
- crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+ warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
delay: down 15m multiplier 1.5 max 1h
- info: fifteen-minute load average
+ info: system fifteen-minute load average
to: sysadmin
alarm: load_average_5
@@ -36,10 +36,9 @@
lookup: max -1m unaligned of load5
units: load
every: 1m
- warn: $this > (($status >= $WARNING) ? (3.5 * $load_trigger) : (4 * $load_trigger))
- crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger))
+ warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
delay: down 15m multiplier 1.5 max 1h
- info: five-minute load average
+ info: system five-minute load average
to: sysadmin
alarm: load_average_1
@@ -49,8 +48,7 @@
lookup: max -1m unaligned of load1
units: load
every: 1m
- warn: $this > (($status >= $WARNING) ? (7 * $load_trigger) : (8 * $load_trigger))
- crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger))
+ warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
delay: down 15m multiplier 1.5 max 1h
- info: one-minute load average
+ info: system one-minute load average
to: sysadmin
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
index 2f906e18..ca2d0d9f 100644
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -14,7 +14,8 @@ template: mdstat_disks
every: 10s
calc: $down
crit: $this > 0
- info: Array is degraded!
+ info: number of devices in the down state. \
+ Any number > 0 indicates that the array is degraded.
to: sysadmin
template: mdstat_mismatch_cnt
@@ -24,7 +25,7 @@ template: mdstat_mismatch_cnt
every: 60s
warn: $this > 1024
delay: up 30m
- info: Mismatch count!
+ info: number of unsynchronized blocks
to: sysadmin
template: mdstat_nonredundant_last_collected
@@ -35,4 +36,4 @@ template: mdstat_nonredundant_last_collected
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
info: number of seconds since the last successful data collection
- to: sysadmin \ No newline at end of file
+ to: sysadmin
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 6e81a2a0..f861765d 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,48 +1,56 @@
-template: adapter_state
+
+## Adapters (controllers)
+
+template: megacli_adapter_state
on: megacli.adapter_degraded
- units: is degraded
- lookup: sum -10s
+ lookup: max -10s foreach *
+ units: boolean
every: 10s
crit: $this > 0
- info: adapter state
+ delay: down 5m multiplier 2 max 10m
+ info: adapter is in the degraded state (0: false, 1: true)
+ to: sysadmin
+
+## Physical Disks
+
+template: megacli_pd_predictive_failures
+ on: megacli.pd_predictive_failure
+ lookup: sum -10s foreach *
+ units: predictive failures
+ every: 10s
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 2 max 10m
+ info: number of physical drive predictive failures
+ to: sysadmin
+
+template: megacli_pd_media_errors
+ on: megacli.pd_media_error
+ lookup: sum -10s foreach *
+ units: media errors
+ every: 10s
+ warn: $this > 0
+ delay: up 1m down 5m multiplier 2 max 10m
+ info: number of physical drive media errors
to: sysadmin
-template: bbu_relative_charge
+## Battery Backup Units (BBU)
+
+template: megacli_bbu_relative_charge
on: megacli.bbu_relative_charge
- units: percent
lookup: average -10s
+ units: percent
every: 10s
warn: $this <= (($status >= $WARNING) ? (85) : (80))
crit: $this <= (($status == $CRITICAL) ? (50) : (40))
- info: BBU relative state of charge
+ info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
to: sysadmin
-template: bbu_cycle_count
+template: megacli_bbu_cycle_count
on: megacli.bbu_cycle_count
- units: cycle count
lookup: average -10s
+ units: cycles
every: 10s
warn: $this >= 100
crit: $this >= 500
- info: BBU cycle count
- to: sysadmin
-
-template: pd_media_errors
- on: megacli.pd_media_error
- units: media errors
- lookup: sum -10s
- every: 10s
- warn: $this > 0
- delay: down 1m multiplier 2 max 10m
- info: physical drive media errors
- to: sysadmin
-
-template: pd_predictive_failures
- on: megacli.pd_predictive_failure
- units: predictive failures
- lookup: sum -10s
- every: 10s
- warn: $this > 0
- delay: down 1m multiplier 2 max 10m
- info: physical drive predictive failures
+ info: average battery backup unit (BBU) charge cycles count over the last 10 seconds
to: sysadmin
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
index d248ef57..e610f181 100644
--- a/health/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
@@ -23,30 +23,31 @@ template: memcached_cache_memory_usage
warn: $this > (($status >= $WARNING) ? (70) : (80))
crit: $this > (($status == $CRITICAL) ? (80) : (90))
delay: up 0 down 15m multiplier 1.5 max 1h
- info: current cache memory usage
+ info: cache memory utilization
to: dba
# find the rate memcached cache is filling
-template: cache_fill_rate
+template: memcached_cache_fill_rate
on: memcached.cache
lookup: min -10m at -50m unaligned of available
calc: ($this - $available) / (($now - $after) / 3600)
units: KB/hour
every: 1m
- info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
+ info: average rate the cache fills up (positive), or frees up (negative) space over the last hour
# find the hours remaining until memcached cache is full
-template: out_of_cache_space_time
+template: memcached_out_of_cache_space_time
on: memcached.cache
- calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf)
+ calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
units: hours
every: 10s
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
delay: down 15m multiplier 1.5 max 1h
- info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
+ info: estimated time the cache will run out of space \
+ if the system continues to add data at the same rate as the past hour
to: dba
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
index 4a0e6e52..e95c0aad 100644
--- a/health/health.d/memory.conf
+++ b/health/health.d/memory.conf
@@ -10,7 +10,7 @@
every: 1m
warn: $this > 0
delay: down 1h multiplier 1.5 max 1h
- info: number of ECC correctable errors during the last hour
+ info: number of ECC correctable errors in the last 10 minutes
to: sysadmin
alarm: 1hour_ecc_memory_uncorrectable
@@ -22,7 +22,7 @@
every: 1m
crit: $this > 0
delay: down 1h multiplier 1.5 max 1h
- info: number of ECC uncorrectable errors during the last hour
+ info: number of ECC uncorrectable errors in the last 10 minutes
to: sysadmin
alarm: 1hour_memory_hw_corrupted
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 62cef5a2..7451b3f4 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -24,7 +24,7 @@ template: mysql_10s_slow_queries
warn: $this > (($status >= $WARNING) ? (5) : (10))
crit: $this > (($status == $CRITICAL) ? (10) : (20))
delay: down 5m multiplier 1.5 max 1h
- info: number of mysql slow queries over the last 10 seconds
+ info: number of slow queries in the last 10 seconds
to: dba
@@ -36,7 +36,7 @@ template: mysql_10s_table_locks_immediate
lookup: sum -10s absolute of immediate
units: immediate locks
every: 10s
- info: number of table immediate locks over the last 10 seconds
+ info: number of table immediate locks in the last 10 seconds
to: dba
template: mysql_10s_table_locks_waited
@@ -44,7 +44,7 @@ template: mysql_10s_table_locks_waited
lookup: sum -10s absolute of waited
units: waited locks
every: 10s
- info: number of table waited locks over the last 10 seconds
+ info: number of table waited locks in the last 10 seconds
to: dba
template: mysql_10s_waited_locks_ratio
@@ -55,7 +55,7 @@ template: mysql_10s_waited_locks_ratio
warn: $this > (($status >= $WARNING) ? (10) : (25))
crit: $this > (($status == $CRITICAL) ? (25) : (50))
delay: down 30m multiplier 1.5 max 1h
- info: the ratio of mysql waited table locks, for the last 10 seconds
+ info: ratio of waited table locks over the last 10 seconds
to: dba
@@ -70,7 +70,7 @@ template: mysql_connections
warn: $this > (($status >= $WARNING) ? (60) : (70))
crit: $this > (($status == $CRITICAL) ? (80) : (90))
delay: down 15m multiplier 1.5 max 1h
- info: the ratio of current active connections vs the maximum possible number of connections
+ info: client connections utilization
to: dba
@@ -84,7 +84,7 @@ template: mysql_replication
every: 10s
crit: $this == 0
delay: down 5m multiplier 1.5 max 1h
- info: checks if mysql replication has stopped
+ info: replication status (0: stopped, 1: working)
to: dba
template: mysql_replication_lag
@@ -95,7 +95,8 @@ template: mysql_replication_lag
warn: $this > (($status >= $WARNING) ? (5) : (10))
crit: $this > (($status == $CRITICAL) ? (10) : (30))
delay: down 15m multiplier 1.5 max 1h
- info: the number of seconds mysql replication is behind this master
+ info: difference between the timestamp of the latest transaction processed by the SQL thread and \
+ the timestamp of the same transaction when it was processed on the master
to: dba
@@ -107,7 +108,7 @@ template: mysql_galera_cluster_size_max_2m
lookup: max -2m absolute
units: nodes
every: 10s
- info: max cluster size 2 minute
+ info: maximum galera cluster size in the last 2 minutes
to: dba
template: mysql_galera_cluster_size
@@ -118,7 +119,7 @@ template: mysql_galera_cluster_size
warn: $this > $mysql_galera_cluster_size_max_2m
crit: $this < $mysql_galera_cluster_size_max_2m
delay: up 20s down 5m multiplier 1.5 max 1h
- info: cluster size has changed
+ info: current galera cluster size, compared to the maximum size in the last 2 minutes
to: dba
# galera node state
@@ -130,7 +131,8 @@ template: mysql_galera_cluster_state
warn: $this < 4
crit: $this < 2
delay: up 30s down 5m multiplier 1.5 max 1h
- info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced)
+ info: galera node state \
+ (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced)
to: dba
@@ -142,5 +144,7 @@ template: mysql_galera_cluster_status
every: 10s
crit: $mysql_galera_cluster_state != nan AND $this != 0
delay: up 30s down 5m multiplier 1.5 max 1h
- info: node and cluster status (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected)
+ info: galera node cluster component status \
+ (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
+ Any other value than primary indicates that the node is part of a nonoperational component.
to: dba
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 261290e5..33202421 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -12,7 +12,7 @@
calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan )
units: Mbit
every: 10s
- info: The current speed of the physical network interface
+ info: network interface current speed
template: 1m_received_traffic_overflow
on: net.net
@@ -20,13 +20,12 @@
hosts: *
families: *
lookup: average -1m unaligned absolute of received
- calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
+ calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan )
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (90))
- delay: down 1m multiplier 1.5 max 1h
- info: interface received bandwidth usage over net device speed max
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ delay: up 1m down 1m multiplier 1.5 max 1h
+ info: average inbound utilization for the network interface over the last minute
to: sysadmin
template: 1m_sent_traffic_overflow
@@ -35,13 +34,12 @@
hosts: *
families: *
lookup: average -1m unaligned absolute of sent
- calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
+ calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan )
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (90))
- delay: down 1m multiplier 1.5 max 1h
- info: interface sent bandwidth usage over net device speed max
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ delay: up 1m down 1m multiplier 1.5 max 1h
+ info: average outbound utilization for the network interface over the last minute
to: sysadmin
# -----------------------------------------------------------------------------
@@ -58,56 +56,76 @@ template: inbound_packets_dropped
on: net.drops
os: linux
hosts: *
-families: *
+families: !net* *
lookup: sum -10m unaligned absolute of inbound
units: packets
every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- info: interface inbound dropped packets in the last 10 minutes
- to: sysadmin
+ info: number of inbound dropped packets for the network interface in the last 10 minutes
template: outbound_packets_dropped
on: net.drops
os: linux
hosts: *
-families: *
+families: !net* *
lookup: sum -10m unaligned absolute of outbound
units: packets
every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- info: interface outbound dropped packets in the last 10 minutes
- to: sysadmin
+ info: number of outbound dropped packets for the network interface in the last 10 minutes
template: inbound_packets_dropped_ratio
on: net.packets
os: linux
hosts: *
-families: *
+families: !net* !wl* *
lookup: sum -10m unaligned absolute of received
- calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
- info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: ratio of inbound dropped packets for the network interface over the last 10 minutes
to: sysadmin
template: outbound_packets_dropped_ratio
on: net.packets
os: linux
hosts: *
-families: *
+families: !net* !wl* *
lookup: sum -10m unaligned absolute of sent
- calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
+ calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
- info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: ratio of outbound dropped packets for the network interface over the last 10 minutes
+ to: sysadmin
+
+template: wifi_inbound_packets_dropped_ratio
+ on: net.packets
+ os: linux
+ hosts: *
+families: wl*
+ lookup: sum -10m unaligned absolute of received
+ calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 10
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: ratio of inbound dropped packets for the network interface over the last 10 minutes
+ to: sysadmin
+
+template: wifi_outbound_packets_dropped_ratio
+ on: net.packets
+ os: linux
+ hosts: *
+families: wl*
+ lookup: sum -10m unaligned absolute of sent
+ calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 10
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: ratio of outbound dropped packets for the network interface over the last 10 minutes
to: sysadmin
# -----------------------------------------------------------------------------
@@ -123,7 +141,7 @@ families: *
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
- info: interface inbound errors in the last 10 minutes
+ info: number of inbound errors for the network interface in the last 10 minutes
to: sysadmin
template: interface_outbound_errors
@@ -136,7 +154,7 @@ families: *
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
- info: interface outbound errors in the last 10 minutes
+ info: number of outbound errors for the network interface in the last 10 minutes
to: sysadmin
# -----------------------------------------------------------------------------
@@ -157,7 +175,7 @@ families: *
every: 1m
warn: $this > 0
delay: down 1h multiplier 1.5 max 2h
- info: interface fifo errors in the last 10 minutes
+ info: number of FIFO errors for the network interface in the last 10 minutes
to: sysadmin
# -----------------------------------------------------------------------------
@@ -177,7 +195,7 @@ families: *
lookup: average -1m unaligned of received
units: packets
every: 10s
- info: the average number of packets received during the last minute
+ info: average number of packets received by the network interface over the last minute
template: 10s_received_packets_storm
on: net.packets
@@ -189,7 +207,8 @@ families: *
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status >= $WARNING)?(5000):(6000))
+ crit: $this > (($status == $CRITICAL)?(5000):(6000))
options: no-clear-notification
- info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+ info: ratio of average number of received packets for the network interface over the last 10 seconds, \
+ compared to the rate over the last minute
to: sysadmin
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
index 1d07752c..f827d8e4 100644
--- a/health/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@@ -1,19 +1,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
- alarm: netfilter_last_collected_secs
- on: netfilter.conntrack_sockets
- os: linux
- hosts: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
alarm: netfilter_conntrack_full
on: netfilter.conntrack_sockets
os: linux
@@ -22,8 +9,8 @@
calc: $this * 100 / $netfilter_conntrack_max
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (95))
delay: down 5m multiplier 1.5 max 1h
- info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
+ info: netfilter connection tracker table size utilization
to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index b255d35f..f450b712 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -20,9 +20,9 @@ template: pihole_blocked_queries
units: %
calc: $blocked
warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
- crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+ crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
delay: up 2m down 5m
- info: percentage of blocked dns queries for the last 24 hour
+ info: percentage of blocked dns queries over the last 24 hours
to: sysadmin
@@ -36,7 +36,7 @@ template: pihole_blocklist_last_update
calc: $ago
warn: $this > 60 * 60 * 24 * 8
crit: $this > 60 * 60 * 24 * 8 * 2
- info: blocklist last update time
+ info: gravity.list (blocklist) file last update time
to: sysadmin
# Gravity file check (gravity.list).
@@ -48,7 +48,7 @@ template: pihole_blocklist_gravity_file
calc: $file_exists
crit: $this != 1
delay: up 2m down 5m
- info: gravity file existence
+ info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
to: sysadmin
# Pi-hole's ability to block unwanted domains.
@@ -61,5 +61,5 @@ template: pihole_status
calc: $enabled
warn: $this != 1
delay: up 2m down 5m
- info: unwanted domains blocking status
+ info: unwanted domains blocking status (0: disabled, 1: enabled)
to: sysadmin
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index 696333fd..29dcebbc 100644
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@@ -11,17 +11,17 @@ families: *
to: sysadmin
# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: service_reachable
+template: portcheck_service_reachable
families: *
on: portcheck.status
lookup: average -1m unaligned percentage of success
calc: ($this < 75) ? (0) : ($this)
every: 5s
units: up/down
- info: at least 75% successful connections during last 60 seconds, ideal for badges
+ info: average ratio of successful connections over the last minute (at least 75%)
to: silent
-template: connection_timeouts
+template: portcheck_connection_timeouts
families: *
on: portcheck.status
lookup: average -5m unaligned percentage of timeout
@@ -30,10 +30,10 @@ families: *
warn: $this >= 10 AND $this < 40
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
- info: average of timeouts during the last 5 minutes
+ info: average ratio of timeouts over the last 5 minutes
to: sysadmin
-template: connection_fails
+template: portcheck_connection_fails
families: *
on: portcheck.status
lookup: average -5m unaligned percentage of no_connection,failed
@@ -42,5 +42,5 @@ families: *
warn: $this >= 10 AND $this < 40
crit: $this >= 40
delay: down 5m multiplier 1.5 max 1h
- info: average of failed connections during the last 5 minutes
+ info: average ratio of failed connections over the last 5 minutes
to: sysadmin
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
index 293f1aa0..b464d8f6 100644
--- a/health/health.d/processes.conf
+++ b/health/health.d/processes.conf
@@ -6,8 +6,8 @@
calc: $active * 100 / $pidmax
units: %
every: 5s
- warn: $this > (($status >= $WARNING) ? (75) : (80))
- crit: $this > (($status == $CRITICAL) ? (85) : (90))
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (95))
delay: down 5m multiplier 1.5 max 1h
- info: the percentage of active processes
+ info: system process IDs (PID) space utilization
to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 0a71dac8..2daecc48 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -7,7 +7,8 @@
hosts: *
calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
every: 10s
- info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
+ info: amount of memory reported as used, \
+ but it is actually capable of resizing itself based on the system needs (eg. ZFS ARC)
alarm: ram_in_use
on: system.ram
@@ -20,7 +21,7 @@
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: system RAM used
+ info: system memory utilization
to: sysadmin
alarm: ram_available
@@ -33,7 +34,7 @@
warn: $this < (($status >= $WARNING) ? (15) : (10))
crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
- info: estimated amount of RAM available for userspace processes, without causing swapping
+ info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
## FreeBSD
@@ -47,7 +48,7 @@
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: system RAM usage
+ info: system memory utilization
to: sysadmin
alarm: ram_available
@@ -60,5 +61,5 @@
warn: $this < (($status >= $WARNING) ? (15) : (10))
crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
- info: estimated amount of RAM available for userspace processes, without causing swapping
+ info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index c08a884a..43f98a1d 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -18,7 +18,7 @@ families: *
every: 10s
crit: $rdb_last_bgsave_status != 0
units: ok/failed
- info: states if redis bgsave is working
+ info: status of the last RDB save operation (0: ok, 1: error)
delay: down 5m multiplier 1.5 max 1h
to: dba
@@ -29,6 +29,6 @@ families: *
warn: $rdb_bgsave_in_progress > 600
crit: $rdb_bgsave_in_progress > 1200
units: seconds
- info: the time redis needs to save its database
+ info: duration of the on-going RDB save operation
delay: down 5m multiplier 1.5 max 1h
to: dba
diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf
index 2344b60e..51b1deb4 100644
--- a/health/health.d/retroshare.conf
+++ b/health/health.d/retroshare.conf
@@ -21,5 +21,5 @@ template: retroshare_dht_working
warn: $this < (($status >= $WARNING) ? (120) : (100))
crit: $this < (($status == $CRITICAL) ? (10) : (1))
delay: up 0 down 15m multiplier 1.5 max 1h
- info: Checks if the DHT has enough peers to operate
+ info: number of DHT peers
to: sysadmin
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
index 74530277..d6346026 100644
--- a/health/health.d/riakkv.conf
+++ b/health/health.d/riakkv.conf
@@ -1,5 +1,5 @@
# Ensure that Riak is running. template: riak_last_collected_secs
-template: riak_last_collected_secs
+template: riakkv_last_collected_secs
on: riak.kv.throughput
calc: $now - $last_collected_t
units: seconds ago
@@ -11,7 +11,7 @@ template: riak_last_collected_secs
to: dba
# Warn if a list keys operation is running.
-template: riak_list_keys_active
+template: riakkv_list_keys_active
on: riak.core.fsm_active
calc: $list_fsm_active
units: state machines
@@ -23,44 +23,50 @@ template: riak_list_keys_active
## Timing healthchecks
# KV GET
-template: 1h_kv_get_mean_latency
+template: riakkv_1h_kv_get_mean_latency
on: riak.kv.latency.get
calc: $node_get_fsm_time_mean
lookup: average -1h unaligned of time
every: 30s
units: ms
- info: mean average KV GET latency over the last hour
+ info: average time between reception of client GET request and \
+ subsequent response to client over the last hour
-template: riak_kv_get_slow
+template: riakkv_kv_get_slow
on: riak.kv.latency.get
calc: $mean
lookup: average -3m unaligned of time
units: ms
every: 10s
- warn: ($this > ($1h_kv_get_mean_latency * 2) )
- crit: ($this > ($1h_kv_get_mean_latency * 3) )
- info: average KV GET time over the last 3 minutes, compared to the average over the last hour
+ warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
+ crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
+ info: average time between reception of client GET request and \
+ subsequent response to the client over the last 3 minutes, \
+ compared to the average over the last hour
delay: down 5m multiplier 1.5 max 1h
to: dba
# KV PUT
-template: 1h_kv_put_mean_latency
+template: riakkv_1h_kv_put_mean_latency
on: riak.kv.latency.put
calc: $node_put_fsm_time_mean
lookup: average -1h unaligned of time
every: 30s
units: ms
- info: mean average KV PUT latency over the last hour
+ info: average time between reception of client PUT request and \
+ subsequent response to the client over the last hour
-template: riak_kv_put_slow
+template: riakkv_kv_put_slow
on: riak.kv.latency.put
calc: $mean
lookup: average -3m unaligned of time
units: ms
every: 10s
- warn: ($this > ($1h_kv_put_mean_latency * 2) )
- crit: ($this > ($1h_kv_put_mean_latency * 3) )
- info: average KV PUT time over the last 3 minutes, compared to the average over the last hour
+ warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
+ crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
+ info: average time between reception of client PUT request and \
+ subsequent response to the client over the last 3 minutes, \
+ compared to the average over the last hour
delay: down 5m multiplier 1.5 max 1h
to: dba
@@ -69,12 +75,12 @@ template: riak_kv_put_slow
# Default Erlang VM process limit: 262144
# On systems observed, this is < 2000, but may grow depending on load.
-template: riak_vm_high_process_count
+template: riakkv_vm_high_process_count
on: riak.vm
calc: $sys_process_count
units: processes
every: 10s
warn: $this > 10000
crit: $this > 100000
- info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144)
+ info: number of processes running in the Erlang VM
to: dba
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
index 1a3088a2..ab9771bb 100644
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@@ -22,7 +22,7 @@ template: scaleio_storage_pool_capacity_utilization
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: Storage Pool capacity utilization
+ info: storage pool capacity utilization
to: sysadmin
@@ -34,5 +34,5 @@ template: scaleio_sdc_mdm_connection_state
every: 10s
warn: $this != 1
delay: up 30s down 5m multiplier 1.5 max 1h
- info: Sdc connection to MDM state
+ info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected)
to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index f835f2ae..f761e4a0 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -12,7 +12,8 @@
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+ info: average number of dropped packets in the last minute \
+ due to exceeded net.core.netdev_max_backlog
to: sysadmin
alarm: 1min_netdev_budget_ran_outs
@@ -24,7 +25,9 @@
every: 10s
warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+ info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
+ net.core.netdev_budget_usecs with work remaining over the last minute \
+ (this can be a cause for dropped packets)
to: silent
alarm: 10min_netisr_backlog_exceeded
@@ -34,7 +37,9 @@
lookup: average -1m unaligned absolute of qdrops
units: packets
every: 10s
- warn: $this > (($status >+ $WARNING) ? (0) : (10))
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+ info: average number of drops in the last minute \
+ due to exceeded sysctl net.route.netisr_maxqlen \
+ (this can be a cause for dropped packets)
to: sysadmin
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
index f920b080..66c36c13 100644
--- a/health/health.d/swap.conf
+++ b/health/health.d/swap.conf
@@ -10,23 +10,9 @@
calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
units: % of RAM
every: 1m
- warn: $this > (($status >= $WARNING) ? (10) : (20))
- crit: $this > (($status == $CRITICAL) ? (20) : (30))
- delay: up 0 down 15m multiplier 1.5 max 1h
- info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
- to: sysadmin
-
- alarm: ram_in_swap
- on: system.swap
- os: linux
- hosts: *
- calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
- units: % of RAM
- every: 10s
- warn: $this > (($status >= $WARNING) ? (15) : (20))
- crit: $this > (($status == $CRITICAL) ? (40) : (50))
- delay: up 30s down 15m multiplier 1.5 max 1h
- info: the swap memory used, as a percentage of the system RAM
+ warn: $this > (($status >= $WARNING) ? (20) : (30))
+ delay: down 15m multiplier 1.5 max 1h
+ info: percentage of the system RAM swapped in the last 30 minutes
to: sysadmin
alarm: used_swap
@@ -39,5 +25,5 @@
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 30s down 15m multiplier 1.5 max 1h
- info: the percentage of swap memory used
+ info: swap memory utilization
to: sysadmin
diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf
new file mode 100644
index 00000000..417624ad
--- /dev/null
+++ b/health/health.d/synchronization.conf
@@ -0,0 +1,12 @@
+ alarm: sync_freq
+ on: mem.sync
+ lookup: sum -1m of sync
+ units: calls
+ plugin: ebpf.plugin
+ every: 1m
+ warn: $this > 6
+ delay: up 1m down 10m multiplier 1.5 max 1h
+ info: number of sync() system calls. \
+ Every call causes all pending modifications to filesystem metadata and \
+ cached file data to be written to the underlying filesystems.
+ to: sysadmin
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
index 7aa9a980..38b1062d 100644
--- a/health/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
@@ -13,7 +13,7 @@
units: %
every: 10s
warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
- crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
+ crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the percentage of IPv4 TCP connections over the max allowed
+ info: IPv4 TCP connections utilization
to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 3b307257..dad462eb 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -28,7 +28,7 @@
warn: $this > 1
crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the average number of times the TCP accept queue of the kernel overflown, during the last minute
+ info: average number of overflows in the TCP accept queue over the last minute
to: sysadmin
# THIS IS TOO GENERIC
@@ -43,7 +43,7 @@
warn: $this > 1
crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+ info: average number of dropped packets in the TCP accept queue over the last minute
to: sysadmin
@@ -65,7 +65,8 @@
warn: $this > 1
crit: $this > (($status == $CRITICAL) ? (0) : (5))
delay: up 10 down 5m multiplier 1.5 max 1h
- info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute
+ info: average number of SYN requests dropped due to a full TCP SYN queue over the last minute \
+ (SYN cookies were not enabled)
to: sysadmin
alarm: 1m_tcp_syn_queue_cookies
@@ -78,6 +79,6 @@
warn: $this > 1
crit: $this > (($status == $CRITICAL) ? (0) : (5))
delay: up 10 down 5m multiplier 1.5 max 1h
- info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute
+ info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
to: sysadmin
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 6927d576..29d4ad68 100644
--- a/health/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
@@ -14,7 +14,7 @@
units: %
every: 10s
warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
- crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
+ crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the amount of TCP memory as a percentage of its max memory limit
+ info: TCP memory utilization
to: sysadmin
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index 280d6590..17ff7a95 100644
--- a/health/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
@@ -15,7 +15,7 @@
units: %
every: 10s
warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
- crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
+ crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
+ info: orphan IPv4 TCP sockets utilization
to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 36a550a5..af2a7525 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -2,21 +2,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
# -----------------------------------------------------------------------------
-
- alarm: ipv4_tcphandshake_last_collected_secs
- on: ipv4.tcphandshake
- os: linux freebsd
- hosts: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-# -----------------------------------------------------------------------------
# tcp resets this host sends
alarm: 1m_ipv4_tcp_resets_sent
@@ -26,7 +11,7 @@
lookup: average -1m at -10s unaligned absolute of OutRsts
units: tcp resets/s
every: 10s
- info: average TCP RESETS this host is sending, over the last minute
+ info: average number of sent TCP RESETS over the last minute
alarm: 10s_ipv4_tcp_resets_sent
on: ipv4.tcphandshake
@@ -38,7 +23,10 @@
warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20)))
delay: up 20s down 60m multiplier 1.2 max 2h
options: no-clear-notification
- info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
+ info: average number of sent TCP RESETS over the last 10 seconds. \
+ This can indicate a port scan, \
+ or that a service running on this host has crashed. \
+ Netdata will not send a clear notification for this alarm.
to: sysadmin
# -----------------------------------------------------------------------------
@@ -51,7 +39,7 @@
lookup: average -1m at -10s unaligned absolute of AttemptFails
units: tcp resets/s
every: 10s
- info: average TCP RESETS this host is sending, over the last minute
+ info: average number of received TCP RESETS over the last minute
alarm: 10s_ipv4_tcp_resets_received
on: ipv4.tcphandshake
@@ -63,5 +51,7 @@
warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
delay: up 20s down 60m multiplier 1.2 max 2h
options: no-clear-notification
- info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent)
+ info: average number of received TCP RESETS over the last 10 seconds. \
+ This can be an indication that a service this host needs has crashed. \
+ Netdata will not send a clear notification for this alarm.
to: sysadmin
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 1e47b5c8..4836d631 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -2,21 +2,6 @@
# you can disable an alarm notification by setting the 'to' line to: silent
# -----------------------------------------------------------------------------
-
- alarm: ipv4_udperrors_last_collected_secs
- on: ipv4.udperrors
- os: linux freebsd
- hosts: *
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: sysadmin
-
-# -----------------------------------------------------------------------------
# UDP receive buffer errors
alarm: 1m_ipv4_udp_receive_buffer_errors
@@ -26,10 +11,9 @@
lookup: average -1m unaligned absolute of RcvbufErrors
units: errors
every: 10s
- warn: $this > 1
- crit: $this > (($status == $CRITICAL) ? (0) : (10))
- info: average number of UDP receive buffer errors during the last minute
- delay: up 0 down 60m multiplier 1.2 max 2h
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ info: average number of UDP receive buffer errors over the last minute
+ delay: up 1m down 60m multiplier 1.2 max 2h
to: sysadmin
# -----------------------------------------------------------------------------
@@ -42,8 +26,7 @@
lookup: average -1m unaligned absolute of SndbufErrors
units: errors
every: 10s
- warn: $this > 1
- crit: $this > (($status == $CRITICAL) ? (0) : (10))
- info: number of UDP send buffer errors during the last minute
- delay: up 0 down 60m multiplier 1.2 max 2h
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ info: average number of UDP send buffer errors over the last minute
+ delay: up 1m down 60m multiplier 1.2 max 2h
to: sysadmin
diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf
index bdedc11a..567baf18 100644
--- a/health/health.d/unbound.conf
+++ b/health/health.d/unbound.conf
@@ -21,7 +21,7 @@ template: unbound_request_list_overwritten
every: 10s
warn: $this > 5
delay: up 10 down 5m multiplier 1.5 max 1h
- info: the number of overwritten queries in the request-list
+ info: number of overwritten queries in the request-list
to: sysadmin
template: unbound_request_list_dropped
@@ -31,5 +31,5 @@ template: unbound_request_list_dropped
every: 10s
warn: $this > 0
delay: up 10 down 5m multiplier 1.5 max 1h
- info: the number of dropped queries in the request-list
+ info: number of dropped queries in the request-list
to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
index 7bb98a9b..f4b03d4c 100644
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@@ -27,7 +27,8 @@ template: vcsa_system_health
warn: ($this == 1) || ($this == 2)
crit: $this == 3
delay: down 1m multiplier 1.5 max 1h
- info: overall system health status
+ info: overall system health status \
+ (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
to: sysadmin
# Components health:
@@ -45,7 +46,8 @@ template: vcsa_swap_health
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
- info: swap health status
+ info: swap health status \
+ (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
to: sysadmin
template: vcsa_storage_health
@@ -56,7 +58,8 @@ template: vcsa_storage_health
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
- info: storage health status
+ info: storage health status \
+ (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
to: sysadmin
template: vcsa_mem_health
@@ -67,7 +70,8 @@ template: vcsa_mem_health
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
- info: mem health status
+ info: memory health status \
+ (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
to: sysadmin
template: vcsa_load_health
@@ -78,7 +82,8 @@ template: vcsa_load_health
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
- info: load health status
+ info: load health status \
+ (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
to: sysadmin
template: vcsa_database_storage_health
@@ -89,7 +94,8 @@ template: vcsa_database_storage_health
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
- info: database storage health status
+ info: database storage health status \
+ (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
to: sysadmin
template: vcsa_applmgmt_health
@@ -100,7 +106,8 @@ template: vcsa_applmgmt_health
warn: $this == 1
crit: ($this == 2) || ($this == 3)
delay: down 1m multiplier 1.5 max 1h
- info: appl mgmt health status
+ info: applmgmt health status \
+ (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
to: sysadmin
@@ -118,5 +125,6 @@ template: vcsa_software_updates_health
warn: $this == 4
crit: $this == 3
delay: down 1m multiplier 1.5 max 1h
- info: software packages health status
+ info: software updates availability status \
+ (-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
to: sysadmin
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf
index 36bbaf82..9598dd39 100644
--- a/health/health.d/vernemq.conf
+++ b/health/health.d/vernemq.conf
@@ -18,10 +18,10 @@ template: vernemq_socket_errors
on: vernemq.socket_errors
lookup: sum -1m unaligned absolute of socket_error
units: errors
- every: 10s
- warn: $this > (($status == $WARNING) ? (0) : (5))
- delay: down 5m multiplier 1.5 max 2h
- info: socket errors in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 2m down 5m multiplier 1.5 max 2h
+ info: number of socket errors in the last minute
to: sysadmin
# Queues dropped/expired/unhandled PUBLISH messages
@@ -30,30 +30,30 @@ template: vernemq_queue_message_drop
on: vernemq.queue_undelivered_messages
lookup: sum -1m unaligned absolute of queue_message_drop
units: dropped messages
- every: 10s
- warn: $this > (($status == $WARNING) ? (0) : (5))
- delay: down 5m multiplier 1.5 max 2h
- info: dropped messaged due to full queues in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of dropped messages due to full queues in the last minute
to: sysadmin
template: vernemq_queue_message_expired
on: vernemq.queue_undelivered_messages
lookup: sum -1m unaligned absolute of queue_message_expired
units: expired messages
- every: 10s
- warn: $this > (($status == $WARNING) ? (0) : (15))
- delay: down 5m multiplier 1.5 max 2h
- info: messages which expired before delivery in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (15))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of messages which expired before delivery in the last minute
to: sysadmin
template: vernemq_queue_message_unhandled
on: vernemq.queue_undelivered_messages
lookup: sum -1m unaligned absolute of queue_message_unhandled
units: unhandled messages
- every: 10s
- warn: $this > (($status == $WARNING) ? (0) : (5))
- delay: down 5m multiplier 1.5 max 2h
- info: unhandled messages (connections with clean session=true) in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of unhandled messages (connections with clean session=true) in the last minute
to: sysadmin
# Erlang VM
@@ -66,19 +66,19 @@ template: vernemq_average_scheduler_utilization
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: average scheduler utilization for the last 10 minutes
+ info: average scheduler utilization over the last 10 minutes
to: sysadmin
# Cluster communication and netsplits
template: vernemq_cluster_dropped
on: vernemq.cluster_dropped
- lookup: average -1m unaligned
- units: KiB/s
- every: 10s
+ lookup: sum -1m unaligned
+ units: KiB
+ every: 1m
warn: $this > 0
- delay: down 5m multiplier 1.5 max 1h
- info: the amount of traffic dropped during communication with the cluster nodes in the last minute
+ delay: up 5m down 5m multiplier 1.5 max 1h
+ info: amount of traffic dropped during communication with the cluster nodes in the last minute
to: sysadmin
template: vernemq_netsplits
@@ -88,68 +88,41 @@ template: vernemq_netsplits
every: 10s
warn: $this > 0
delay: down 5m multiplier 1.5 max 2h
- info: detected netsplits in the last minute
+ info: number of detected netsplits (split brain situation) in the last minute
to: sysadmin
# Unsuccessful CONNACK
-template: vernemq_mqtt_connack_sent_reason_success
- on: vernemq.mqtt_connack_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v3/v5 CONNACK sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_connack_sent_reason_unsuccessful
on: vernemq.mqtt_connack_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_connack_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unsuccessful v3/v5 CONNACK sent in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute
to: sysadmin
# Not normal DISCONNECT
-template: vernemq_mqtt_disconnect_received_reason_normal_disconnect
- on: vernemq.mqtt_disconnect_received_reason
- lookup: sum -1m unaligned absolute match-names of normal_disconnect
- units: packets
- every: 10s
- info: normal v5 DISCONNECT received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect
- on: vernemq.mqtt_disconnect_sent_reason
- lookup: sum -1m unaligned absolute match-names of normal_disconnect
- units: packets
- every: 10s
- info: normal v5 DISCONNECT sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_disconnect_received_reason_not_normal
on: vernemq.mqtt_disconnect_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect
+ lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: not normal v5 DISCONNECT received in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of received not normal v5 DISCONNECT packets in the last minute
to: sysadmin
template: vernemq_mqtt_disconnect_sent_reason_not_normal
on: vernemq.mqtt_disconnect_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect
+ lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: not normal v5 DISCONNECT sent in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of sent not normal v5 DISCONNECT packets in the last minute
to: sysadmin
# SUBSCRIBE errors and unauthorized attempts
@@ -158,20 +131,20 @@ template: vernemq_mqtt_subscribe_error
on: vernemq.mqtt_subscribe_error
lookup: sum -1m unaligned absolute
units: failed ops
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: failed v3/v5 SUBSCRIBE operations in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of failed v3/v5 SUBSCRIBE operations in the last minute
to: sysadmin
template: vernemq_mqtt_subscribe_auth_error
on: vernemq.mqtt_subscribe_auth_error
lookup: sum -1m unaligned absolute
units: attempts
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute
to: sysadmin
# UNSUBSCRIBE errors
@@ -180,10 +153,10 @@ template: vernemq_mqtt_unsubscribe_error
on: vernemq.mqtt_unsubscribe_error
lookup: sum -1m unaligned absolute
units: failed ops
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: failed v3/v5 UNSUBSCRIBE operations in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute
to: sysadmin
# PUBLISH errors and unauthorized attempts
@@ -192,208 +165,136 @@ template: vernemq_mqtt_publish_errors
on: vernemq.mqtt_publish_errors
lookup: sum -1m unaligned absolute
units: failed ops
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: failed v3/v5 PUBLISH operations in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of failed v3/v5 PUBLISH operations in the last minute
to: sysadmin
template: vernemq_mqtt_publish_auth_errors
on: vernemq.mqtt_publish_auth_errors
lookup: sum -1m unaligned absolute
units: attempts
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unauthorized v3/v5 PUBLISH attempts in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of unauthorized v3/v5 PUBLISH attempts in the last minute
to: sysadmin
# Unsuccessful and unexpected PUBACK
-template: vernemq_mqtt_puback_received_reason_success
- on: vernemq.mqtt_puback_received_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBACK received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_puback_sent_reason_success
- on: vernemq.mqtt_puback_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBACK sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_puback_received_reason_unsuccessful
on: vernemq.mqtt_puback_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_puback_received_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unsuccessful v5 PUBACK received in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of received unsuccessful v5 PUBACK packets in the last minute
to: sysadmin
template: vernemq_mqtt_puback_sent_reason_unsuccessful
on: vernemq.mqtt_puback_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_puback_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unsuccessful v5 PUBACK sent in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of sent unsuccessful v5 PUBACK packets in the last minute
to: sysadmin
template: vernemq_mqtt_puback_unexpected
on: vernemq.mqtt_puback_invalid_error
lookup: sum -1m unaligned absolute
units: messages
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unexpected v3/v5 PUBACK received in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of received unexpected v3/v5 PUBACK packets in the last minute
to: sysadmin
# Unsuccessful and unexpected PUBREC
-template: vernemq_mqtt_pubrec_received_reason_success
- on: vernemq.mqtt_pubrec_received_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBREC received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_pubrec_sent_reason_success
- on: vernemq.mqtt_pubrec_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBREC sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_pubrec_received_reason_unsuccessful
on: vernemq.mqtt_pubrec_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubrec_received_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unsuccessful v5 PUBREC received in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of received unsuccessful v5 PUBREC packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
on: vernemq.mqtt_pubrec_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubrec_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unsuccessful v5 PUBREC sent in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of sent unsuccessful v5 PUBREC packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubrec_invalid_error
on: vernemq.mqtt_pubrec_invalid_error
lookup: sum -1m unaligned absolute
units: messages
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unexpected v3 PUBREC received in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of received unexpected v3 PUBREC packets in the last minute
to: sysadmin
# Unsuccessful PUBREL
-template: vernemq_mqtt_pubrel_received_reason_success
- on: vernemq.mqtt_pubrel_received_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBREL received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_pubrel_sent_reason_success
- on: vernemq.mqtt_pubrel_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBREL sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_pubrel_received_reason_unsuccessful
on: vernemq.mqtt_pubrel_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubrel_received_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unsuccessful v5 PUBREL received in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of received unsuccessful v5 PUBREL packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
on: vernemq.mqtt_pubrel_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubrel_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unsuccessful v5 PUBREL sent in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of sent unsuccessful v5 PUBREL packets in the last minute
to: sysadmin
# Unsuccessful and unexpected PUBCOMP
-template: vernemq_mqtt_pubcomp_received_reason_success
- on: vernemq.mqtt_pubcomp_received_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBCOMP received in the last minute
- to: sysadmin
-
-template: vernemq_mqtt_pubcomp_sent_reason_success
- on: vernemq.mqtt_pubcomp_sent_reason
- lookup: sum -1m unaligned absolute match-names of success
- units: packets
- every: 10s
- info: successful v5 PUBCOMP sent in the last minute
- to: sysadmin
-
template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
on: vernemq.mqtt_pubcomp_received_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubcomp_received_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unsuccessful v5 PUBCOMP received in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of received unsuccessful v5 PUBCOMP packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
on: vernemq.mqtt_pubcomp_sent_reason
- lookup: sum -1m unaligned absolute
- calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success
+ lookup: sum -1m unaligned absolute match-names of !success,*
units: packets
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unsuccessful v5 PUBCOMP sent in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of sent unsuccessful v5 PUBCOMP packets in the last minute
to: sysadmin
template: vernemq_mqtt_pubcomp_unexpected
on: vernemq.mqtt_pubcomp_invalid_error
lookup: sum -1m unaligned absolute
units: messages
- every: 10s
- warn: $this > 0
- delay: down 5m multiplier 1.5 max 2h
- info: unexpected v3/v5 PUBCOMP received in the last minute
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (5))
+ delay: up 5m down 5m multiplier 1.5 max 2h
+ info: number of received unexpected v3/v5 PUBCOMP packets in the last minute
to: sysadmin
diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf
index d8b2be19..3e1414c1 100644
--- a/health/health.d/vsphere.conf
+++ b/health/health.d/vsphere.conf
@@ -13,7 +13,7 @@ template: vsphere_vm_mem_usage
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: used RAM
+ info: virtual machine memory utilization
# -----------------------------------------------HOST Specific----------------------------------------------------------
# Memory
@@ -27,7 +27,7 @@ template: vsphere_host_mem_usage
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: used RAM
+ info: host memory utilization
# Network errors
@@ -38,10 +38,7 @@ families: *
lookup: sum -10m unaligned absolute match-names of rx
units: packets
every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- info: interface inbound dropped packets in the last 10 minutes
- to: sysadmin
+ info: number of inbound errors for the network interface in the last 10 minutes
template: vsphere_outbound_packets_errors
on: vsphere.net_errors_total
@@ -50,10 +47,7 @@ families: *
lookup: sum -10m unaligned absolute match-names of tx
units: packets
every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- info: interface outbound dropped packets in the last 10 minutes
- to: sysadmin
+ info: number of outbound errors for the network interface in the last 10 minutes
# Network errors ratio
@@ -62,13 +56,12 @@ template: vsphere_inbound_packets_errors_ratio
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
- calc: (($vsphere_inbound_packets_errors != nan AND $this > 0) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
+ calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
units: %
every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
- info: the ratio of inbound errors vs the total number of received packets of the network interface, during the last 10 minutes
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: ratio of inbound errors for the network interface over the last 10 minutes
to: sysadmin
template: vsphere_outbound_packets_errors_ratio
@@ -76,13 +69,12 @@ template: vsphere_outbound_packets_errors_ratio
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
- calc: (($vsphere_outbound_packets_errors != nan AND $this > 0) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
+ calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
units: %
every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
- info: the ratio of outbound errors vs the total number of sent packets of the network interface, during the last 10 minutes
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: ratio of outbound errors for the network interface over the last 10 minutes
to: sysadmin
# -----------------------------------------------Common-------------------------------------------------------------------
@@ -97,7 +89,7 @@ template: vsphere_cpu_usage
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: cpu utilization for the last 10 minutes
+ info: average CPU utilization
to: sysadmin
# Network drops
@@ -109,10 +101,7 @@ families: *
lookup: sum -10m unaligned absolute match-names of rx
units: packets
every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- info: interface inbound dropped packets in the last 10 minutes
- to: sysadmin
+ info: number of inbound dropped packets for the network interface in the last 10 minutes
template: vsphere_outbound_packets_dropped
on: vsphere.net_drops_total
@@ -121,10 +110,7 @@ families: *
lookup: sum -10m unaligned absolute match-names of tx
units: packets
every: 1m
- warn: $this >= 5
- delay: down 1h multiplier 1.5 max 2h
- info: interface outbound dropped packets in the last 10 minutes
- to: sysadmin
+ info: number of outbound dropped packets for the network interface in the last 10 minutes
# Network drops ratio
@@ -133,13 +119,12 @@ template: vsphere_inbound_packets_dropped_ratio
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of rx
- calc: (($vsphere_inbound_packets_dropped != nan AND $this > 0) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
+ calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
- info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: ratio of inbound dropped packets for the network interface over the last 10 minutes
to: sysadmin
template: vsphere_outbound_packets_dropped_ratio
@@ -147,11 +132,10 @@ template: vsphere_outbound_packets_dropped_ratio
hosts: *
families: *
lookup: sum -10m unaligned absolute match-names of tx
- calc: (($vsphere_outbound_packets_dropped != nan AND $this > 0) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
+ calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
- warn: $this >= 0.1
- crit: $this >= 2
- delay: down 1h multiplier 1.5 max 2h
- info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
+ warn: $this >= 2
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: ratio of outbound dropped packets for the network interface over the last 10 minutes
to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 44de38a4..0b01990c 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -31,7 +31,7 @@ families: *
calc: ($this == 0)?(1):($this)
units: requests
every: 10s
- info: the sum of all HTTP requests over the last minute
+ info: number of HTTP requests in the last minute
template: 1m_successful
on: web_log.response_statuses
@@ -43,7 +43,7 @@ families: *
warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
+ info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
to: webmaster
template: 1m_redirects
@@ -56,7 +56,7 @@ families: *
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP redirects (3xx except 304) over the last minute
+ info: ratio of redirection HTTP requests over the last minute (3xx except 304)
to: webmaster
template: 1m_bad_requests
@@ -69,7 +69,7 @@ families: *
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP bad requests (4xx except 401) over the last minute
+ info: ratio of client error HTTP requests over the last minute (4xx except 401)
to: webmaster
template: 1m_internal_errors
@@ -82,7 +82,7 @@ families: *
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP internal server errors (5xx), over the last minute
+ info: ratio of server error HTTP requests over the last minute (5xx)
to: webmaster
# unmatched lines
@@ -101,10 +101,10 @@ families: *
calc: ($this == 0)?(1):($this)
units: requests
every: 10s
- info: the sum of all HTTP requests over the last minute
+ info: number of HTTP requests over the last minute
template: 1m_unmatched
-on: web_log.response_codes
+ on: web_log.response_codes
families: *
lookup: sum -1m unaligned of unmatched
calc: $this * 100 / $1m_total_requests
@@ -112,7 +112,7 @@ families: *
every: 10s
warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
delay: up 1m down 5m multiplier 1.5 max 1h
- info: the ratio of unmatched lines, over the last minute
+ info: percentage of unparsed log lines over the last minute
to: webmaster
# -----------------------------------------------------------------------------
@@ -131,7 +131,7 @@ families: *
lookup: average -10m unaligned of avg
units: ms
every: 30s
- info: the average time to respond to HTTP requests, over the last 10 minutes
+ info: average HTTP response time over the last 10 minutes
template: web_slow
on: web_log.response_time
@@ -144,7 +144,7 @@ families: *
warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
delay: down 15m multiplier 1.5 max 1h
- info: the average time to respond to HTTP requests, over the last 1 minute
+ info: average HTTP response time over the last minute
options: no-clear-notification
to: webmaster
@@ -165,7 +165,7 @@ families: *
lookup: average -5m at -5m unaligned of successful_requests
units: requests/s
every: 30s
- info: average rate of successful HTTP requests over the last 5 minutes
+ info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
template: 5m_successful
on: web_log.response_statuses
@@ -173,7 +173,7 @@ families: *
lookup: average -5m unaligned of successful_requests
units: requests/s
every: 30s
- info: average successful HTTP requests over the last 5 minutes
+ info: average number of successful HTTP requests over the last 5 minutes
template: 5m_requests_ratio
on: web_log.response_codes
@@ -185,7 +185,7 @@ families: *
crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
delay: down 15m multiplier 1.5 max 1h
options: no-clear-notification
- info: the percentage of successful web requests over the last 5 minutes, \
+ info: ratio of successful HTTP requests over the last 5 minutes, \
compared with the previous 5 minutes \
(clear notification for this alarm will not be sent)
to: webmaster
@@ -224,7 +224,7 @@ families: *
calc: ($this == 0)?(1):($this)
units: requests
every: 10s
- info: the sum of all HTTP requests over the last minute
+ info: number of HTTP requests in the last minute
template: web_log_1m_unmatched
on: web_log.excluded_requests
@@ -235,7 +235,7 @@ families: *
every: 10s
warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
delay: up 1m down 5m multiplier 1.5 max 1h
- info: the ratio of unmatched lines, over the last minute
+ info: percentage of unparsed log lines over the last minute
to: webmaster
# -----------------------------------------------------------------------------
@@ -255,7 +255,7 @@ families: *
calc: ($this == 0)?(1):($this)
units: requests
every: 10s
- info: the sum of all HTTP requests over the last minute
+ info: number of HTTP requests in the last minute
template: web_log_1m_successful
on: web_log.type_requests
@@ -267,7 +267,7 @@ families: *
warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
+ info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
to: webmaster
template: web_log_1m_redirects
@@ -280,7 +280,7 @@ families: *
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP redirects (3xx except 304) over the last minute
+ info: ratio of redirection HTTP requests over the last minute (3xx except 304)
to: webmaster
template: web_log_1m_bad_requests
@@ -293,7 +293,7 @@ families: *
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP bad requests (4xx except 401) over the last minute
+ info: ratio of client error HTTP requests over the last minute (4xx except 401)
to: webmaster
template: web_log_1m_internal_errors
@@ -306,7 +306,7 @@ families: *
warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP internal server errors (5xx), over the last minute
+ info: ratio of server error HTTP requests over the last minute (5xx)
to: webmaster
# -----------------------------------------------------------------------------
@@ -325,7 +325,7 @@ families: *
lookup: average -10m unaligned of avg
units: ms
every: 30s
- info: the average time to respond to HTTP requests, over the last 10 minutes
+ info: average HTTP response time over the last 10 minutes
template: web_log_web_slow
on: web_log.request_processing_time
@@ -338,7 +338,7 @@ families: *
warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
delay: down 15m multiplier 1.5 max 1h
- info: the average time to respond to HTTP requests, over the last 1 minute
+ info: average HTTP response time over the last minute
options: no-clear-notification
to: webmaster
@@ -359,7 +359,7 @@ families: *
lookup: average -5m at -5m unaligned of success
units: requests/s
every: 30s
- info: average rate of successful HTTP requests over the last 5 minutes
+ info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
template: web_log_5m_successful
on: web_log.type_requests
@@ -367,7 +367,7 @@ families: *
lookup: average -5m unaligned of success
units: requests/s
every: 30s
- info: average successful HTTP requests over the last 5 minutes
+ info: average number of successful HTTP requests over the last 5 minutes
template: web_log_5m_requests_ratio
on: web_log.type_requests
@@ -379,7 +379,7 @@ families: *
crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
delay: down 15m multiplier 1.5 max 1h
options: no-clear-notification
- info: the percentage of successful web requests over the last 5 minutes, \
+ info: ratio of successful HTTP requests over the last 5 minutes, \
compared with the previous 5 minutes \
(clear notification for this alarm will not be sent)
to: webmaster
diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf
index 275e11dd..36ae02fa 100644
--- a/health/health.d/whoisquery.conf
+++ b/health/health.d/whoisquery.conf
@@ -20,5 +20,5 @@ template: whoisquery_days_until_expiration
every: 60s
warn: $this < $days_until_expiration_warning*24*60*60
crit: $this < $days_until_expiration_critical*24*60*60
- info: domain time until expiration
+ info: time until the domain name registration expires
to: webmaster
diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf
index 0441fc1f..f1f71a60 100644
--- a/health/health.d/wmi.conf
+++ b/health/health.d/wmi.conf
@@ -26,7 +26,7 @@ template: wmi_10min_cpu_usage
warn: $this > (($status >= $WARNING) ? (75) : (85))
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: cpu utilization for the last 10 minutes
+ info: average CPU utilization over the last 10 minutes
to: sysadmin
@@ -42,7 +42,7 @@ template: wmi_ram_in_use
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: used RAM
+ info: memory utilization
to: sysadmin
template: wmi_swap_in_use
@@ -55,13 +55,13 @@ template: wmi_swap_in_use
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: used Swap
+ info: swap memory utilization
to: sysadmin
## Network
-template: inbound_packets_discarded
+template: wmi_inbound_packets_discarded
on: wmi.net_discarded
os: linux
hosts: *
@@ -71,10 +71,10 @@ families: *
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
- info: interface inbound discarded packets in the last 10 minutes
+ info: number of inbound discarded packets for the network interface in the last 10 minutes
to: sysadmin
-template: outbound_packets_discarded
+template: wmi_outbound_packets_discarded
on: wmi.net_discarded
os: linux
hosts: *
@@ -84,10 +84,10 @@ families: *
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
- info: interface outbound discarded packets in the last 10 minutes
+ info: number of outbound discarded packets for the network interface in the last 10 minutes
to: sysadmin
-template: inbound_packets_errors
+template: wmi_inbound_packets_errors
on: wmi.net_errors
os: linux
hosts: *
@@ -97,10 +97,10 @@ families: *
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
- info: interface inbound errors in the last 10 minutes
+ info: number of inbound errors for the network interface in the last 10 minutes
to: sysadmin
-template: outbound_packets_errors
+template: wmi_outbound_packets_errors
on: wmi.net_errors
os: linux
hosts: *
@@ -110,7 +110,7 @@ families: *
every: 1m
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
- info: interface outbound errors in the last 10 minutes
+ info: number of outbound errors for the network interface in the last 10 minutes
to: sysadmin
@@ -126,5 +126,5 @@ template: wmi_disk_in_use
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: used disk space
+ info: disk space utilization
to: sysadmin
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
index dfca3770..f2e4a050 100644
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@@ -20,7 +20,7 @@ template: x509check_days_until_expiration
every: 60s
warn: $this < $days_until_expiration_warning*24*60*60
crit: $this < $days_until_expiration_critical*24*60*60
- info: certificate time until expiration
+ info: time until x509 certificate expires
to: webmaster
template: x509check_revocation_status
@@ -28,5 +28,5 @@ template: x509check_revocation_status
calc: $revoked
every: 60s
crit: $this != nan AND $this != 0
- info: certificate revocation status
+ info: x509 certificate revocation status (0: valid, 1: revoked)
to: webmaster
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
index af73824e..74f96dd3 100644
--- a/health/health.d/zfs.conf
+++ b/health/health.d/zfs.conf
@@ -6,5 +6,5 @@
every: 1m
warn: $this > 0
delay: down 1h multiplier 1.5 max 2h
- info: the number of times ZFS had to limit the ARC growth in the last 10 minutes
+ info: number of times ZFS had to limit the ARC growth in the last 10 minutes
to: sysadmin
diff --git a/health/health.h b/health/health.h
index 5281e16e..07ce1311 100644
--- a/health/health.h
+++ b/health/health.h
@@ -64,7 +64,7 @@ extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *
extern void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status);
extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all);
-extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after);
+extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart);
void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf);
void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf);
diff --git a/health/health_config.c b/health/health_config.c
index 1acf3693..e24acf77 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -384,7 +384,7 @@ static inline int health_parse_db_lookup(
}
// sane defaults
- *every = abs(*after);
+ *every = ABS(*after);
// now we may have optional parameters
while(*s) {
diff --git a/health/health_json.c b/health/health_json.c
index 7b5a1e3c..2a81d1c0 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -2,7 +2,7 @@
#include "health.h"
-static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
+void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
if(value && *value) {
buffer_sprintf(wb, "%s\"%s\":\"", prefix, label);
buffer_strcat_htmlescape(wb, value);
@@ -13,7 +13,7 @@ static inline void health_string2json(BUFFER *wb, const char *prefix, const char
buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
}
-inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
+void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
buffer_sprintf(wb,
"\n\t{\n"
"\t\t\"hostname\": \"%s\",\n"
@@ -93,18 +93,22 @@ inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST
buffer_strcat(wb, "\t}");
}
-void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
+void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
buffer_strcat(wb, "[");
unsigned int max = host->health_log.max;
unsigned int count = 0;
+ uint32_t hash_chart = 0;
+ if (chart) hash_chart = simple_hash(chart);
ALARM_ENTRY *ae;
- for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
- if(ae->unique_id > after) {
- if(likely(count)) buffer_strcat(wb, ",");
+ for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) {
+ if ((ae->unique_id > after) && (!chart || (ae->hash_chart == hash_chart && !strcmp(ae->chart, chart)))) {
+ if (likely(count))
+ buffer_strcat(wb, ",");
health_alarm_entry2json_nolock(wb, ae, host);
+ count++;
}
}
@@ -298,6 +302,9 @@ static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, v
if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
continue;
+ if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
+ continue;
+
if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
continue;
diff --git a/health/health_log.c b/health/health_log.c
index 8c0bc5c3..3205f592 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -213,8 +213,8 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
ALARM_ENTRY *ae = NULL;
- if(entries < 26) {
- error("HEALTH [%s]: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", host->hostname, line, filename, entries);
+ if(entries < 27) {
+ error("HEALTH [%s]: line %zu of file '%s' should have at least 27 entries, but it has %d. Ignoring it.", host->hostname, line, filename, entries);
errored++;
continue;
}
@@ -243,7 +243,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
if (!rc) {
for(rc = host->alarms; rc ; rc = rc->next) {
- RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc);
+ RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl_t *)rc);
if(rdcmp != rc) {
error("Cannot insert the alarm index ID using log %s", rc->name);
}
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 3bf8db5f..bf6c0281 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -209,6 +209,9 @@ if [[ ${1} = "unittest" ]]; then
cfgfile="${3}" # the location of the config file to use for unit testing
status="${4}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
old_status="${5}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+elif [[ ${1} = "dump_methods" ]]; then
+ dump_methods=1
+ status="WARNING"
else
roles="${1}" # the roles that should be notified for this event
args_host="${2}" # the host generated this event
@@ -372,6 +375,7 @@ EMAIL_PLAINTEXT_ONLY=
IRC_NICKNAME=
IRC_REALNAME=
IRC_NETWORK=
+IRC_PORT=6667
# hangouts configs
declare -A HANGOUTS_WEBHOOK_URI
@@ -549,6 +553,15 @@ filter_recipient_by_criticality() {
# check stackpulse
[ -z "${STACKPULSE_WEBHOOK}" ] && SEND_STACKPULSE="NO"
+# check msteam
+[ -z "${MSTEAM_WEBHOOK_URL}" ] && SEND_MSTEAM="NO"
+
+# check pd
+[ -z "${DEFAULT_RECIPIENT_PD}" ] && SEND_PD="NO"
+
+# check prowl
+[ -z "${DEFAULT_RECIPIENT_PROWL}" ] && SEND_PROWL="NO"
+
if [ "${SEND_PUSHOVER}" = "YES" ] ||
[ "${SEND_SLACK}" = "YES" ] ||
[ "${SEND_ROCKETCHAT}" = "YES" ] ||
@@ -639,6 +652,15 @@ if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then
fi
fi
+if [ ${dump_methods} ]; then
+ for name in "${!SEND_@}"; do
+ if [ "${!name}" = "YES" ]; then
+ echo "$name"
+ fi
+ done
+ exit
+fi
+
# -----------------------------------------------------------------------------
# find the recipients' addresses per method
@@ -864,14 +886,15 @@ send_email() {
echo >&2 "--- END sendmail command ---"
fi
- "${sendmail}" -t "${opts[@]}"
+ local cmd_output
+ cmd_output=$("${sendmail}" -t "${opts[@]}" 2>&1)
ret=$?
if [ ${ret} -eq 0 ]; then
info "sent email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}'"
return 0
else
- error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret}."
+ error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret} (${cmd_output})."
return 1
fi
fi
@@ -1722,9 +1745,9 @@ send_prowl() {
# irc sender
send_irc() {
- local NICKNAME="${1}" REALNAME="${2}" CHANNELS="${3}" NETWORK="${4}" SERVERNAME="${5}" MESSAGE="${6}" sent=0 channel color send_alarm reply_codes error
+ local NICKNAME="${1}" REALNAME="${2}" CHANNELS="${3}" NETWORK="${4}" PORT="${5}" SERVERNAME="${6}" MESSAGE="${7}" sent=0 channel color send_alarm reply_codes error
- if [ "${SEND_IRC}" = "YES" ] && [ -n "${NICKNAME}" ] && [ -n "${REALNAME}" ] && [ -n "${CHANNELS}" ] && [ -n "${NETWORK}" ] && [ -n "${SERVERNAME}" ]; then
+ if [ "${SEND_IRC}" = "YES" ] && [ -n "${NICKNAME}" ] && [ -n "${REALNAME}" ] && [ -n "${CHANNELS}" ] && [ -n "${NETWORK}" ] && [ -n "${SERVERNAME}" ] && [ -n "${PORT}" ]; then
case "${status}" in
WARNING) color="warning" ;;
CRITICAL) color="danger" ;;
@@ -1735,7 +1758,7 @@ send_irc() {
SNDMESSAGE="${MESSAGE//$'\n'/", "}"
for CHANNEL in ${CHANNELS}; do
error=0
- send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | nc "${NETWORK}" 6667)
+ send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | nc "${NETWORK}" "${PORT}")
reply_codes=$(echo "${send_alarm}" | cut -d ' ' -f 2 | grep -o '[0-9]*')
for code in ${reply_codes}; do
if [ "${code}" -ge 400 ] && [ "${code}" -le 599 ]; then
@@ -2465,7 +2488,7 @@ SENT_PROWL=$?
# -----------------------------------------------------------------------------
# send the irc message
-send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm}
+send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${IRC_PORT}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm}
Severity: ${severity}
Chart: ${chart}
Family: ${family}
diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md
index 827a9c0b..ebd7f4b8 100644
--- a/health/notifications/email/README.md
+++ b/health/notifications/email/README.md
@@ -43,7 +43,7 @@ You can always find the location of the alarm-notify.sh script in `netdata.conf`
If you want an alternative to `sendmail` in order to have a simple MTA configuration for sending emails and auth to an existing SMTP server, you can do the following:
- Install `msmtp`.
-- Modify the `sendmail` path in `health_alarm_notify.conf` to point to the location of `mstmp`:
+- Modify the `sendmail` path in `health_alarm_notify.conf` to point to the location of `msmtp`:
```
# The full path to the sendmail command.
# If empty, the system $PATH will be searched for it.
diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf
index be669e13..2dab1d48 100755
--- a/health/notifications/health_alarm_notify.conf
+++ b/health/notifications/health_alarm_notify.conf
@@ -676,6 +676,10 @@ DEFAULT_RECIPIENT_IRC=""
# e.g. "irc.freenode.net"
IRC_NETWORK=""
+# The port of the IRC server to connect to.
+# e.g. 6667 (the default one), 6697 (a TLS/SSL one)
+IRC_PORT=6667
+
# The irc nickname which is required to send the notification. It must not be
# an already registered name as the connection's MODE is defined as a 'guest'.
IRC_NICKNAME=""
diff --git a/health/notifications/stackpulse/README.md b/health/notifications/stackpulse/README.md
index 13d2f723..4c44954a 100644
--- a/health/notifications/stackpulse/README.md
+++ b/health/notifications/stackpulse/README.md
@@ -39,8 +39,9 @@ SEND_STACKPULSE="YES"
STACKPULSE_WEBHOOK="https://hooks.stackpulse.io/v1/webhooks/YOUR_UNIQUE_ID"
```
-4. Now [restart Netdata](/docs/getting-started.md#start-stop-and-restart-netdata). When your node creates an alarm, you
- can see the associated notification on your StackPulse Administration Portal
+4. Now restart Netdata using `sudo systemctl restart netdata`, or the [appropriate
+ method](/docs/configure/start-stop-restart.md) for your system. When your node creates an alarm, you can see the
+ associated notification on your StackPulse Administration Portal.
## React to alarms with playbooks