summaryrefslogtreecommitdiffstats
path: root/health/health.d
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2018-11-07 12:22:44 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2018-11-07 12:22:44 +0000
commit1e6c93250172946eeb38e94a92a1fd12c9d3011e (patch)
tree8ca5e16dfc7ad6b3bf2738ca0a48408a950f8f7e /health/health.d
parentUpdate watch file (diff)
downloadnetdata-1e6c93250172946eeb38e94a92a1fd12c9d3011e.tar.xz
netdata-1e6c93250172946eeb38e94a92a1fd12c9d3011e.zip
Merging upstream version 1.11.0+dfsg.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.d')
-rw-r--r--health/health.d/adaptec_raid.conf24
-rw-r--r--health/health.d/apache.conf14
-rw-r--r--health/health.d/apcupsd.conf40
-rw-r--r--health/health.d/backend.conf45
-rw-r--r--health/health.d/bcache.conf22
-rw-r--r--health/health.d/beanstalkd.conf36
-rw-r--r--health/health.d/bind_rndc.conf9
-rw-r--r--health/health.d/boinc.conf62
-rw-r--r--health/health.d/btrfs.conf57
-rw-r--r--health/health.d/ceph.conf13
-rw-r--r--health/health.d/couchdb.conf13
-rw-r--r--health/health.d/cpu.conf55
-rw-r--r--health/health.d/disks.conf167
-rw-r--r--health/health.d/dockerd.conf8
-rw-r--r--health/health.d/elasticsearch.conf9
-rw-r--r--health/health.d/entropy.conf16
-rw-r--r--health/health.d/fping.conf53
-rw-r--r--health/health.d/fronius.conf11
-rw-r--r--health/health.d/haproxy.conf27
-rw-r--r--health/health.d/httpcheck.conf99
-rw-r--r--health/health.d/ipc.conf28
-rw-r--r--health/health.d/ipfs.conf11
-rw-r--r--health/health.d/ipmi.conf20
-rw-r--r--health/health.d/isc_dhcpd.conf10
-rw-r--r--health/health.d/lighttpd.conf14
-rw-r--r--health/health.d/linux_power_supply.conf12
-rw-r--r--health/health.d/load.conf56
-rw-r--r--health/health.d/mdstat.conf27
-rw-r--r--health/health.d/megacli.conf48
-rw-r--r--health/health.d/memcached.conf52
-rw-r--r--health/health.d/memory.conf38
-rw-r--r--health/health.d/mongodb.conf13
-rw-r--r--health/health.d/mysql.conf100
-rw-r--r--health/health.d/named.conf14
-rw-r--r--health/health.d/net.conf155
-rw-r--r--health/health.d/netfilter.conf29
-rw-r--r--health/health.d/nginx.conf14
-rw-r--r--health/health.d/nginx_plus.conf14
-rw-r--r--health/health.d/portcheck.conf48
-rw-r--r--health/health.d/postgres.conf13
-rw-r--r--health/health.d/qos.conf18
-rw-r--r--health/health.d/ram.conf64
-rw-r--r--health/health.d/redis.conf34
-rw-r--r--health/health.d/retroshare.conf25
-rw-r--r--health/health.d/softnet.conf40
-rw-r--r--health/health.d/squid.conf14
-rw-r--r--health/health.d/stiebeleltron.conf11
-rw-r--r--health/health.d/swap.conf43
-rw-r--r--health/health.d/tcp_conn.conf19
-rw-r--r--health/health.d/tcp_listen.conf82
-rw-r--r--health/health.d/tcp_mem.conf20
-rw-r--r--health/health.d/tcp_orphans.conf21
-rw-r--r--health/health.d/tcp_resets.conf67
-rw-r--r--health/health.d/udp_errors.conf49
-rw-r--r--health/health.d/varnish.conf9
-rw-r--r--health/health.d/web_log.conf163
-rw-r--r--health/health.d/zfs.conf10
57 files changed, 2185 insertions, 0 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
new file mode 100644
index 000000000..a1301ce8a
--- /dev/null
+++ b/health/health.d/adaptec_raid.conf
@@ -0,0 +1,24 @@
+
+# logical device status check
+
+template: adapter_raid_ld_status
+ on: adapter_raid.ld_status
+ lookup: max -5s
+ units: bool
+ every: 10s
+ crit: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: at least 1 logical device is failed or degraded
+ to: sysadmin
+
+# physical device state check
+
+template: adapter_raid_pd_state
+ on: adapter_raid.pd_state
+ lookup: max -5s
+ units: bool
+ every: 10s
+ crit: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: at least 1 physical device is not in online state
+ to: sysadmin
diff --git a/health/health.d/apache.conf b/health/health.d/apache.conf
new file mode 100644
index 000000000..0c98b8778
--- /dev/null
+++ b/health/health.d/apache.conf
@@ -0,0 +1,14 @@
+
+# make sure apache is running
+
+template: apache_last_collected_secs
+ on: apache.requests
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
new file mode 100644
index 000000000..4f86037ba
--- /dev/null
+++ b/health/health.d/apcupsd.conf
@@ -0,0 +1,40 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: 10min_ups_load
+ on: apcupsd.load
+ os: *
+ hosts: *
+ lookup: average -10m unaligned of percentage
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 10m multiplier 1.5 max 1h
+ info: average UPS load for the last 10 minutes
+ to: sitemgr
+
+# Discussion in https://github.com/netdata/netdata/pull/3928:
+# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
+template: ups_charge
+ on: apcupsd.charge
+ os: *
+ hosts: *
+ lookup: average -60s unaligned of charge
+ units: %
+ every: 60s
+ warn: $this < 100
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 10m multiplier 1.5 max 1h
+ info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors
+ to: sitemgr
+
+template: apcupsd_last_collected_secs
+ on: apcupsd.load
+ calc: $now - $last_collected_t
+ every: 10s
+ units: seconds ago
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sitemgr
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
new file mode 100644
index 000000000..7af100d8f
--- /dev/null
+++ b/health/health.d/backend.conf
@@ -0,0 +1,45 @@
+
+# make sure we are sending data to backend
+
+ alarm: backend_last_buffering
+ on: netdata.backend_metrics
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful buffering of backend data
+ to: dba
+
+ alarm: backend_metrics_sent
+ on: netdata.backend_metrics
+ units: %
+ calc: abs($sent) * 100 / abs($buffered)
+ every: 10s
+ warn: $this != 100
+ delay: down 5m multiplier 1.5 max 1h
+ info: percentage of metrics sent to the backend server
+ to: dba
+
+ alarm: backend_metrics_lost
+ on: netdata.backend_metrics
+ units: metrics
+ calc: abs($lost)
+ every: 10s
+ crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of metrics lost due to repeating failures to contact the backend server
+ to: dba
+
+# this chart has been removed from netdata
+# alarm: backend_slow
+# on: netdata.backend_latency
+# units: %
+# calc: $latency * 100 / ($update_every * 1000)
+# every: 10s
+# warn: $this > 50
+# crit: $this > 100
+# delay: down 5m multiplier 1.5 max 1h
+# info: the percentage of time between iterations needed by the backend time to process the data sent by netdata
+# to: dba
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
new file mode 100644
index 000000000..f0da9ac5e
--- /dev/null
+++ b/health/health.d/bcache.conf
@@ -0,0 +1,22 @@
+
+template: bcache_cache_errors
+ on: disk.bcache_cache_read_races
+ lookup: sum -10m unaligned absolute
+ units: errors
+ every: 1m
+ warn: $this > 0
+ crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) )
+ delay: down 1h multiplier 1.5 max 2h
+ info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing)
+ to: sysadmin
+
+template: bcache_cache_dirty
+ on: disk.bcache_cache_alloc
+ calc: $dirty + $metadata + $undefined
+ units: %
+ every: 1m
+ warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
+ crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small)
+ to: sysadmin
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
new file mode 100644
index 000000000..30dc27328
--- /dev/null
+++ b/health/health.d/beanstalkd.conf
@@ -0,0 +1,36 @@
+# get the number of buried jobs in all queues
+
+template: server_buried_jobs
+ on: beanstalk.current_jobs
+ calc: $buried
+ units: jobs
+ every: 10s
+ warn: $this > 0
+ crit: $this > 10
+ delay: up 0 down 5m multiplier 1.2 max 1h
+ info: the number of buried jobs aggregated across all tubes
+ to: sysadmin
+
+# get the number of buried jobs per queue
+
+#template: tube_buried_jobs
+# on: beanstalk.jobs
+# calc: $buried
+# units: jobs
+# every: 10s
+# warn: $this > 0
+# crit: $this > 10
+# delay: up 0 down 5m multiplier 1.2 max 1h
+# info: the number of jobs buried per tube
+# to: sysadmin
+
+# get the current number of tubes
+
+#template: number_of_tubes
+# on: beanstalk.current_tubes
+# calc: $tubes
+# every: 10s
+# warn: $this < 5
+# delay: up 0 down 5m multiplier 1.2 max 1h
+# info: the current number of tubes on the server
+# to: sysadmin
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
new file mode 100644
index 000000000..4145e77cd
--- /dev/null
+++ b/health/health.d/bind_rndc.conf
@@ -0,0 +1,9 @@
+ template: bind_rndc_stats_file_size
+ on: bind_rndc.stats_size
+ units: megabytes
+ every: 60
+ calc: $stats_size
+ warn: $this > 512
+ crit: $this > 1024
+ info: Bind stats file is very large! Consider to create logrotate conf file for it!
+ to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
new file mode 100644
index 000000000..43c588db6
--- /dev/null
+++ b/health/health.d/boinc.conf
@@ -0,0 +1,62 @@
+# Alarms for various BOINC issues.
+
+# Warn on any compute errors encountered.
+template: boinc_compute_errors
+ on: boinc.states
+ os: *
+ hosts: *
+families: *
+ lookup: average -10m unaligned of comperror
+ units: tasks
+ every: 1m
+ warn: $this > 0
+ crit: $this > 1
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ info: the total number of compute errors over the past 10 minutes
+ to: sysadmin
+
+# Warn on lots of upload errors
+template: boinc_upload_errors
+ on: boinc.states
+ os: *
+ hosts: *
+families: *
+ lookup: average -10m unaligned of upload_failed
+ units: tasks
+ every: 1m
+ warn: $this > 0
+ crit: $this > 1
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ info: the average number of failed uploads over the past 10 minutes
+ to: sysadmin
+
+# Warn on the task queue being empty
+template: boinc_total_tasks
+ on: boinc.tasks
+ os: *
+ hosts: *
+families: *
+ lookup: average -10m unaligned of total
+ units: tasks
+ every: 1m
+ warn: $this < 1
+ crit: $this < 0.1
+ delay: up 5m down 10m multiplier 1.5 max 1h
+ info: the total number of locally available tasks
+ to: sysadmin
+
+# Warn on no active tasks with a non-empty queue
+template: boinc_active_tasks
+ on: boinc.tasks
+ os: *
+ hosts: *
+families: *
+ lookup: average -10m unaligned of active
+ calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
+ units: tasks
+ every: 1m
+ warn: $this < 1
+ crit: $this < 0.1
+ delay: up 5m down 10m multiplier 1.5 max 1h
+ info: the total number of active tasks
+ to: sysadmin
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
new file mode 100644
index 000000000..b27aa544f
--- /dev/null
+++ b/health/health.d/btrfs.conf
@@ -0,0 +1,57 @@
+
+template: btrfs_allocated
+ on: btrfs.disk
+ os: *
+ hosts: *
+families: *
+ calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (90) : (95))
+ crit: $this > (($status == $CRITICAL) ? (95) : (98))
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: the percentage of allocated BTRFS physical disk space
+ to: sysadmin
+
+template: btrfs_data
+ on: btrfs.data
+ os: *
+ hosts: *
+families: *
+ calc: $used * 100 / ($used + $free)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
+ crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: the percentage of used BTRFS data space
+ to: sysadmin
+
+template: btrfs_metadata
+ on: btrfs.metadata
+ os: *
+ hosts: *
+families: *
+ calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
+ crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: the percentage of used BTRFS metadata space
+ to: sysadmin
+
+template: btrfs_system
+ on: btrfs.system
+ os: *
+ hosts: *
+families: *
+ calc: $used * 100 / ($used + $free)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98
+ crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: the percentage of used BTRFS system space
+ to: sysadmin
+
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
new file mode 100644
index 000000000..de16f7b6f
--- /dev/null
+++ b/health/health.d/ceph.conf
@@ -0,0 +1,13 @@
+# low ceph disk available
+
+template: cluster_space_usage
+ on: ceph.general_usage
+ calc: $avail * 100 / ($avail + $used)
+ units: %
+ every: 10s
+ warn: $this < 10
+ crit: $this < 1
+ delay: down 5m multiplier 1.2 max 1h
+ info: ceph disk usage is almost full
+ to: sysadmin
+
diff --git a/health/health.d/couchdb.conf b/health/health.d/couchdb.conf
new file mode 100644
index 000000000..4a2895280
--- /dev/null
+++ b/health/health.d/couchdb.conf
@@ -0,0 +1,13 @@
+
+# make sure couchdb is running
+
+template: couchdb_last_collected_secs
+ on: couchdb.request_methods
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
new file mode 100644
index 000000000..fa8189856
--- /dev/null
+++ b/health/health.d/cpu.conf
@@ -0,0 +1,55 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: 10min_cpu_usage
+ on: system.cpu
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned of user,system,softirq,irq,guest
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal)
+ to: sysadmin
+
+template: 10min_cpu_iowait
+ on: system.cpu
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned of iowait
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (20) : (40))
+ crit: $this > (($status == $CRITICAL) ? (40) : (50))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average CPU wait I/O for the last 10 minutes
+ to: sysadmin
+
+template: 20min_steal_cpu
+ on: system.cpu
+ os: linux
+ hosts: *
+ lookup: average -20m unaligned of steal
+ units: %
+ every: 5m
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (20) : (30))
+ delay: down 1h multiplier 1.5 max 2h
+ info: average CPU steal time for the last 20 minutes
+ to: sysadmin
+
+## FreeBSD
+template: 10min_cpu_usage
+ on: system.cpu
+ os: freebsd
+ hosts: *
+ lookup: average -10m unaligned of user,system,interrupt
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cpu utilization for the last 10 minutes (excluding nice)
+ to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
new file mode 100644
index 000000000..26f85848a
--- /dev/null
+++ b/health/health.d/disks.conf
@@ -0,0 +1,167 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+
+# -----------------------------------------------------------------------------
+# low disk space
+
+# checking the latest collected values
+# raise an alarm if the disk is low on
+# available disk space
+
+template: disk_space_usage
+ on: disk.space
+ os: linux freebsd
+ hosts: *
+families: *
+ calc: $used * 100 / ($avail + $used)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: current disk space usage
+ to: sysadmin
+
+template: disk_inode_usage
+ on: disk.inodes
+ os: linux freebsd
+ hosts: *
+families: *
+ calc: $used * 100 / ($avail + $used)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: current disk inode usage
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk fill rate
+
+# calculate the rate the disk fills
+# use as base, the available space change
+# during the last hour
+
+# this is just a calculation - it has no alarm
+# we will use it in the next template to find
+# the hours remaining
+
+template: disk_fill_rate
+ on: disk.space
+ os: linux freebsd
+ hosts: *
+families: *
+ lookup: min -10m at -50m unaligned of avail
+ calc: ($this - $avail) / (($now - $after) / 3600)
+ every: 1m
+ units: GB/hour
+ info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
+
+
+# calculate the hours remaining
+# if the disk continues to fill
+# in this rate
+
+template: out_of_disk_space_time
+ on: disk.space
+ os: linux freebsd
+ hosts: *
+families: *
+ calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+ units: hours
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.2 max 1h
+ info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk inode fill rate
+
+# calculate the rate the disk inodes are allocated
+# use as base, the available inodes change
+# during the last hour
+
+# this is just a calculation - it has no alarm
+# we will use it in the next template to find
+# the hours remaining
+
+template: disk_inode_rate
+ on: disk.inodes
+ os: linux freebsd
+ hosts: *
+families: *
+ lookup: min -10m at -50m unaligned of avail
+ calc: ($this - $avail) / (($now - $after) / 3600)
+ every: 1m
+ units: inodes/hour
+ info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+
+# calculate the hours remaining
+# if the disk inodes are allocated
+# in this rate
+
+template: out_of_disk_inodes_time
+ on: disk.inodes
+ os: linux freebsd
+ hosts: *
+families: *
+ calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+ units: hours
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.2 max 1h
+ info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk congestion
+
+# raise an alarm if the disk is congested
+# by calculating the average disk utilization
+# for the last 10 minutes
+
+template: 10min_disk_utilization
+ on: disk.util
+ os: linux freebsd
+ hosts: *
+families: *
+ lookup: average -10m unaligned
+ units: %
+ every: 1m
+ green: 90
+ red: 98
+ warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
+ crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ delay: down 15m multiplier 1.2 max 1h
+ info: the percentage of time the disk was busy, during the last 10 minutes
+ to: sysadmin
+
+
+# raise an alarm if the disk backlog
+# is above 1000ms (1s) per second
+# for 10 minutes
+# (i.e. the disk cannot catch up)
+
+template: 10min_disk_backlog
+ on: disk.backlog
+ os: linux
+ hosts: *
+families: *
+ lookup: average -10m unaligned
+ units: ms
+ every: 1m
+ green: 2000
+ red: 5000
+ warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
+ crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ delay: down 15m multiplier 1.2 max 1h
+ info: average of the kernel estimated disk backlog, for the last 10 minutes
+ to: sysadmin
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
new file mode 100644
index 000000000..729906cdb
--- /dev/null
+++ b/health/health.d/dockerd.conf
@@ -0,0 +1,8 @@
+template: docker_unhealthy_containers
+ on: docker.unhealthy_containers
+ units: unhealthy containers
+ every: 10s
+ lookup: average -10s
+ crit: $this > 0
+ info: number of unhealthy containers
+ to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
new file mode 100644
index 000000000..dffd40965
--- /dev/null
+++ b/health/health.d/elasticsearch.conf
@@ -0,0 +1,9 @@
+ alarm: elasticsearch_last_collected
+ on: elasticsearch_local.cluster_health_status
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
new file mode 100644
index 000000000..66d44ec13
--- /dev/null
+++ b/health/health.d/entropy.conf
@@ -0,0 +1,16 @@
+
+# check if entropy is too low
+# the alarm is checked every 1 minute
+# and examines the last hour of data
+
+ alarm: lowest_entropy
+ on: system.entropy
+ os: linux
+ hosts: *
+ lookup: min -10m unaligned
+ units: entries
+ every: 5m
+ warn: $this < (($status >= $WARNING) ? (200) : (100))
+ delay: down 1h multiplier 1.5 max 2h
+ info: minimum entries in the random numbers pool in the last 10 minutes
+ to: silent
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
new file mode 100644
index 000000000..43658fef6
--- /dev/null
+++ b/health/health.d/fping.conf
@@ -0,0 +1,53 @@
+
+template: fping_last_collected_secs
+families: *
+ on: fping.latency
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+template: host_reachable
+families: *
+ on: fping.latency
+ calc: $average != nan
+ units: up/down
+ every: 10s
+ crit: $this == 0
+ info: states if the remote host is reachable
+ delay: down 30m multiplier 1.5 max 2h
+ to: sysadmin
+
+template: host_latency
+families: *
+ on: fping.latency
+ lookup: average -10s unaligned of average
+ units: ms
+ every: 10s
+ green: 500
+ red: 1000
+ warn: $this > $green OR $max > $red
+ crit: $this > $red
+ info: average round trip delay during the last 10 seconds
+ delay: down 30m multiplier 1.5 max 2h
+ to: sysadmin
+
+template: packet_loss
+families: *
+ on: fping.quality
+ lookup: average -10m unaligned of returned
+ calc: 100 - $this
+ green: 1
+ red: 10
+ units: %
+ every: 10s
+ warn: $this > $green
+ crit: $this > $red
+ info: packet loss percentage
+ delay: down 30m multiplier 1.5 max 2h
+ to: sysadmin
+
diff --git a/health/health.d/fronius.conf b/health/health.d/fronius.conf
new file mode 100644
index 000000000..cdf6c8fcb
--- /dev/null
+++ b/health/health.d/fronius.conf
@@ -0,0 +1,11 @@
+template: fronius_last_collected_secs
+families: *
+ on: fronius.power
+ calc: $now - $last_collected_t
+ every: 10s
+ units: seconds ago
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sitemgr
diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf
new file mode 100644
index 000000000..e49c70d48
--- /dev/null
+++ b/health/health.d/haproxy.conf
@@ -0,0 +1,27 @@
+template: haproxy_backend_server_status
+ on: haproxy_hs.down
+ units: failed servers
+ every: 10s
+ lookup: average -10s
+ crit: $this > 0
+ info: number of failed haproxy backend servers
+ to: sysadmin
+
+template: haproxy_backend_status
+ on: haproxy_hb.down
+ units: failed backend
+ every: 10s
+ lookup: average -10s
+ crit: $this > 0
+ info: number of failed haproxy backends
+ to: sysadmin
+
+template: haproxy_last_collected
+ on: haproxy_hb.down
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
new file mode 100644
index 000000000..0ddf35eab
--- /dev/null
+++ b/health/health.d/httpcheck.conf
@@ -0,0 +1,99 @@
+template: httpcheck_last_collected_secs
+families: *
+ on: httpcheck.status
+ calc: $now - $last_collected_t
+ every: 10s
+ units: seconds ago
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
+template: web_service_up
+families: *
+ on: httpcheck.status
+ lookup: average -1m unaligned percentage of success
+ calc: ($this < 75) ? (0) : ($this)
+ every: 5s
+ units: up/down
+ info: at least 75% verified responses during last 60 seconds, ideal for badges
+ to: silent
+
+template: web_service_bad_content
+families: *
+ on: httpcheck.status
+ lookup: average -5m unaligned percentage of bad_content
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ info: average of unexpected http response content during the last 5 minutes
+ options: no-clear-notification
+ to: webmaster
+
+template: web_service_bad_status
+families: *
+ on: httpcheck.status
+ lookup: average -5m unaligned percentage of bad_status
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ info: average of unexpected http status during the last 5 minutes
+ options: no-clear-notification
+ to: webmaster
+
+template: web_service_timeouts
+families: *
+ on: httpcheck.status
+ lookup: average -5m unaligned percentage of timeout
+ every: 10s
+ units: %
+ info: average of timeouts during the last 5 minutes
+
+template: no_web_service_connections
+families: *
+ on: httpcheck.status
+ lookup: average -5m unaligned percentage of no_connection
+ every: 10s
+ units: %
+ info: average of failed requests during the last 5 minutes
+
+# combined timeout & no connection alarm
+template: web_service_unreachable
+families: *
+ on: httpcheck.status
+ calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
+ units: %
+ every: 10s
+ warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
+ crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ info: average of failed requests either due to timeouts or no connection during the last 5 minutes
+ options: no-clear-notification
+ to: webmaster
+
+template: 1h_web_service_response_time
+families: *
+ on: httpcheck.responsetime
+ lookup: average -1h unaligned of time
+ every: 30s
+ units: ms
+ info: average response time over the last hour
+
+template: web_service_slow
+families: *
+ on: httpcheck.responsetime
+ lookup: average -3m unaligned of time
+ units: ms
+ every: 10s
+ warn: ($this > ($1h_web_service_response_time * 2) )
+ crit: ($this > ($1h_web_service_response_time * 3) )
+ info: average response time over the last 3 minutes, compared to the average over the last hour
+ delay: down 5m multiplier 1.5 max 1h
+ options: no-clear-notification
+ to: webmaster
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
new file mode 100644
index 000000000..989d6e912
--- /dev/null
+++ b/health/health.d/ipc.conf
@@ -0,0 +1,28 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: semaphores_used
+ on: system.ipc_semaphores
+ os: linux
+ hosts: *
+ calc: $semaphores * 100 / $ipc_semaphores_max
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (70) : (90))
+ delay: down 5m multiplier 1.5 max 1h
+ info: the percentage of IPC semaphores used
+ to: sysadmin
+
+ alarm: semaphore_arrays_used
+ on: system.ipc_semaphore_arrays
+ os: linux
+ hosts: *
+ calc: $arrays * 100 / $ipc_semaphores_arrays_max
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (70) : (90))
+ delay: down 5m multiplier 1.5 max 1h
+ info: the percentage of IPC semaphore arrays used
+ to: sysadmin
diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf
new file mode 100644
index 000000000..3f77572d6
--- /dev/null
+++ b/health/health.d/ipfs.conf
@@ -0,0 +1,11 @@
+
+template: ipfs_datastore_usage
+ on: ipfs.repo_size
+ calc: $size * 100 / $avail
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: ipfs Datastore close to running out of space
+ to: sysadmin
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
new file mode 100644
index 000000000..c25581964
--- /dev/null
+++ b/health/health.d/ipmi.conf
@@ -0,0 +1,20 @@
+ alarm: ipmi_sensors_states
+ on: ipmi.sensors_states
+ calc: $warning + $critical
+ units: sensors
+ every: 10s
+ warn: $this > 0
+ crit: $critical > 0
+ delay: up 5m down 15m multiplier 1.5 max 1h
+ info: the number IPMI sensors in non-nominal state
+ to: sysadmin
+
+ alarm: ipmi_events
+ on: ipmi.events
+ calc: $events
+ units: events
+ every: 10s
+ warn: $this > 0
+ delay: up 5m down 15m multiplier 1.5 max 1h
+ info: the number of events in the IPMI System Event Log (SEL)
+ to: sysadmin
diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf
new file mode 100644
index 000000000..8054656ff
--- /dev/null
+++ b/health/health.d/isc_dhcpd.conf
@@ -0,0 +1,10 @@
+ template: isc_dhcpd_leases_size
+ on: isc_dhcpd.leases_total
+ units: KB
+ every: 60
+ calc: $leases_size
+ warn: $this > 3072
+ crit: $this > 6144
+ delay: up 2m down 5m
+ info: dhcpd.leases file too big! Module can slow down your server.
+ to: sysadmin
diff --git a/health/health.d/lighttpd.conf b/health/health.d/lighttpd.conf
new file mode 100644
index 000000000..915907a4a
--- /dev/null
+++ b/health/health.d/lighttpd.conf
@@ -0,0 +1,14 @@
+
+# make sure lighttpd is running
+
+template: lighttpd_last_collected_secs
+ on: lighttpd.requests
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
new file mode 100644
index 000000000..27a172a14
--- /dev/null
+++ b/health/health.d/linux_power_supply.conf
@@ -0,0 +1,12 @@
+# Alert on low battery capacity.
+
+template: linux_power_supply_capacity
+ on: power_supply.capacity
+ calc: $capacity
+ units: %
+ every: 10s
+ warn: $this < 10
+ crit: $this < 5
+ delay: up 0 down 5m multiplier 1.2 max 1h
+ info: the percentage remaining capacity of the power supply
+ to: sysadmin
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
new file mode 100644
index 000000000..ee0c54b8e
--- /dev/null
+++ b/health/health.d/load.conf
@@ -0,0 +1,56 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# Calculate the base trigger point for the load average alarms.
+# This is the maximum number of CPU's in the system over the past 1
+# minute, with a special case for a single CPU of setting the trigger at 2.
+ alarm: load_trigger
+ on: system.load
+ os: linux
+ hosts: *
+ calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
+ units: cpus
+ every: 1m
+ info: trigger point for load average alarms
+
+# Send alarms if the load average is unusually high.
+# These intentionally _do not_ calculate the average over the sampled
+# time period because the values being checked already are averages.
+ alarm: load_average_15
+ on: system.load
+ os: linux
+ hosts: *
+ lookup: max -1m unaligned of load15
+ units: load
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (1.75 * $load_trigger) : (2 * $load_trigger))
+ crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+ delay: down 15m multiplier 1.5 max 1h
+ info: fifteen-minute load average
+ to: sysadmin
+
+ alarm: load_average_5
+ on: system.load
+ os: linux
+ hosts: *
+ lookup: max -1m unaligned of load5
+ units: load
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+ crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger))
+ delay: down 15m multiplier 1.5 max 1h
+ info: five-minute load average
+ to: sysadmin
+
+ alarm: load_average_1
+ on: system.load
+ os: linux
+ hosts: *
+ lookup: max -1m unaligned of load1
+ units: load
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (7 * $load_trigger) : (8 * $load_trigger))
+ crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger))
+ delay: down 15m multiplier 1.5 max 1h
+ info: one-minute load average
+ to: sysadmin
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
new file mode 100644
index 000000000..0f5f2837e
--- /dev/null
+++ b/health/health.d/mdstat.conf
@@ -0,0 +1,27 @@
+template: mdstat_last_collected
+ on: md.disks
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+template: mdstat_disks
+ on: md.disks
+ units: failed devices
+ every: 10s
+ calc: $total - $inuse
+ crit: $this > 0
+ info: Array is degraded!
+ to: sysadmin
+
+template: mdstat_mismatch_cnt
+ on: md.mismatch_cnt
+ units: unsynchronized blocks
+ calc: $count
+ every: 10s
+ crit: $this > 0
+ info: Mismatch count!
+ to: sysadmin
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
new file mode 100644
index 000000000..1881a7be1
--- /dev/null
+++ b/health/health.d/megacli.conf
@@ -0,0 +1,48 @@
+ alarm: adapter_state
+ on: megacli.adapter_degraded
+ units: is degraded
+ lookup: sum -10s
+ every: 10s
+ crit: $this > 0
+ info: adapter state
+ to: sysadmin
+
+ template: bbu_relative_charge
+ on: megacli.bbu_relative_charge
+ units: percent
+ lookup: average -10s
+ every: 10s
+ warn: $this <= (($status >= $WARNING) ? (85) : (80))
+ crit: $this <= (($status == $CRITICAL) ? (50) : (40))
+ info: BBU relative state of charge
+ to: sysadmin
+
+ template: bbu_cycle_count
+ on: megacli.bbu_cycle_count
+ units: cycle count
+ lookup: average -10s
+ every: 10s
+ warn: $this >= 100
+ crit: $this >= 500
+ info: BBU cycle count
+ to: sysadmin
+
+ alarm: pd_media_errors
+ on: megacli.pd_media_error
+ units: media errors
+ lookup: sum -10s
+ every: 10s
+ warn: $this > 0
+ delay: down 1m multiplier 2 max 10m
+ info: physical drive media errors
+ to: sysadmin
+
+ alarm: pd_predictive_failures
+ on: megacli.pd_predictive_failure
+ units: predictive failures
+ lookup: sum -10s
+ every: 10s
+ warn: $this > 0
+ delay: down 1m multiplier 2 max 10m
+ info: physical drive predictive failures
+ to: sysadmin
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
new file mode 100644
index 000000000..d248ef57a
--- /dev/null
+++ b/health/health.d/memcached.conf
@@ -0,0 +1,52 @@
+
+# make sure memcached is running
+
+template: memcached_last_collected_secs
+ on: memcached.cache
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
+
+
+# detect if memcached cache is full
+
+template: memcached_cache_memory_usage
+ on: memcached.cache
+ calc: $used * 100 / ($used + $available)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: current cache memory usage
+ to: dba
+
+
+# find the rate memcached cache is filling
+
+template: cache_fill_rate
+ on: memcached.cache
+ lookup: min -10m at -50m unaligned of available
+ calc: ($this - $available) / (($now - $after) / 3600)
+ units: KB/hour
+ every: 1m
+ info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
+
+
+# find the hours remaining until memcached cache is full
+
+template: out_of_cache_space_time
+ on: memcached.cache
+ calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf)
+ units: hours
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
+ to: dba
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
new file mode 100644
index 000000000..4a0e6e522
--- /dev/null
+++ b/health/health.d/memory.conf
@@ -0,0 +1,38 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: 1hour_ecc_memory_correctable
+ on: mem.ecc_ce
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned
+ units: errors
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: number of ECC correctable errors during the last hour
+ to: sysadmin
+
+ alarm: 1hour_ecc_memory_uncorrectable
+ on: mem.ecc_ue
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned
+ units: errors
+ every: 1m
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: number of ECC uncorrectable errors during the last hour
+ to: sysadmin
+
+ alarm: 1hour_memory_hw_corrupted
+ on: mem.hwcorrupt
+ os: linux
+ hosts: *
+ calc: $HardwareCorrupted
+ units: MB
+ every: 10s
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: amount of memory corrupted due to a hardware failure
+ to: sysadmin
diff --git a/health/health.d/mongodb.conf b/health/health.d/mongodb.conf
new file mode 100644
index 000000000..a80cb3112
--- /dev/null
+++ b/health/health.d/mongodb.conf
@@ -0,0 +1,13 @@
+
+# make sure mongodb is running
+
+template: mongodb_last_collected_secs
+ on: mongodb.read_operations
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
new file mode 100644
index 000000000..39c401915
--- /dev/null
+++ b/health/health.d/mysql.conf
@@ -0,0 +1,100 @@
+
+# make sure mysql is running
+
+template: mysql_last_collected_secs
+ on: mysql.queries
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# slow queries
+
+template: mysql_10s_slow_queries
+ on: mysql.queries
+ lookup: sum -10s of slow_queries
+ units: slow queries
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (10) : (20))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of mysql slow queries over the last 10 seconds
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# lock waits
+
+template: mysql_10s_table_locks_immediate
+ on: mysql.table_locks
+ lookup: sum -10s absolute of immediate
+ units: immediate locks
+ every: 10s
+ info: number of table immediate locks over the last 10 seconds
+ to: dba
+
+template: mysql_10s_table_locks_waited
+ on: mysql.table_locks
+ lookup: sum -10s absolute of waited
+ units: waited locks
+ every: 10s
+ info: number of table waited locks over the last 10 seconds
+ to: dba
+
+template: mysql_10s_waited_locks_ratio
+ on: mysql.table_locks
+ calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (10) : (25))
+ crit: $this > (($status == $CRITICAL) ? (25) : (50))
+ delay: down 30m multiplier 1.5 max 1h
+ info: the ratio of mysql waited table locks, for the last 10 seconds
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# connections
+
+template: mysql_connections
+ on: mysql.connections_active
+ calc: $active * 100 / $limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ info: the ratio of current active connections vs the maximum possible number of connections
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# replication
+
+template: mysql_replication
+ on: mysql.slave_status
+ calc: ($sql_running == -1 OR $io_running == -1)?0:1
+ units: ok/failed
+ every: 10s
+ crit: $this == 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: checks if mysql replication has stopped
+ to: dba
+
+template: mysql_replication_lag
+ on: mysql.slave_behind
+ calc: $seconds
+ units: seconds
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (10) : (30))
+ delay: down 15m multiplier 1.5 max 1h
+ info: the number of seconds mysql replication is behind this master
+ to: dba
+
diff --git a/health/health.d/named.conf b/health/health.d/named.conf
new file mode 100644
index 000000000..4fc65c8ee
--- /dev/null
+++ b/health/health.d/named.conf
@@ -0,0 +1,14 @@
+
+# make sure named is running
+
+template: named_last_collected_secs
+ on: named.global_queries
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: domainadmin
+
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
new file mode 100644
index 000000000..489016dd5
--- /dev/null
+++ b/health/health.d/net.conf
@@ -0,0 +1,155 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+# net traffic overflow
+
+ template: 1m_received_traffic_overflow
+ on: net.net
+ os: linux
+ hosts: *
+ families: *
+ lookup: average -1m unaligned absolute of received
+ calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
+ delay: down 1m multiplier 1.5 max 1h
+ info: interface received bandwidth usage over net device speed max
+ to: sysadmin
+
+ template: 1m_sent_traffic_overflow
+ on: net.net
+ os: linux
+ hosts: *
+ families: *
+ lookup: average -1m unaligned absolute of sent
+ calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
+ delay: down 1m multiplier 1.5 max 1h
+ info: interface sent bandwidth usage over net device speed max
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+# dropped packets
+
+# check if an interface is dropping packets
+# the alarm is checked every 1 minute
+# and examines the last 10 minutes of data
+
+template: inbound_packets_dropped
+ on: net.drops
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute of inbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound dropped packets in the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_dropped
+ on: net.drops
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute of outbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound dropped packets in the last 10 minutes
+ to: sysadmin
+
+template: inbound_packets_dropped_ratio
+ on: net.packets
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute of received
+ calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 0.1
+ crit: $this >= 2
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_dropped_ratio
+ on: net.packets
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute of sent
+ calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this >= 0.1
+ crit: $this >= 2
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# FIFO errors
+
+# check if an interface is having FIFO
+# buffer errors
+# the alarm is checked every 1 minute
+# and examines the last 10 minutes of data
+
+template: 10min_fifo_errors
+ on: net.fifo
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute
+ units: errors
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface fifo errors in the last 10 minutes
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the later is 10x or 20x the first
+# we assume the minimum packet storm should at least have
+# 10000 packets/s, average of the last 10 seconds
+
+template: 1m_received_packets_rate
+ on: net.packets
+ os: linux freebsd
+ hosts: *
+families: *
+ lookup: average -1m unaligned of received
+ units: packets
+ every: 10s
+ info: the average number of packets received during the last minute
+
+template: 10s_received_packets_storm
+ on: net.packets
+ os: linux freebsd
+ hosts: *
+families: *
+ lookup: average -10s unaligned of received
+ calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
+ every: 10s
+ units: %
+ warn: $this > (($status >= $WARNING)?(200):(5000))
+ crit: $this > (($status >= $WARNING)?(5000):(6000))
+options: no-clear-notification
+ info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+ to: sysadmin
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
new file mode 100644
index 000000000..1d07752cc
--- /dev/null
+++ b/health/health.d/netfilter.conf
@@ -0,0 +1,29 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: netfilter_last_collected_secs
+ on: netfilter.conntrack_sockets
+ os: linux
+ hosts: *
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+ alarm: netfilter_conntrack_full
+ on: netfilter.conntrack_sockets
+ os: linux
+ hosts: *
+ lookup: max -10s unaligned of connections
+ calc: $this * 100 / $netfilter_conntrack_max
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 5m multiplier 1.5 max 1h
+ info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
+ to: sysadmin
diff --git a/health/health.d/nginx.conf b/health/health.d/nginx.conf
new file mode 100644
index 000000000..a686c3d99
--- /dev/null
+++ b/health/health.d/nginx.conf
@@ -0,0 +1,14 @@
+
+# make sure nginx is running
+
+template: nginx_last_collected_secs
+ on: nginx.requests
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
diff --git a/health/health.d/nginx_plus.conf b/health/health.d/nginx_plus.conf
new file mode 100644
index 000000000..5a171a76d
--- /dev/null
+++ b/health/health.d/nginx_plus.conf
@@ -0,0 +1,14 @@
+
+# make sure nginx_plus is running
+
+template: nginx_plus_last_collected_secs
+ on: nginx_plus.requests_total
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
new file mode 100644
index 000000000..f42b63d30
--- /dev/null
+++ b/health/health.d/portcheck.conf
@@ -0,0 +1,48 @@
+template: portcheck_last_collected_secs
+families: *
+ on: portcheck.status
+ calc: $now - $last_collected_t
+ every: 10s
+ units: seconds ago
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
+template: service_reachable
+families: *
+ on: portcheck.status
+ lookup: average -1m unaligned percentage of success
+ calc: ($this < 75) ? (0) : ($this)
+ every: 5s
+ units: up/down
+ info: at least 75% successful connections during last 60 seconds, ideal for badges
+ to: silent
+
+template: connection_timeouts
+families: *
+ on: portcheck.status
+ lookup: average -5m unaligned percentage of timeout
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ info: average of timeouts during the last 5 minutes
+ options: no-clear-notification
+ to: sysadmin
+
+template: connection_fails
+families: *
+ on: portcheck.status
+ lookup: average -5m unaligned percentage of no_connection
+ every: 10s
+ units: %
+ warn: $this >= 10 AND $this < 40
+ crit: $this >= 40
+ delay: down 5m multiplier 1.5 max 1h
+ info: average of failed connections during the last 5 minutes
+ options: no-clear-notification
+ to: sysadmin
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
new file mode 100644
index 000000000..4e0583b85
--- /dev/null
+++ b/health/health.d/postgres.conf
@@ -0,0 +1,13 @@
+
+# make sure postgres is running
+
+template: postgres_last_collected_secs
+ on: postgres.db_stat_transactions
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf
new file mode 100644
index 000000000..7290d15ff
--- /dev/null
+++ b/health/health.d/qos.conf
@@ -0,0 +1,18 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# check if a QoS class is dropping packets
+# the alarm is checked every 10 seconds
+# and examines the last minute of data
+
+#template: 10min_qos_packet_drops
+# on: tc.qos_dropped
+# os: linux
+# hosts: *
+# lookup: sum -10m unaligned absolute
+# every: 30s
+# warn: $this > 0
+# delay: up 0 down 30m multiplier 1.5 max 1h
+# units: packets
+# info: dropped packets in the last 30 minutes
+# to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
new file mode 100644
index 000000000..4e437322c
--- /dev/null
+++ b/health/health.d/ram.conf
@@ -0,0 +1,64 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: used_ram_to_ignore
+ on: system.ram
+ os: linux freebsd
+ hosts: *
+ calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
+ every: 10s
+ info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
+
+ alarm: ram_in_use
+ on: system.ram
+ os: linux
+ hosts: *
+# calc: $used * 100 / ($used + $cached + $free)
+ calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: system RAM used
+ to: sysadmin
+
+ alarm: ram_available
+ on: mem.available
+ os: linux
+ hosts: *
+ calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? ( 5) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin
+
+## FreeBSD
+alarm: ram_in_use
+ on: system.ram
+ os: freebsd
+hosts: *
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+units: %
+every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+delay: down 15m multiplier 1.5 max 1h
+ info: system RAM usage
+ to: sysadmin
+
+ alarm: ram_available
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? ( 5) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
new file mode 100644
index 000000000..c08a884a6
--- /dev/null
+++ b/health/health.d/redis.conf
@@ -0,0 +1,34 @@
+
+# make sure redis is running
+
+template: redis_last_collected_secs
+ on: redis.operations
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
+
+template: redis_bgsave_broken
+families: *
+ on: redis.bgsave_health
+ every: 10s
+ crit: $rdb_last_bgsave_status != 0
+ units: ok/failed
+ info: states if redis bgsave is working
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+template: redis_bgsave_slow
+families: *
+ on: redis.bgsave_now
+ every: 10s
+ warn: $rdb_bgsave_in_progress > 600
+ crit: $rdb_bgsave_in_progress > 1200
+ units: seconds
+ info: the time redis needs to save its database
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf
new file mode 100644
index 000000000..2344b60ec
--- /dev/null
+++ b/health/health.d/retroshare.conf
@@ -0,0 +1,25 @@
+# make sure RetroShare is running
+
+template: retroshare_last_collected_secs
+ on: retroshare.peers
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# make sure the DHT is fine when active
+
+template: retroshare_dht_working
+ on: retroshare.dht
+ calc: $dht_size_all
+ units: peers
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (120) : (100))
+ crit: $this < (($status == $CRITICAL) ? (10) : (1))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: Checks if the DHT has enough peers to operate
+ to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
new file mode 100644
index 000000000..77c804bfd
--- /dev/null
+++ b/health/health.d/softnet.conf
@@ -0,0 +1,40 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# check for common /proc/net/softnet_stat errors
+
+ alarm: 10min_netdev_backlog_exceeded
+ on: system.softnet_stat
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned absolute of dropped
+ units: packets
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 2h
+ info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+ to: sysadmin
+
+ alarm: 10min_netdev_budget_ran_outs
+ on: system.softnet_stat
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned absolute of squeezed
+ units: events
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ delay: down 1h multiplier 1.5 max 2h
+ info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+ to: silent
+
+ alarm: 10min_netisr_backlog_exceeded
+ on: system.softnet_stat
+ os: freebsd
+ hosts: *
+ lookup: sum -10m unaligned absolute of qdrops
+ units: packets
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 2h
+ info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+ to: sysadmin
diff --git a/health/health.d/squid.conf b/health/health.d/squid.conf
new file mode 100644
index 000000000..06cc9678f
--- /dev/null
+++ b/health/health.d/squid.conf
@@ -0,0 +1,14 @@
+
+# make sure squid is running
+
+template: squid_last_collected_secs
+ on: squid.clients_requests
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: proxyadmin
+
diff --git a/health/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf
new file mode 100644
index 000000000..e0361eb20
--- /dev/null
+++ b/health/health.d/stiebeleltron.conf
@@ -0,0 +1,11 @@
+template: stiebeleltron_last_collected_secs
+families: *
+ on: stiebeleltron.heating.hc1
+ calc: $now - $last_collected_t
+ every: 10s
+ units: seconds ago
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sitemgr
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
new file mode 100644
index 000000000..f920b0807
--- /dev/null
+++ b/health/health.d/swap.conf
@@ -0,0 +1,43 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: 30min_ram_swapped_out
+ on: system.swapio
+ os: linux freebsd
+ hosts: *
+ lookup: sum -30m unaligned absolute of out
+ # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024
+ calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
+ units: % of RAM
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (10) : (20))
+ crit: $this > (($status == $CRITICAL) ? (20) : (30))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
+ to: sysadmin
+
+ alarm: ram_in_swap
+ on: system.swap
+ os: linux
+ hosts: *
+ calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
+ units: % of RAM
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (15) : (20))
+ crit: $this > (($status == $CRITICAL) ? (40) : (50))
+ delay: up 30s down 15m multiplier 1.5 max 1h
+ info: the swap memory used, as a percentage of the system RAM
+ to: sysadmin
+
+ alarm: used_swap
+ on: system.swap
+ os: linux freebsd
+ hosts: *
+ calc: $used * 100 / ( $used + $free )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: up 30s down 15m multiplier 1.5 max 1h
+ info: the percentage of swap memory used
+ to: sysadmin
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
new file mode 100644
index 000000000..7aa9a9800
--- /dev/null
+++ b/health/health.d/tcp_conn.conf
@@ -0,0 +1,19 @@
+
+#
+# ${tcp_max_connections} may be nan or -1 if the system
+# supports dynamic threshold for TCP connections.
+# In this case, the alarm will always be zero.
+#
+
+ alarm: tcp_connections
+ on: ipv4.tcpsock
+ os: linux
+ hosts: *
+ calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
+ crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the percentage of IPv4 TCP connections over the max allowed
+ to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
new file mode 100644
index 000000000..552930ab7
--- /dev/null
+++ b/health/health.d/tcp_listen.conf
@@ -0,0 +1,82 @@
+#
+# There are two queues involved when incoming TCP connections are handled
+# (both at the kernel):
+#
+# SYN queue
+# The SYN queue tracks TCP handshakes until connections are fully established.
+# It overflows when too many incoming TCP connection requests hang in the
+# half-open state and the server is not configured to fall back to SYN cookies.
+# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends
+# lots of SYN packets and never completes the handshakes).
+#
+# Accept queue
+# The accept queue holds fully established TCP connections waiting to be handled
+# by the listening application. It overflows when the server application fails
+# to accept new connections at the rate they are coming in.
+#
+#
+# -----------------------------------------------------------------------------
+# tcp accept queue (at the kernel)
+
+ alarm: 1m_tcp_accept_queue_overflows
+ on: ip.tcp_accept_queue
+ os: linux
+ hosts: *
+ lookup: sum -60s unaligned absolute of ListenOverflows
+ units: overflows
+ every: 10s
+ crit: $this > 0
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+ to: sysadmin
+
+# THIS IS TOO GENERIC
+# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
+ alarm: 1m_tcp_accept_queue_drops
+ on: ip.tcp_accept_queue
+ os: linux
+ hosts: *
+ lookup: sum -60s unaligned absolute of ListenDrops
+ units: drops
+ every: 10s
+# warn: $this > 0
+ crit: $this > (($status == $CRITICAL) ? (0) : (150))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# tcp SYN queue (at the kernel)
+
+# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or
+# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are
+# enabled or not. In both cases this probably indicates a SYN flood attack,
+# so i guess a notification should be sent.
+
+ alarm: 1m_tcp_syn_queue_drops
+ on: ip.tcp_syn_queue
+ os: linux
+ hosts: *
+ lookup: sum -60s unaligned absolute of TCPReqQFullDrop
+ units: drops
+ every: 10s
+ warn: $this > 0
+ crit: $this > (($status == $CRITICAL) ? (0) : (60))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute
+ to: sysadmin
+
+ alarm: 1m_tcp_syn_queue_cookies
+ on: ip.tcp_syn_queue
+ os: linux
+ hosts: *
+ lookup: sum -60s unaligned absolute of TCPReqQFullDoCookies
+ units: cookies
+ every: 10s
+ warn: $this > 0
+ crit: $this > (($status == $CRITICAL) ? (0) : (60))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute
+ to: sysadmin
+
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
new file mode 100644
index 000000000..6927d5765
--- /dev/null
+++ b/health/health.d/tcp_mem.conf
@@ -0,0 +1,20 @@
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# We give a warning when TCP is under memory pressure
+# and a critical when TCP is 90% of its upper memory limit
+#
+
+ alarm: tcp_memory
+ on: ipv4.sockstat_tcp_mem
+ os: linux
+ hosts: *
+ calc: ${mem} * 100 / ${tcp_mem_high}
+ units: %
+ every: 10s
+ warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} ))
+ crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the amount of TCP memory as a percentage of its max memory limit
+ to: sysadmin
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
new file mode 100644
index 000000000..280d6590f
--- /dev/null
+++ b/health/health.d/tcp_orphans.conf
@@ -0,0 +1,21 @@
+
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# The kernel may penalize orphans by 2x or even 4x
+# so we alarm warning at 25% and critical at 50%
+#
+
+ alarm: tcp_orphans
+ on: ipv4.sockstat_tcp_sockets
+ os: linux
+ hosts: *
+ calc: ${orphan} * 100 / ${tcp_max_orphans}
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
+ crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
+ to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
new file mode 100644
index 000000000..91dad3c6a
--- /dev/null
+++ b/health/health.d/tcp_resets.conf
@@ -0,0 +1,67 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+
+ alarm: ipv4_tcphandshake_last_collected_secs
+ on: ipv4.tcphandshake
+ os: linux freebsd
+ hosts: *
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+# tcp resets this host sends
+
+ alarm: 1m_ipv4_tcp_resets_sent
+ on: ipv4.tcphandshake
+ os: linux
+ hosts: *
+ lookup: average -1m at -10s unaligned absolute of OutRsts
+ units: tcp resets/s
+ every: 10s
+ info: average TCP RESETS this host is sending, over the last minute
+
+ alarm: 10s_ipv4_tcp_resets_sent
+ on: ipv4.tcphandshake
+ os: linux
+ hosts: *
+ lookup: average -10s unaligned absolute of OutRsts
+ units: tcp resets/s
+ every: 10s
+ warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20)))
+ delay: up 0 down 60m multiplier 1.2 max 2h
+ options: no-clear-notification
+ info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+# tcp resets this host receives
+
+ alarm: 1m_ipv4_tcp_resets_received
+ on: ipv4.tcphandshake
+ os: linux freebsd
+ hosts: *
+ lookup: average -1m at -10s unaligned absolute of AttemptFails
+ units: tcp resets/s
+ every: 10s
+ info: average TCP RESETS this host is sending, over the last minute
+
+ alarm: 10s_ipv4_tcp_resets_received
+ on: ipv4.tcphandshake
+ os: linux freebsd
+ hosts: *
+ lookup: average -10s unaligned absolute of AttemptFails
+ units: tcp resets/s
+ every: 10s
+ warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10)))
+ delay: up 0 down 60m multiplier 1.2 max 2h
+ options: no-clear-notification
+ info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent)
+ to: sysadmin
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
new file mode 100644
index 000000000..5140228f5
--- /dev/null
+++ b/health/health.d/udp_errors.conf
@@ -0,0 +1,49 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+
+ alarm: ipv4_udperrors_last_collected_secs
+ on: ipv4.udperrors
+ os: linux freebsd
+ hosts: *
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+# UDP receive buffer errors
+
+ alarm: 1m_ipv4_udp_receive_buffer_errors
+ on: ipv4.udperrors
+ os: linux freebsd
+ hosts: *
+ lookup: sum -1m unaligned absolute of RcvbufErrors
+ units: errors
+ every: 10s
+ warn: $this > 0
+ crit: $this > (($status == $CRITICAL) ? (0) : (100))
+ info: number of UDP receive buffer errors during the last minute
+ delay: up 0 down 60m multiplier 1.2 max 2h
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+# UDP send buffer errors
+
+ alarm: 1m_ipv4_udp_send_buffer_errors
+ on: ipv4.udperrors
+ os: linux
+ hosts: *
+ lookup: sum -1m unaligned absolute of SndbufErrors
+ units: errors
+ every: 10s
+ warn: $this > 0
+ crit: $this > (($status == $CRITICAL) ? (0) : (100))
+ info: number of UDP send buffer errors during the last minute
+ delay: up 0 down 60m multiplier 1.2 max 2h
+ to: sysadmin
diff --git a/health/health.d/varnish.conf b/health/health.d/varnish.conf
new file mode 100644
index 000000000..cca7446b4
--- /dev/null
+++ b/health/health.d/varnish.conf
@@ -0,0 +1,9 @@
+ alarm: varnish_last_collected
+ on: varnish.uptime
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
new file mode 100644
index 000000000..d8be88b47
--- /dev/null
+++ b/health/health.d/web_log.conf
@@ -0,0 +1,163 @@
+
+# make sure we can collect web log data
+
+template: last_collected_secs
+ on: web_log.response_codes
+families: *
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+
+# -----------------------------------------------------------------------------
+# high level response code alarms
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: 1m_requests
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: the sum of all HTTP requests over the last minute
+
+template: 1m_successful
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned of successful_requests
+ calc: $this * 100 / $1m_requests
+ units: %
+ every: 10s
+ warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute
+ to: webmaster
+
+template: 1m_redirects
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned of redirects
+ calc: $this * 100 / $1m_requests
+ units: %
+ every: 10s
+ warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP redirects (3xx except 304) over the last minute
+ to: webmaster
+
+template: 1m_bad_requests
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned of bad_requests
+ calc: $this * 100 / $1m_requests
+ units: %
+ every: 10s
+ warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP bad requests (4xx) over the last minute
+ to: webmaster
+
+template: 1m_internal_errors
+ on: web_log.response_statuses
+families: *
+ lookup: sum -1m unaligned of server_errors
+ calc: $this * 100 / $1m_requests
+ units: %
+ every: 10s
+ warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP internal server errors (5xx), over the last minute
+ to: webmaster
+
+
+# -----------------------------------------------------------------------------
+# web slow
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: 10m_response_time
+ on: web_log.response_time
+families: *
+ lookup: average -10m unaligned of avg
+ units: ms
+ every: 30s
+ info: the average time to respond to HTTP requests, over the last 10 minutes
+
+template: web_slow
+ on: web_log.response_time
+families: *
+ lookup: average -1m unaligned of avg
+ units: ms
+ every: 10s
+ green: 500
+ red: 1000
+ warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
+ crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 )
+ delay: down 15m multiplier 1.5 max 1h
+ info: the average time to respond to HTTP requests, over the last 1 minute
+ options: no-clear-notification
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# web too many or too few requests
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $5m_successful_old > 120
+#
+# i.e. when there were at least 120 requests during the 5 minutes starting
+# at -10m and ending at -5m
+
+template: 5m_successful_old
+ on: web_log.response_statuses
+families: *
+ lookup: average -5m at -5m unaligned of successful_requests
+ units: requests/s
+ every: 30s
+ info: average rate of successful HTTP requests over the last 5 minutes
+
+template: 5m_successful
+ on: web_log.response_statuses
+families: *
+ lookup: average -5m unaligned of successful_requests
+ units: requests/s
+ every: 30s
+ info: average successful HTTP requests over the last 5 minutes
+
+template: 5m_requests_ratio
+ on: web_log.response_codes
+families: *
+ calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100)
+ units: %
+ every: 30s
+ warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+ crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+ delay: down 15m multiplier 1.5 max 1h
+options: no-clear-notification
+ info: the percentage of successful web requests over the last 5 minutes, \
+ compared with the previous 5 minutes \
+ (clear notification for this alarm will not be sent)
+ to: webmaster
+
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
new file mode 100644
index 000000000..af73824e6
--- /dev/null
+++ b/health/health.d/zfs.conf
@@ -0,0 +1,10 @@
+
+ alarm: zfs_memory_throttle
+ on: zfs.memory_ops
+ lookup: sum -10m unaligned absolute of throttled
+ units: events
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 2h
+ info: the number of times ZFS had to limit the ARC growth in the last 10 minutes
+ to: sysadmin