summary | refs | log | tree | commit | diff | stats
path: root/conf.d/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- conf.d/health.d/tcp_listen.conf | 27
-rw-r--r-- health/health.d/apache.conf (renamed from conf.d/health.d/apache.conf) | 0
-rw-r--r-- health/health.d/backend.conf (renamed from conf.d/health.d/backend.conf) | 0
-rw-r--r-- health/health.d/beanstalkd.conf (renamed from conf.d/health.d/beanstalkd.conf) | 0
-rw-r--r-- health/health.d/bind_rndc.conf (renamed from conf.d/health.d/bind_rndc.conf) | 0
-rw-r--r-- health/health.d/btrfs.conf (renamed from conf.d/health.d/btrfs.conf) | 0
-rw-r--r-- health/health.d/ceph.conf (renamed from conf.d/health.d/ceph.conf) | 0
-rw-r--r-- health/health.d/couchdb.conf (renamed from conf.d/health.d/couchdb.conf) | 0
-rw-r--r-- health/health.d/cpu.conf (renamed from conf.d/health.d/cpu.conf) | 0
-rw-r--r-- health/health.d/disks.conf (renamed from conf.d/health.d/disks.conf) | 0
-rw-r--r-- health/health.d/elasticsearch.conf (renamed from conf.d/health.d/elasticsearch.conf) | 0
-rw-r--r-- health/health.d/entropy.conf (renamed from conf.d/health.d/entropy.conf) | 0
-rw-r--r-- health/health.d/fping.conf (renamed from conf.d/health.d/fping.conf) | 0
-rw-r--r-- health/health.d/fronius.conf (renamed from conf.d/health.d/fronius.conf) | 0
-rw-r--r-- health/health.d/haproxy.conf (renamed from conf.d/health.d/haproxy.conf) | 0
-rw-r--r-- health/health.d/httpcheck.conf (renamed from conf.d/health.d/httpcheck.conf) | 0
-rw-r--r-- health/health.d/ipc.conf (renamed from conf.d/health.d/ipc.conf) | 4
-rw-r--r-- health/health.d/ipfs.conf (renamed from conf.d/health.d/ipfs.conf) | 0
-rw-r--r-- health/health.d/ipmi.conf (renamed from conf.d/health.d/ipmi.conf) | 0
-rw-r--r-- health/health.d/isc_dhcpd.conf (renamed from conf.d/health.d/isc_dhcpd.conf) | 0
-rw-r--r-- health/health.d/lighttpd.conf (renamed from conf.d/health.d/lighttpd.conf) | 0
-rw-r--r-- health/health.d/mdstat.conf (renamed from conf.d/health.d/mdstat.conf) | 23
-rw-r--r-- health/health.d/memcached.conf (renamed from conf.d/health.d/memcached.conf) | 0
-rw-r--r-- health/health.d/memory.conf (renamed from conf.d/health.d/memory.conf) | 0
-rw-r--r-- health/health.d/mongodb.conf (renamed from conf.d/health.d/mongodb.conf) | 0
-rw-r--r-- health/health.d/mysql.conf (renamed from conf.d/health.d/mysql.conf) | 15
-rw-r--r-- health/health.d/named.conf (renamed from conf.d/health.d/named.conf) | 0
-rw-r--r-- health/health.d/net.conf (renamed from conf.d/health.d/net.conf) | 37
-rw-r--r-- health/health.d/netfilter.conf (renamed from conf.d/health.d/netfilter.conf) | 2
-rw-r--r-- health/health.d/nginx.conf (renamed from conf.d/health.d/nginx.conf) | 0
-rw-r--r-- health/health.d/nginx_plus.conf (renamed from conf.d/health.d/nginx_plus.conf) | 0
-rw-r--r-- health/health.d/portcheck.conf (renamed from conf.d/health.d/portcheck.conf) | 0
-rw-r--r-- health/health.d/postgres.conf (renamed from conf.d/health.d/postgres.conf) | 0
-rw-r--r-- health/health.d/qos.conf (renamed from conf.d/health.d/qos.conf) | 0
-rw-r--r-- health/health.d/ram.conf (renamed from conf.d/health.d/ram.conf) | 6
-rw-r--r-- health/health.d/redis.conf (renamed from conf.d/health.d/redis.conf) | 0
-rw-r--r-- health/health.d/retroshare.conf (renamed from conf.d/health.d/retroshare.conf) | 0
-rw-r--r-- health/health.d/softnet.conf (renamed from conf.d/health.d/softnet.conf) | 0
-rw-r--r-- health/health.d/squid.conf (renamed from conf.d/health.d/squid.conf) | 0
-rw-r--r-- health/health.d/stiebeleltron.conf (renamed from conf.d/health.d/stiebeleltron.conf) | 0
-rw-r--r-- health/health.d/swap.conf (renamed from conf.d/health.d/swap.conf) | 0
-rw-r--r-- health/health.d/tcp_conn.conf (renamed from conf.d/health.d/tcp_conn.conf) | 0
-rw-r--r-- health/health.d/tcp_mem.conf (renamed from conf.d/health.d/tcp_mem.conf) | 0
-rw-r--r-- health/health.d/tcp_orphans.conf (renamed from conf.d/health.d/tcp_orphans.conf) | 0
-rw-r--r-- health/health.d/tcp_resets.conf (renamed from conf.d/health.d/tcp_resets.conf) | 0
-rw-r--r-- health/health.d/udp_errors.conf (renamed from conf.d/health.d/udp_errors.conf) | 4
-rw-r--r-- health/health.d/varnish.conf (renamed from conf.d/health.d/varnish.conf) | 0
-rw-r--r-- health/health.d/web_log.conf (renamed from conf.d/health.d/web_log.conf) | 0
-rw-r--r-- health/health.d/zfs.conf (renamed from conf.d/health.d/zfs.conf) | 0
49 files changed, 74 insertions, 44 deletions
diff --git a/conf.d/health.d/tcp_listen.conf b/conf.d/health.d/tcp_listen.conf
deleted file mode 100644
index 957964ae4..000000000
--- a/conf.d/health.d/tcp_listen.conf
+++ /dev/null
@@ -1,27 +0,0 @@
-# -----------------------------------------------------------------------------
-# tcp listen sockets issues
-
- alarm: 1m_ipv4_tcp_listen_overflows
- on: ipv4.tcplistenissues
- os: linux freebsd
- hosts: *
- lookup: sum -60s unaligned absolute of ListenOverflows
- units: overflows
- every: 10s
- crit: $this > 0
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of TCP listen socket overflows during the last minute
- to: sysadmin
-
- alarm: 1m_ipv4_tcp_listen_drops
- on: ipv4.tcplistenissues
- os: linux
- hosts: *
- lookup: sum -60s unaligned absolute of ListenDrops
- units: drops
- every: 10s
- crit: $this > 0
- delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of TCP listen socket drops during the last minute
- to: sysadmin
-
diff --git a/conf.d/health.d/apache.conf b/health/health.d/apache.conf
index 0c98b8778..0c98b8778 100644
--- a/conf.d/health.d/apache.conf
+++ b/health/health.d/apache.conf
diff --git a/conf.d/health.d/backend.conf b/health/health.d/backend.conf
index 7af100d8f..7af100d8f 100644
--- a/conf.d/health.d/backend.conf
+++ b/health/health.d/backend.conf
diff --git a/conf.d/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 30dc27328..30dc27328 100644
--- a/conf.d/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
diff --git a/conf.d/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 4145e77cd..4145e77cd 100644
--- a/conf.d/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
diff --git a/conf.d/health.d/btrfs.conf b/health/health.d/btrfs.conf
index b27aa544f..b27aa544f 100644
--- a/conf.d/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
diff --git a/conf.d/health.d/ceph.conf b/health/health.d/ceph.conf
index de16f7b6f..de16f7b6f 100644
--- a/conf.d/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
diff --git a/conf.d/health.d/couchdb.conf b/health/health.d/couchdb.conf
index 4a2895280..4a2895280 100644
--- a/conf.d/health.d/couchdb.conf
+++ b/health/health.d/couchdb.conf
diff --git a/conf.d/health.d/cpu.conf b/health/health.d/cpu.conf
index fa8189856..fa8189856 100644
--- a/conf.d/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
diff --git a/conf.d/health.d/disks.conf b/health/health.d/disks.conf
index 26f85848a..26f85848a 100644
--- a/conf.d/health.d/disks.conf
+++ b/health/health.d/disks.conf
diff --git a/conf.d/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index dffd40965..dffd40965 100644
--- a/conf.d/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
diff --git a/conf.d/health.d/entropy.conf b/health/health.d/entropy.conf
index 66d44ec13..66d44ec13 100644
--- a/conf.d/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
diff --git a/conf.d/health.d/fping.conf b/health/health.d/fping.conf
index 43658fef6..43658fef6 100644
--- a/conf.d/health.d/fping.conf
+++ b/health/health.d/fping.conf
diff --git a/conf.d/health.d/fronius.conf b/health/health.d/fronius.conf
index cdf6c8fcb..cdf6c8fcb 100644
--- a/conf.d/health.d/fronius.conf
+++ b/health/health.d/fronius.conf
diff --git a/conf.d/health.d/haproxy.conf b/health/health.d/haproxy.conf
index e49c70d48..e49c70d48 100644
--- a/conf.d/health.d/haproxy.conf
+++ b/health/health.d/haproxy.conf
diff --git a/conf.d/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index 0ddf35eab..0ddf35eab 100644
--- a/conf.d/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
diff --git a/conf.d/health.d/ipc.conf b/health/health.d/ipc.conf
index 03cf264d8..989d6e912 100644
--- a/conf.d/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -5,7 +5,7 @@
on: system.ipc_semaphores
os: linux
hosts: *
- calc: $semaphores * 100 / $ipc.semaphores.max
+ calc: $semaphores * 100 / $ipc_semaphores_max
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
@@ -18,7 +18,7 @@
on: system.ipc_semaphore_arrays
os: linux
hosts: *
- calc: $arrays * 100 / $ipc.semaphores.arrays.max
+ calc: $arrays * 100 / $ipc_semaphores_arrays_max
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
diff --git a/conf.d/health.d/ipfs.conf b/health/health.d/ipfs.conf
index 3f77572d6..3f77572d6 100644
--- a/conf.d/health.d/ipfs.conf
+++ b/health/health.d/ipfs.conf
diff --git a/conf.d/health.d/ipmi.conf b/health/health.d/ipmi.conf
index c25581964..c25581964 100644
--- a/conf.d/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
diff --git a/conf.d/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf
index 8054656ff..8054656ff 100644
--- a/conf.d/health.d/isc_dhcpd.conf
+++ b/health/health.d/isc_dhcpd.conf
diff --git a/conf.d/health.d/lighttpd.conf b/health/health.d/lighttpd.conf
index 915907a4a..915907a4a 100644
--- a/conf.d/health.d/lighttpd.conf
+++ b/health/health.d/lighttpd.conf
diff --git a/conf.d/health.d/mdstat.conf b/health/health.d/mdstat.conf
index c9e7d20db..0f5f2837e 100644
--- a/conf.d/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -1,3 +1,13 @@
+template: mdstat_last_collected
+ on: md.disks
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
template: mdstat_disks
on: md.disks
units: failed devices
@@ -7,12 +17,11 @@ template: mdstat_disks
info: Array is degraded!
to: sysadmin
-template: mdstat_last_collected
- on: md.disks
- calc: $now - $last_collected_t
- units: seconds ago
+template: mdstat_mismatch_cnt
+ on: md.mismatch_cnt
+ units: unsynchronized blocks
+ calc: $count
every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
+ crit: $this > 0
+ info: Mismatch count!
to: sysadmin
diff --git a/conf.d/health.d/memcached.conf b/health/health.d/memcached.conf
index d248ef57a..d248ef57a 100644
--- a/conf.d/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
diff --git a/conf.d/health.d/memory.conf b/health/health.d/memory.conf
index 4a0e6e522..4a0e6e522 100644
--- a/conf.d/health.d/memory.conf
+++ b/health/health.d/memory.conf
diff --git a/conf.d/health.d/mongodb.conf b/health/health.d/mongodb.conf
index a80cb3112..a80cb3112 100644
--- a/conf.d/health.d/mongodb.conf
+++ b/health/health.d/mongodb.conf
diff --git a/conf.d/health.d/mysql.conf b/health/health.d/mysql.conf
index 1eeb993f0..39c401915 100644
--- a/conf.d/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -60,6 +60,21 @@ template: mysql_10s_waited_locks_ratio
# -----------------------------------------------------------------------------
+# connections
+
+template: mysql_connections
+ on: mysql.connections_active
+ calc: $active * 100 / $limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ info: the ratio of current active connections vs the maximum possible number of connections
+ to: dba
+
+
+# -----------------------------------------------------------------------------
# replication
template: mysql_replication
diff --git a/conf.d/health.d/named.conf b/health/health.d/named.conf
index 4fc65c8ee..4fc65c8ee 100644
--- a/conf.d/health.d/named.conf
+++ b/health/health.d/named.conf
diff --git a/conf.d/health.d/net.conf b/health/health.d/net.conf
index 22a88927d..489016dd5 100644
--- a/conf.d/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -2,6 +2,39 @@
# you can disable an alarm notification by setting the 'to' line to: silent
# -----------------------------------------------------------------------------
+# net traffic overflow
+
+ template: 1m_received_traffic_overflow
+ on: net.net
+ os: linux
+ hosts: *
+ families: *
+ lookup: average -1m unaligned absolute of received
+ calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
+ delay: down 1m multiplier 1.5 max 1h
+ info: interface received bandwidth usage over net device speed max
+ to: sysadmin
+
+ template: 1m_sent_traffic_overflow
+ on: net.net
+ os: linux
+ hosts: *
+ families: *
+ lookup: average -1m unaligned absolute of sent
+ calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
+ delay: down 1m multiplier 1.5 max 1h
+ info: interface sent bandwidth usage over net device speed max
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
# dropped packets
# check if an interface is dropping packets
@@ -101,7 +134,7 @@ template: 1m_received_packets_rate
os: linux freebsd
hosts: *
families: *
- lookup: average -1m of received
+ lookup: average -1m unaligned of received
units: packets
every: 10s
info: the average number of packets received during the last minute
@@ -111,7 +144,7 @@ template: 10s_received_packets_storm
os: linux freebsd
hosts: *
families: *
- lookup: average -10s of received
+ lookup: average -10s unaligned of received
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
every: 10s
units: %
diff --git a/conf.d/health.d/netfilter.conf b/health/health.d/netfilter.conf
index fa1732b33..1d07752cc 100644
--- a/conf.d/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@@ -19,7 +19,7 @@
os: linux
hosts: *
lookup: max -10s unaligned of connections
- calc: $this * 100 / $netfilter.conntrack.max
+ calc: $this * 100 / $netfilter_conntrack_max
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
diff --git a/conf.d/health.d/nginx.conf b/health/health.d/nginx.conf
index a686c3d99..a686c3d99 100644
--- a/conf.d/health.d/nginx.conf
+++ b/health/health.d/nginx.conf
diff --git a/conf.d/health.d/nginx_plus.conf b/health/health.d/nginx_plus.conf
index 5a171a76d..5a171a76d 100644
--- a/conf.d/health.d/nginx_plus.conf
+++ b/health/health.d/nginx_plus.conf
diff --git a/conf.d/health.d/portcheck.conf b/health/health.d/portcheck.conf
index f42b63d30..f42b63d30 100644
--- a/conf.d/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
diff --git a/conf.d/health.d/postgres.conf b/health/health.d/postgres.conf
index 4e0583b85..4e0583b85 100644
--- a/conf.d/health.d/postgres.conf
+++ b/health/health.d/postgres.conf
diff --git a/conf.d/health.d/qos.conf b/health/health.d/qos.conf
index 7290d15ff..7290d15ff 100644
--- a/conf.d/health.d/qos.conf
+++ b/health/health.d/qos.conf
diff --git a/conf.d/health.d/ram.conf b/health/health.d/ram.conf
index b6dc5f945..4e437322c 100644
--- a/conf.d/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -3,7 +3,7 @@
alarm: used_ram_to_ignore
on: system.ram
- os: linux
+ os: linux freebsd
hosts: *
calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
every: 10s
@@ -41,7 +41,7 @@ alarm: ram_in_use
on: system.ram
os: freebsd
hosts: *
- calc: (($active + $wired) - $used_ram_to_ignore) * 100 / (($active + $wired) - $used_ram_to_ignore + $cached + $free)
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
@@ -54,7 +54,7 @@ delay: down 15m multiplier 1.5 max 1h
on: system.ram
os: freebsd
hosts: *
- calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $buffers)
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? ( 5) : (10))
diff --git a/conf.d/health.d/redis.conf b/health/health.d/redis.conf
index c08a884a6..c08a884a6 100644
--- a/conf.d/health.d/redis.conf
+++ b/health/health.d/redis.conf
diff --git a/conf.d/health.d/retroshare.conf b/health/health.d/retroshare.conf
index 2344b60ec..2344b60ec 100644
--- a/conf.d/health.d/retroshare.conf
+++ b/health/health.d/retroshare.conf
diff --git a/conf.d/health.d/softnet.conf b/health/health.d/softnet.conf
index 77c804bfd..77c804bfd 100644
--- a/conf.d/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
diff --git a/conf.d/health.d/squid.conf b/health/health.d/squid.conf
index 06cc9678f..06cc9678f 100644
--- a/conf.d/health.d/squid.conf
+++ b/health/health.d/squid.conf
diff --git a/conf.d/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf
index e0361eb20..e0361eb20 100644
--- a/conf.d/health.d/stiebeleltron.conf
+++ b/health/health.d/stiebeleltron.conf
diff --git a/conf.d/health.d/swap.conf b/health/health.d/swap.conf
index f920b0807..f920b0807 100644
--- a/conf.d/health.d/swap.conf
+++ b/health/health.d/swap.conf
diff --git a/conf.d/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
index 7aa9a9800..7aa9a9800 100644
--- a/conf.d/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
diff --git a/conf.d/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 6927d5765..6927d5765 100644
--- a/conf.d/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
diff --git a/conf.d/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index 280d6590f..280d6590f 100644
--- a/conf.d/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
diff --git a/conf.d/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 91dad3c6a..91dad3c6a 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
diff --git a/conf.d/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 382b39658..5140228f5 100644
--- a/conf.d/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -27,7 +27,7 @@
units: errors
every: 10s
warn: $this > 0
- crit: $this > 100
+ crit: $this > (($status == $CRITICAL) ? (0) : (100))
info: number of UDP receive buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin
@@ -43,7 +43,7 @@
units: errors
every: 10s
warn: $this > 0
- crit: $this > 100
+ crit: $this > (($status == $CRITICAL) ? (0) : (100))
info: number of UDP send buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin
diff --git a/conf.d/health.d/varnish.conf b/health/health.d/varnish.conf
index cca7446b4..cca7446b4 100644
--- a/conf.d/health.d/varnish.conf
+++ b/health/health.d/varnish.conf
diff --git a/conf.d/health.d/web_log.conf b/health/health.d/web_log.conf
index d8be88b47..d8be88b47 100644
--- a/conf.d/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
diff --git a/conf.d/health.d/zfs.conf b/health/health.d/zfs.conf
index af73824e6..af73824e6 100644
--- a/conf.d/health.d/zfs.conf
+++ b/health/health.d/zfs.conf