summaryrefslogtreecommitdiffstats
path: root/conf.d/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'conf.d/health.d')
-rw-r--r--conf.d/health.d/apache.conf4
-rw-r--r--conf.d/health.d/backend.conf45
-rw-r--r--conf.d/health.d/bind_rndc.conf9
-rw-r--r--conf.d/health.d/cpu.conf10
-rw-r--r--conf.d/health.d/disks.conf26
-rw-r--r--conf.d/health.d/elasticsearch.conf9
-rw-r--r--conf.d/health.d/entropy.conf8
-rw-r--r--conf.d/health.d/haproxy.conf27
-rw-r--r--conf.d/health.d/ipc.conf22
-rw-r--r--conf.d/health.d/ipfs.conf11
-rw-r--r--conf.d/health.d/isc_dhcpd.conf10
-rw-r--r--conf.d/health.d/mdstat.conf18
-rw-r--r--conf.d/health.d/memcached.conf6
-rw-r--r--conf.d/health.d/memory.conf30
-rw-r--r--conf.d/health.d/mysql.conf76
-rw-r--r--conf.d/health.d/named.conf4
-rw-r--r--conf.d/health.d/net.conf97
-rw-r--r--conf.d/health.d/netfilter.conf23
-rw-r--r--conf.d/health.d/nginx.conf4
-rw-r--r--conf.d/health.d/postgres.conf13
-rw-r--r--conf.d/health.d/ram.conf4
-rw-r--r--conf.d/health.d/redis.conf4
-rw-r--r--conf.d/health.d/retroshare.conf4
-rw-r--r--conf.d/health.d/softnet.conf18
-rw-r--r--conf.d/health.d/squid.conf4
-rw-r--r--conf.d/health.d/swap.conf17
-rw-r--r--conf.d/health.d/tcp_resets.conf32
-rw-r--r--conf.d/health.d/udp_errors.conf40
-rw-r--r--conf.d/health.d/varnish.conf9
29 files changed, 517 insertions, 67 deletions
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf
index 0aaf0e003..0c98b8778 100644
--- a/conf.d/health.d/apache.conf
+++ b/conf.d/health.d/apache.conf
@@ -6,8 +6,8 @@ template: apache_last_collected_secs
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: webmaster
diff --git a/conf.d/health.d/backend.conf b/conf.d/health.d/backend.conf
new file mode 100644
index 000000000..9c193e7b9
--- /dev/null
+++ b/conf.d/health.d/backend.conf
@@ -0,0 +1,45 @@
+
+# make sure we are sending data to backend
+
+ alarm: backend_last_buffering
+ on: netdata.backend_metrics
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful buffering of backend data
+ to: dba
+
+ alarm: backend_metrics_sent
+ on: netdata.backend_metrics
+ units: %
+ calc: abs($sent) * 100 / abs($buffered)
+ every: 10s
+ warn: $this != 100
+ delay: down 5m multiplier 1.5 max 1h
+ info: percentage of metrics sent to the backend server
+ to: dba
+
+ alarm: backend_metrics_lost
+ on: netdata.backend_metrics
+ units: metrics
+ calc: abs($lost)
+ every: 10s
+ crit: $this != 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of metrics lost due to repeating failures to contact the backend server
+ to: dba
+
+# this chart has been removed from netdata
+# alarm: backend_slow
+# on: netdata.backend_latency
+# units: %
+# calc: $latency * 100 / ($update_every * 1000)
+# every: 10s
+# warn: $this > 50
+# crit: $this > 100
+# delay: down 5m multiplier 1.5 max 1h
+# info: the percentage of time between iterations needed by the backend time to process the data sent by netdata
+# to: dba
diff --git a/conf.d/health.d/bind_rndc.conf b/conf.d/health.d/bind_rndc.conf
new file mode 100644
index 000000000..028bc9d08
--- /dev/null
+++ b/conf.d/health.d/bind_rndc.conf
@@ -0,0 +1,9 @@
+ alarm: bind_rndc_stats_file_size
+ on: bind_rndc.stats_size
+ units: megabytes
+ every: 60
+ calc: $stats_size
+ warn: $this > 512
+ crit: $this > 1024
+ info: Bind stats file is very large! Consider to create logrotate conf file for it!
+ to: sysadmin
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
index 4d79fc799..60f494d70 100644
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -4,8 +4,8 @@ template: 10min_cpu_usage
lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice
units: %
every: 1m
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average cpu utilization for the last 10 minutes
to: sysadmin
@@ -15,8 +15,8 @@ template: 10min_cpu_iowait
lookup: average -10m unaligned of iowait
units: %
every: 1m
- warn: $this > (($status >= $WARNING) ? (5) : (10))
- crit: $this > (($status == $CRITICAL) ? (20) : (30))
+ warn: $this > (($status >= $WARNING) ? (20) : (40))
+ crit: $this > (($status == $CRITICAL) ? (40) : (50))
delay: down 15m multiplier 1.5 max 1h
info: average CPU wait I/O for the last 10 minutes
to: sysadmin
@@ -28,6 +28,6 @@ template: 20min_steal_cpu
every: 5m
warn: $this > (($status >= $WARNING) ? (5) : (10))
crit: $this > (($status == $CRITICAL) ? (20) : (30))
- delay: down 15m multiplier 1.5 max 1h
+ delay: down 1h multiplier 1.5 max 2h
info: average CPU steal time for the last 20 minutes
to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
index cc7a47660..0549bac26 100644
--- a/conf.d/health.d/disks.conf
+++ b/conf.d/health.d/disks.conf
@@ -4,11 +4,12 @@
# for mount points
template: disk_space_last_collected_secs
on: disk.space
+families: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection of the mount point
to: sysadmin
@@ -16,11 +17,12 @@ template: disk_space_last_collected_secs
# for block devices
template: disk_last_collected_secs
on: disk.io
+families: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection of the block device
to: sysadmin
@@ -35,22 +37,24 @@ template: disk_last_collected_secs
template: disk_space_usage
on: disk.space
+families: *
calc: $used * 100 / ($avail + $used)
units: %
every: 1m
- warn: $this > (($status >= $WARNING ) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ warn: $this > (($status >= $WARNING ) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
info: current disk space usage
to: sysadmin
template: disk_inode_usage
on: disk.inodes
+families: *
calc: $used * 100 / ($avail + $used)
units: %
every: 1m
- warn: $this > (($status >= $WARNING) ? (75) : (80))
- crit: $this > (($status == $CRITICAL) ? (90) : (95))
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
info: current disk inode usage
to: sysadmin
@@ -69,6 +73,7 @@ template: disk_inode_usage
template: disk_fill_rate
on: disk.space
+families: *
lookup: min -10m at -50m unaligned of avail
calc: ($this - $avail) / (($now - $after) / 3600)
every: 1m
@@ -82,7 +87,8 @@ template: disk_fill_rate
template: out_of_disk_space_time
on: disk.space
- calc: $avail / $disk_fill_rate
+families: *
+ calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (0)
units: hours
every: 10s
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
@@ -101,6 +107,7 @@ template: out_of_disk_space_time
template: 10min_disk_utilization
on: disk.util
+families: *
lookup: average -10m unaligned
units: %
every: 1m
@@ -120,6 +127,7 @@ template: 10min_disk_utilization
template: 10min_disk_backlog
on: disk.backlog
+families: *
lookup: average -10m unaligned
units: ms
every: 1m
diff --git a/conf.d/health.d/elasticsearch.conf b/conf.d/health.d/elasticsearch.conf
new file mode 100644
index 000000000..dffd40965
--- /dev/null
+++ b/conf.d/health.d/elasticsearch.conf
@@ -0,0 +1,9 @@
+ alarm: elasticsearch_last_collected
+ on: elasticsearch_local.cluster_health_status
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
index d0eca8a6c..5dd8af502 100644
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -3,12 +3,12 @@
# the alarm is checked every 5 minutes
# and examines the last 10 minutes of data
- alarm: 1hour_lowest_entropy
+ alarm: lowest_entropy
on: system.entropy
- lookup: min -1h unaligned
+ lookup: min -10m unaligned
units: entries
every: 5m
warn: $this < (($status >= $WARNING) ? (200) : (100))
- delay: down 1h multiplier 1.5 max 1h
- info: minimum entries in the random numbers pool in the last 30 minutes
+ delay: down 1h multiplier 1.5 max 2h
+ info: minimum entries in the random numbers pool in the last 10 minutes
to: silent
diff --git a/conf.d/health.d/haproxy.conf b/conf.d/health.d/haproxy.conf
new file mode 100644
index 000000000..e49c70d48
--- /dev/null
+++ b/conf.d/health.d/haproxy.conf
@@ -0,0 +1,27 @@
+template: haproxy_backend_server_status
+ on: haproxy_hs.down
+ units: failed servers
+ every: 10s
+ lookup: average -10s
+ crit: $this > 0
+ info: number of failed haproxy backend servers
+ to: sysadmin
+
+template: haproxy_backend_status
+ on: haproxy_hb.down
+ units: failed backend
+ every: 10s
+ lookup: average -10s
+ crit: $this > 0
+ info: number of failed haproxy backends
+ to: sysadmin
+
+template: haproxy_last_collected
+ on: haproxy_hb.down
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin
diff --git a/conf.d/health.d/ipc.conf b/conf.d/health.d/ipc.conf
new file mode 100644
index 000000000..ee7c4badd
--- /dev/null
+++ b/conf.d/health.d/ipc.conf
@@ -0,0 +1,22 @@
+
+ alarm: semaphores_used
+ on: system.ipc_semaphores
+ calc: $semaphores * 100 / $ipc.semaphores.max
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (70) : (90))
+ delay: down 5m multiplier 1.5 max 1h
+ info: the percentage of IPC semaphores used
+ to: sysadmin
+
+ alarm: semaphore_arrays_used
+ on: system.ipc_semaphore_arrays
+ calc: $arrays * 100 / $ipc.semaphores.arrays.max
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (70) : (90))
+ delay: down 5m multiplier 1.5 max 1h
+ info: the percentage of IPC semaphore arrays used
+ to: sysadmin
diff --git a/conf.d/health.d/ipfs.conf b/conf.d/health.d/ipfs.conf
new file mode 100644
index 000000000..3f77572d6
--- /dev/null
+++ b/conf.d/health.d/ipfs.conf
@@ -0,0 +1,11 @@
+
+template: ipfs_datastore_usage
+ on: ipfs.repo_size
+ calc: $size * 100 / $avail
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: ipfs Datastore close to running out of space
+ to: sysadmin
diff --git a/conf.d/health.d/isc_dhcpd.conf b/conf.d/health.d/isc_dhcpd.conf
new file mode 100644
index 000000000..4345619aa
--- /dev/null
+++ b/conf.d/health.d/isc_dhcpd.conf
@@ -0,0 +1,10 @@
+ alarm: isc_dhcpd_parse_time
+ on: isc_dhcpd.parse_time
+ units: ms
+ every: 60
+ calc: $ptime
+ warn: $this > 100
+ crit: $this > 250
+ delay: up 2m down 5m
+ info: Parsing too slow! It can slow down your server. Check dhcpd.leases file size.
+ to: sysadmin
diff --git a/conf.d/health.d/mdstat.conf b/conf.d/health.d/mdstat.conf
new file mode 100644
index 000000000..c9e7d20db
--- /dev/null
+++ b/conf.d/health.d/mdstat.conf
@@ -0,0 +1,18 @@
+template: mdstat_disks
+ on: md.disks
+ units: failed devices
+ every: 10s
+ calc: $total - $inuse
+ crit: $this > 0
+ info: Array is degraded!
+ to: sysadmin
+
+template: mdstat_last_collected
+ on: md.disks
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf
index 46a8ca0e5..7917e36af 100644
--- a/conf.d/health.d/memcached.conf
+++ b/conf.d/health.d/memcached.conf
@@ -6,8 +6,8 @@ template: memcached_last_collected_secs
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: dba
@@ -42,7 +42,7 @@ template: cache_fill_rate
template: out_of_cache_space_time
on: memcached.cache
- calc: $available / $cache_fill_rate
+ calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (0)
units: hours
every: 10s
warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf
new file mode 100644
index 000000000..3c904f6b1
--- /dev/null
+++ b/conf.d/health.d/memory.conf
@@ -0,0 +1,30 @@
+
+# Warn on any correctable ECC error seen in the lookup window.
+# NOTE: the alarm name keeps its historical "1hour_" prefix so existing
+# references keep working; the window actually evaluated is 10 minutes.
+ alarm: 1hour_ecc_memory_correctable
+ on: mem.ecc_ce
+ lookup: sum -10m unaligned
+ units: errors
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: number of ECC correctable errors during the last 10 minutes
+ to: sysadmin
+
+# Raise CRITICAL on any uncorrectable ECC error seen in the lookup window.
+# NOTE: the alarm name keeps its historical "1hour_" prefix so existing
+# references keep working; the window actually evaluated is 10 minutes.
+ alarm: 1hour_ecc_memory_uncorrectable
+ on: mem.ecc_ue
+ lookup: sum -10m unaligned
+ units: errors
+ every: 1m
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: number of ECC uncorrectable errors during the last 10 minutes
+ to: sysadmin
+
+ alarm: 1hour_memory_hw_corrupted
+ on: mem.hwcorrupt
+ calc: $HardwareCorrupted
+ units: MB
+ every: 10s
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: amount of memory corrupted due to a hardware failure
+ to: sysadmin
diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf
index a2cfa3ec5..78773e5b5 100644
--- a/conf.d/health.d/mysql.conf
+++ b/conf.d/health.d/mysql.conf
@@ -6,8 +6,80 @@ template: mysql_last_collected_secs
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: dba
+
+
+# -----------------------------------------------------------------------------
+# slow queries
+
+template: mysql_10s_slow_queries
+ on: mysql.queries
+ lookup: sum -10s of slow_queries
+ units: slow queries
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (10) : (20))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of mysql slow queries over the last 10 seconds
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# lock waits
+
+template: mysql_10s_table_locks_immediate
+ on: mysql.table_locks
+ lookup: sum -10s absolute of immediate
+ units: immediate locks
+ every: 10s
+ info: number of table immediate locks over the last 10 seconds
+ to: dba
+
+template: mysql_10s_table_locks_waited
+ on: mysql.table_locks
+ lookup: sum -10s absolute of waited
+ units: waited locks
+ every: 10s
+ info: number of table waited locks over the last 10 seconds
+ to: dba
+
+# Percentage of table locks that had to wait, over the last 10 seconds.
+# The denominator (waited + immediate) is 0 when no table locks were
+# requested in the window, so guard the division and report 0 in that case.
+template: mysql_10s_waited_locks_ratio
+ on: mysql.table_locks
+ calc: (($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : (0)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (10) : (25))
+ crit: $this > (($status == $CRITICAL) ? (25) : (50))
+ delay: down 30m multiplier 1.5 max 1h
+ info: the ratio of mysql waited table locks, for the last 10 seconds
+ to: dba
+
+
+# -----------------------------------------------------------------------------
+# replication
+
+template: mysql_replication
+ on: mysql.slave_status
+ calc: ($sql_running == -1 OR $io_running == -1)?0:1
+ units: status
+ every: 10s
+ crit: $this == 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: checks if mysql replication has stopped
+ to: dba
+
+template: mysql_replication_lag
+ on: mysql.slave_behind
+ calc: $seconds
+ units: seconds
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (10) : (30))
+ delay: down 15m multiplier 1.5 max 1h
+ info: the number of seconds mysql replication is behind this master
+ to: dba
+
diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf
index f2eaa83c7..4fc65c8ee 100644
--- a/conf.d/health.d/named.conf
+++ b/conf.d/health.d/named.conf
@@ -6,8 +6,8 @@ template: named_last_collected_secs
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: domainadmin
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index 7753aa184..924acccc3 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -3,46 +3,119 @@
template: interface_last_collected_secs
on: net.net
+families: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: sysadmin
# -----------------------------------------------------------------------------
+# dropped packets
# check if an interface is dropping packets
# the alarm is checked every 1 minute
-# and examines the last hour of data
+# and examines the last 10 minutes of data
-template: 1hour_packet_drops
+template: inbound_packets_dropped
on: net.drops
- lookup: sum -1h unaligned absolute
+families: *
+ lookup: sum -10m unaligned absolute of inbound
units: packets
every: 1m
warn: $this > 0
- delay: down 30m multiplier 1.5 max 1h
- info: interface dropped packets in the last hour
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound dropped packets in the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_dropped
+ on: net.drops
+families: *
+ lookup: sum -10m unaligned absolute of outbound
+ units: packets
+ every: 1m
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound dropped packets in the last 10 minutes
+ to: sysadmin
+
+template: inbound_packets_dropped_ratio
+ on: net.packets
+families: *
+ lookup: sum -10m unaligned absolute of received
+ calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this > 0.5
+ crit: $this > 3
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_dropped_ratio
+ on: net.packets
+families: *
+ lookup: sum -10m unaligned absolute of sent
+ calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
+ units: %
+ every: 1m
+ warn: $this > 0.5
+ crit: $this > 3
+ delay: down 1h multiplier 1.5 max 2h
+ info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
to: sysadmin
# -----------------------------------------------------------------------------
+# FIFO errors
# check if an interface is having FIFO
# buffer errors
# the alarm is checked every 1 minute
-# and examines the last hour of data
+# and examines the last 10 minutes of data
-template: 1hour_fifo_errors
+template: 10min_fifo_errors
on: net.fifo
- lookup: sum -1h unaligned absolute
+families: *
+ lookup: sum -10m unaligned absolute
units: errors
every: 1m
warn: $this > 0
- delay: down 30m multiplier 1.5 max 1h
- info: interface fifo errors in the last hour
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface fifo errors in the last 10 minutes
to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is 10x or 20x the former
+# we assume the minimum packet storm should at least have
+# 10000 packets/s, average of the last 10 seconds
+
+template: 1m_received_packets_rate
+ on: net.packets
+families: *
+ lookup: average -1m of received
+ units: packets
+ every: 10s
+ info: the average number of packets received during the last minute
+
+# Compare the 10-second receive rate against the 1-minute rate (floored at
+# 1000 packets/s) and express it as a percentage.
+# Hysteresis: the lower threshold of each pair applies only while the alarm
+# is already in that state, so crit must test for CRITICAL (not WARNING);
+# otherwise a steady 10x-20x rate escalates from WARNING to CRITICAL.
+template: 10s_received_packets_storm
+ on: net.packets
+families: *
+ lookup: average -10s of received
+ calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
+ every: 10s
+ units: %
+ warn: $this > (($status >= $WARNING)?(200):(1000))
+ crit: $this > (($status == $CRITICAL)?(1000):(2000))
+ info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute
+ to: silent
+
diff --git a/conf.d/health.d/netfilter.conf b/conf.d/health.d/netfilter.conf
new file mode 100644
index 000000000..3dd6a67b3
--- /dev/null
+++ b/conf.d/health.d/netfilter.conf
@@ -0,0 +1,23 @@
+
+ alarm: netfilter_last_collected_secs
+ on: netfilter.conntrack_sockets
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+ alarm: netfilter_conntrack_full
+ on: netfilter.conntrack_sockets
+ lookup: max -10s unaligned of connections
+ calc: $this * 100 / $netfilter.conntrack.max
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 5m multiplier 1.5 max 1h
+ info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
+ to: sysadmin
diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf
index d70d6a59b..a686c3d99 100644
--- a/conf.d/health.d/nginx.conf
+++ b/conf.d/health.d/nginx.conf
@@ -6,8 +6,8 @@ template: nginx_last_collected_secs
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: webmaster
diff --git a/conf.d/health.d/postgres.conf b/conf.d/health.d/postgres.conf
new file mode 100644
index 000000000..4e0583b85
--- /dev/null
+++ b/conf.d/health.d/postgres.conf
@@ -0,0 +1,13 @@
+
+# make sure postgres is running
+
+template: postgres_last_collected_secs
+ on: postgres.db_stat_transactions
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index 216b82fed..d60df75b2 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -4,8 +4,8 @@
calc: $used * 100 / ($used + $cached + $free)
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: system RAM usage
to: sysadmin
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
index 3e648d85d..5f6d397ea 100644
--- a/conf.d/health.d/redis.conf
+++ b/conf.d/health.d/redis.conf
@@ -6,8 +6,8 @@ template: redis_last_collected_secs
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: dba
diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf
index 1af7b4686..2344b60ec 100644
--- a/conf.d/health.d/retroshare.conf
+++ b/conf.d/health.d/retroshare.conf
@@ -5,8 +5,8 @@ template: retroshare_last_collected_secs
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: sysadmin
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
index 0c3709f46..5faf9a9ee 100644
--- a/conf.d/health.d/softnet.conf
+++ b/conf.d/health.d/softnet.conf
@@ -1,21 +1,21 @@
# check for common /proc/net/softnet_stat errors
- alarm: 1hour_netdev_backlog_exceeded
+ alarm: 10min_netdev_backlog_exceeded
on: system.softnet_stat
- lookup: sum -1h unaligned absolute of dropped
+ lookup: sum -10m unaligned absolute of dropped
units: packets
every: 1m
warn: $this > 0
- delay: down 30m multiplier 1.5 max 1h
- info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+ delay: down 1h multiplier 1.5 max 2h
+ info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
to: sysadmin
- alarm: 1hour_netdev_budget_ran_outs
+ alarm: 10min_netdev_budget_ran_outs
on: system.softnet_stat
- lookup: sum -1h unaligned absolute of squeezed
+ lookup: sum -10m unaligned absolute of squeezed
units: events
every: 1m
- warn: $this > 0
- delay: down 30m multiplier 1.5 max 1h
- info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
+ delay: down 1h multiplier 1.5 max 2h
+ info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
to: silent
diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf
index 76143c5d7..06cc9678f 100644
--- a/conf.d/health.d/squid.conf
+++ b/conf.d/health.d/squid.conf
@@ -6,8 +6,8 @@ template: squid_last_collected_secs
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: proxyadmin
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
index 0cfa888c4..7f57560e2 100644
--- a/conf.d/health.d/swap.conf
+++ b/conf.d/health.d/swap.conf
@@ -6,13 +6,13 @@
calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
units: % of RAM
every: 1m
- warn: $this > (($status >= $WARNING) ? (5) : (10))
- crit: $this > (($status == $CRITICAL) ? (15) : (20))
+ warn: $this > (($status >= $WARNING) ? (10) : (20))
+ crit: $this > (($status == $CRITICAL) ? (20) : (30))
delay: up 0 down 15m multiplier 1.5 max 1h
info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
to: sysadmin
- alarm: used_swap_space
+ alarm: ram_in_swap
on: system.swap
calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
units: % of RAM
@@ -22,3 +22,14 @@
delay: up 0 down 15m multiplier 1.5 max 1h
info: the swap memory used, as a percentage of the system RAM
to: sysadmin
+
+ alarm: used_swap
+ on: system.swap
+ calc: $used * 100 / ( $used + $free )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: the percentage of swap memory used
+ to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
index 8e93c4793..daf24a1cd 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/conf.d/health.d/tcp_resets.conf
@@ -5,28 +5,48 @@
calc: $now - $last_collected_t
units: seconds ago
every: 10s
- warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: up 0 down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: sysadmin
# -----------------------------------------------------------------------------
+# tcp resets this host sends
- alarm: 1m_ipv4_tcp_resets
+ alarm: 1m_ipv4_tcp_resets_sent
on: ipv4.tcphandshake
lookup: average -1m at -10s unaligned absolute of OutRsts
units: tcp resets/s
every: 10s
info: average TCP RESETS this host is sending, over the last minute
- alarm: 10s_ipv4_tcp_resets
+ alarm: 10s_ipv4_tcp_resets_sent
on: ipv4.tcphandshake
lookup: average -10s unaligned absolute of OutRsts
units: tcp resets/s
every: 10s
- warn: $this > ((($1m_ipv4_tcp_resets < 5)?(5):($1m_ipv4_tcp_resets)) * (($status >= $WARNING) ? (1) : (4)))
+ warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (4)))
delay: up 0 down 60m multiplier 1.2 max 2h
info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed)
- to: sysadmin
+ to: silent
+
+# -----------------------------------------------------------------------------
+# tcp resets this host receives
+
+# Baseline for the 10s "received" alarm: 1-minute average of AttemptFails,
+# ending 10 seconds ago.
+ alarm: 1m_ipv4_tcp_resets_received
+ on: ipv4.tcphandshake
+ lookup: average -1m at -10s unaligned absolute of AttemptFails
+ units: tcp resets/s
+ every: 10s
+ info: average TCP RESETS this host is receiving, over the last minute
+ alarm: 10s_ipv4_tcp_resets_received
+ on: ipv4.tcphandshake
+ lookup: average -10s unaligned absolute of AttemptFails
+ units: tcp resets/s
+ every: 10s
+ warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (4)))
+ delay: up 0 down 60m multiplier 1.2 max 2h
+ info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed)
+ to: silent
diff --git a/conf.d/health.d/udp_errors.conf b/conf.d/health.d/udp_errors.conf
new file mode 100644
index 000000000..98e955c02
--- /dev/null
+++ b/conf.d/health.d/udp_errors.conf
@@ -0,0 +1,40 @@
+# -----------------------------------------------------------------------------
+
+ alarm: ipv4_udperrors_last_collected_secs
+ on: ipv4.udperrors
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+# UDP receive buffer errors
+
+ alarm: 1m_ipv4_udp_receive_buffer_errors
+ on: ipv4.udperrors
+ lookup: sum -1m unaligned absolute of RcvbufErrors
+ units: errors
+ every: 10s
+ warn: $this > 0
+ crit: $this > 100
+ info: number of UDP receive buffer errors during the last minute
+ delay: up 0 down 60m multiplier 1.2 max 2h
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+# UDP send buffer errors
+
+ alarm: 1m_ipv4_udp_send_buffer_errors
+ on: ipv4.udperrors
+ lookup: sum -1m unaligned absolute of SndbufErrors
+ units: errors
+ every: 10s
+ warn: $this > 0
+ crit: $this > 100
+ info: number of UDP send buffer errors during the last minute
+ delay: up 0 down 60m multiplier 1.2 max 2h
+ to: sysadmin
diff --git a/conf.d/health.d/varnish.conf b/conf.d/health.d/varnish.conf
new file mode 100644
index 000000000..cca7446b4
--- /dev/null
+++ b/conf.d/health.d/varnish.conf
@@ -0,0 +1,9 @@
+ alarm: varnish_last_collected
+ on: varnish.uptime
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin