diff options
Diffstat (limited to 'conf.d/health.d')
29 files changed, 517 insertions, 67 deletions
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf index 0aaf0e003..0c98b8778 100644 --- a/conf.d/health.d/apache.conf +++ b/conf.d/health.d/apache.conf @@ -6,8 +6,8 @@ template: apache_last_collected_secs calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: webmaster diff --git a/conf.d/health.d/backend.conf b/conf.d/health.d/backend.conf new file mode 100644 index 000000000..9c193e7b9 --- /dev/null +++ b/conf.d/health.d/backend.conf @@ -0,0 +1,45 @@ + +# make sure we are sending data to backend + + alarm: backend_last_buffering + on: netdata.backend_metrics + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful buffering of backend data + to: dba + + alarm: backend_metrics_sent + on: netdata.backend_metrics + units: % + calc: abs($sent) * 100 / abs($buffered) + every: 10s + warn: $this != 100 + delay: down 5m multiplier 1.5 max 1h + info: percentage of metrics sent to the backend server + to: dba + + alarm: backend_metrics_lost + on: netdata.backend_metrics + units: metrics + calc: abs($lost) + every: 10s + crit: $this != 0 + delay: down 5m multiplier 1.5 max 1h + info: number of metrics lost due to repeating failures to contact the backend server + to: dba + +# this chart has been removed from netdata +# alarm: backend_slow +# on: netdata.backend_latency +# units: % +# calc: $latency * 100 / ($update_every * 1000) +# every: 10s +# warn: $this > 50 +# crit: $this > 100 +# delay: down 5m multiplier 1.5 max 1h +# info: the percentage of time between iterations needed by the backend time to process the data sent by netdata +# to: dba diff --git a/conf.d/health.d/bind_rndc.conf b/conf.d/health.d/bind_rndc.conf new file mode 100644 index 000000000..028bc9d08 --- /dev/null +++ b/conf.d/health.d/bind_rndc.conf @@ -0,0 +1,9 @@ + alarm: bind_rndc_stats_file_size + on: bind_rndc.stats_size + units: megabytes + every: 60 + calc: $stats_size + warn: $this > 512 + crit: $this > 1024 + info: Bind stats file is very large! Consider to create logrotate conf file for it! + to: sysadmin diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf index 4d79fc799..60f494d70 100644 --- a/conf.d/health.d/cpu.conf +++ b/conf.d/health.d/cpu.conf @@ -4,8 +4,8 @@ template: 10min_cpu_usage lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice units: % every: 1m - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (80) : (90)) + warn: $this > (($status >= $WARNING) ? 
(75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h info: average cpu utilization for the last 10 minutes to: sysadmin @@ -15,8 +15,8 @@ template: 10min_cpu_iowait lookup: average -10m unaligned of iowait units: % every: 1m - warn: $this > (($status >= $WARNING) ? (5) : (10)) - crit: $this > (($status == $CRITICAL) ? (20) : (30)) + warn: $this > (($status >= $WARNING) ? (20) : (40)) + crit: $this > (($status == $CRITICAL) ? (40) : (50)) delay: down 15m multiplier 1.5 max 1h info: average CPU wait I/O for the last 10 minutes to: sysadmin @@ -28,6 +28,6 @@ template: 20min_steal_cpu every: 5m warn: $this > (($status >= $WARNING) ? (5) : (10)) crit: $this > (($status == $CRITICAL) ? (20) : (30)) - delay: down 15m multiplier 1.5 max 1h + delay: down 1h multiplier 1.5 max 2h info: average CPU steal time for the last 20 minutes to: sysadmin diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf index cc7a47660..0549bac26 100644 --- a/conf.d/health.d/disks.conf +++ b/conf.d/health.d/disks.conf @@ -4,11 +4,12 @@ # for mount points template: disk_space_last_collected_secs on: disk.space +families: * calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection of the mount point to: sysadmin @@ -16,11 +17,12 @@ template: disk_space_last_collected_secs # for block devices template: disk_last_collected_secs on: disk.io +families: * calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection of the block device to: sysadmin @@ -35,22 +37,24 @@ template: disk_last_collected_secs template: disk_space_usage on: disk.space +families: * calc: $used * 100 / ($avail + $used) units: % every: 1m - warn: $this > (($status >= $WARNING ) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) + warn: $this > (($status >= $WARNING ) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h info: current disk space usage to: sysadmin template: disk_inode_usage on: disk.inodes +families: * calc: $used * 100 / ($avail + $used) units: % every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (80)) - crit: $this > (($status == $CRITICAL) ? (90) : (95)) + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h info: current disk inode usage to: sysadmin @@ -69,6 +73,7 @@ template: disk_inode_usage template: disk_fill_rate on: disk.space +families: * lookup: min -10m at -50m unaligned of avail calc: ($this - $avail) / (($now - $after) / 3600) every: 1m @@ -82,7 +87,8 @@ template: disk_fill_rate template: out_of_disk_space_time on: disk.space - calc: $avail / $disk_fill_rate +families: * + calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (0) units: hours every: 10s warn: $this > 0 and $this < (($status >= $WARNING) ? 
(48) : (8)) @@ -101,6 +107,7 @@ template: out_of_disk_space_time template: 10min_disk_utilization on: disk.util +families: * lookup: average -10m unaligned units: % every: 1m @@ -120,6 +127,7 @@ template: 10min_disk_utilization template: 10min_disk_backlog on: disk.backlog +families: * lookup: average -10m unaligned units: ms every: 1m diff --git a/conf.d/health.d/elasticsearch.conf b/conf.d/health.d/elasticsearch.conf new file mode 100644 index 000000000..dffd40965 --- /dev/null +++ b/conf.d/health.d/elasticsearch.conf @@ -0,0 +1,9 @@ + alarm: elasticsearch_last_collected + on: elasticsearch_local.cluster_health_status + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf index d0eca8a6c..5dd8af502 100644 --- a/conf.d/health.d/entropy.conf +++ b/conf.d/health.d/entropy.conf @@ -3,12 +3,12 @@ # the alarm is checked every 1 minute # and examines the last hour of data - alarm: 1hour_lowest_entropy + alarm: lowest_entropy on: system.entropy - lookup: min -1h unaligned + lookup: min -10m unaligned units: entries every: 5m warn: $this < (($status >= $WARNING) ? 
(200) : (100)) - delay: down 1h multiplier 1.5 max 1h - info: minimum entries in the random numbers pool in the last 30 minutes + delay: down 1h multiplier 1.5 max 2h + info: minimum entries in the random numbers pool in the last 10 minutes to: silent diff --git a/conf.d/health.d/haproxy.conf b/conf.d/health.d/haproxy.conf new file mode 100644 index 000000000..e49c70d48 --- /dev/null +++ b/conf.d/health.d/haproxy.conf @@ -0,0 +1,27 @@ +template: haproxy_backend_server_status + on: haproxy_hs.down + units: failed servers + every: 10s + lookup: average -10s + crit: $this > 0 + info: number of failed haproxy backend servers + to: sysadmin + +template: haproxy_backend_status + on: haproxy_hb.down + units: failed backend + every: 10s + lookup: average -10s + crit: $this > 0 + info: number of failed haproxy backends + to: sysadmin + +template: haproxy_last_collected + on: haproxy_hb.down + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/conf.d/health.d/ipc.conf b/conf.d/health.d/ipc.conf new file mode 100644 index 000000000..ee7c4badd --- /dev/null +++ b/conf.d/health.d/ipc.conf @@ -0,0 +1,22 @@ + + alarm: semaphores_used + on: system.ipc_semaphores + calc: $semaphores * 100 / $ipc.semaphores.max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (70) : (90)) + delay: down 5m multiplier 1.5 max 1h + info: the percentage of IPC semaphores used + to: sysadmin + + alarm: semaphore_arrays_used + on: system.ipc_semaphore_arrays + calc: $arrays * 100 / $ipc.semaphores.arrays.max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? 
(70) : (90)) + delay: down 5m multiplier 1.5 max 1h + info: the percentage of IPC semaphore arrays used + to: sysadmin diff --git a/conf.d/health.d/ipfs.conf b/conf.d/health.d/ipfs.conf new file mode 100644 index 000000000..3f77572d6 --- /dev/null +++ b/conf.d/health.d/ipfs.conf @@ -0,0 +1,11 @@ + +template: ipfs_datastore_usage + on: ipfs.repo_size + calc: $size * 100 / $avail + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: ipfs Datastore close to running out of space + to: sysadmin diff --git a/conf.d/health.d/isc_dhcpd.conf b/conf.d/health.d/isc_dhcpd.conf new file mode 100644 index 000000000..4345619aa --- /dev/null +++ b/conf.d/health.d/isc_dhcpd.conf @@ -0,0 +1,10 @@ + alarm: isc_dhcpd_parse_time + on: isc_dhcpd.parse_time + units: ms + every: 60 + calc: $ptime + warn: $this > 100 + crit: $this > 250 + delay: up 2m down 5m + info: Parsing too slow! It can slow down your server. Check dhcpd.leases file size. + to: sysadmin diff --git a/conf.d/health.d/mdstat.conf b/conf.d/health.d/mdstat.conf new file mode 100644 index 000000000..c9e7d20db --- /dev/null +++ b/conf.d/health.d/mdstat.conf @@ -0,0 +1,18 @@ +template: mdstat_disks + on: md.disks + units: failed devices + every: 10s + calc: $total - $inuse + crit: $this > 0 + info: Array is degraded! + to: sysadmin + +template: mdstat_last_collected + on: md.disks + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf index 46a8ca0e5..7917e36af 100644 --- a/conf.d/health.d/memcached.conf +++ b/conf.d/health.d/memcached.conf @@ -6,8 +6,8 @@ template: memcached_last_collected_secs calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: dba @@ -42,7 +42,7 @@ template: cache_fill_rate template: out_of_cache_space_time on: memcached.cache - calc: $available / $cache_fill_rate + calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (0) units: hours every: 10s warn: $this > 0 and $this < (($status >= $WARNING) ? 
(48) : (8)) diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf new file mode 100644 index 000000000..3c904f6b1 --- /dev/null +++ b/conf.d/health.d/memory.conf @@ -0,0 +1,30 @@ + + alarm: 1hour_ecc_memory_correctable + on: mem.ecc_ce + lookup: sum -10m unaligned + units: errors + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: number of ECC correctable errors during the last hour + to: sysadmin + + alarm: 1hour_ecc_memory_uncorrectable + on: mem.ecc_ue + lookup: sum -10m unaligned + units: errors + every: 1m + crit: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: number of ECC uncorrectable errors during the last hour + to: sysadmin + + alarm: 1hour_memory_hw_corrupted + on: mem.hwcorrupt + calc: $HardwareCorrupted + units: MB + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: amount of memory corrupted due to a hardware failure + to: sysadmin diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf index a2cfa3ec5..78773e5b5 100644 --- a/conf.d/health.d/mysql.conf +++ b/conf.d/health.d/mysql.conf @@ -6,8 +6,80 @@ template: mysql_last_collected_secs calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: dba + + +# ----------------------------------------------------------------------------- +# slow queries + +template: mysql_10s_slow_queries + on: mysql.queries + lookup: sum -10s of slow_queries + units: slow queries + every: 10s + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? 
(10) : (20)) + delay: down 5m multiplier 1.5 max 1h + info: number of mysql slow queries over the last 10 seconds + to: dba + + +# ----------------------------------------------------------------------------- +# lock waits + +template: mysql_10s_table_locks_immediate + on: mysql.table_locks + lookup: sum -10s absolute of immediate + units: immediate locks + every: 10s + info: number of table immediate locks over the last 10 seconds + to: dba + +template: mysql_10s_table_locks_waited + on: mysql.table_locks + lookup: sum -10s absolute of waited + units: waited locks + every: 10s + info: number of table waited locks over the last 10 seconds + to: dba + +template: mysql_10s_waited_locks_ratio + on: mysql.table_locks + calc: ($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (10) : (25)) + crit: $this > (($status == $CRITICAL) ? (25) : (50)) + delay: down 30m multiplier 1.5 max 1h + info: the ratio of mysql waited table locks, for the last 10 seconds + to: dba + + +# ----------------------------------------------------------------------------- +# replication + +template: mysql_replication + on: mysql.slave_status + calc: ($sql_running == -1 OR $io_running == -1)?0:1 + units: status + every: 10s + crit: $this == 0 + delay: down 5m multiplier 1.5 max 1h + info: checks if mysql replication has stopped + to: dba + +template: mysql_replication_lag + on: mysql.slave_behind + calc: $seconds + units: seconds + every: 10s + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? 
(10) : (30)) + delay: down 15m multiplier 1.5 max 1h + info: the number of seconds mysql replication is behind this master + to: dba + diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf index f2eaa83c7..4fc65c8ee 100644 --- a/conf.d/health.d/named.conf +++ b/conf.d/health.d/named.conf @@ -6,8 +6,8 @@ template: named_last_collected_secs calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: domainadmin diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf index 7753aa184..924acccc3 100644 --- a/conf.d/health.d/net.conf +++ b/conf.d/health.d/net.conf @@ -3,46 +3,119 @@ template: interface_last_collected_secs on: net.net +families: * calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: sysadmin # ----------------------------------------------------------------------------- +# dropped packets # check if an interface is dropping packets # the alarm is checked every 1 minute -# and examines the last hour of data +# and examines the last 10 minutes of data -template: 1hour_packet_drops +template: inbound_packets_dropped on: net.drops - lookup: sum -1h unaligned absolute +families: * + lookup: sum -10m unaligned absolute of inbound units: packets every: 1m warn: $this > 0 - delay: down 30m multiplier 1.5 max 1h - info: interface dropped packets in the last hour + delay: down 1h multiplier 1.5 max 2h + info: interface inbound dropped packets in the last 10 minutes + to: sysadmin + +template: outbound_packets_dropped + on: net.drops +families: * + lookup: sum -10m unaligned absolute of outbound + units: packets + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound dropped packets in the last 10 minutes + to: sysadmin + +template: inbound_packets_dropped_ratio + on: net.packets +families: * + lookup: sum -10m unaligned absolute of received + calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0)) + units: % + every: 1m + warn: $this > 0.5 + crit: $this > 3 + delay: down 1h multiplier 1.5 max 2h + info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes + to: sysadmin + +template: outbound_packets_dropped_ratio + on: net.packets +families: * + lookup: sum -10m unaligned absolute of sent + calc: (($outbound_packets_dropped != nan AND $this > 0) ? 
($outbound_packets_dropped * 100 / $this) : (0)) + units: % + every: 1m + warn: $this > 0.5 + crit: $this > 3 + delay: down 1h multiplier 1.5 max 2h + info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- +# FIFO errors # check if an interface is having FIFO # buffer errors # the alarm is checked every 1 minute -# and examines the last hour of data +# and examines the last 10 minutes of data -template: 1hour_fifo_errors +template: 10min_fifo_errors on: net.fifo - lookup: sum -1h unaligned absolute +families: * + lookup: sum -10m unaligned absolute units: errors every: 1m warn: $this > 0 - delay: down 30m multiplier 1.5 max 1h - info: interface fifo errors in the last hour + delay: down 1h multiplier 1.5 max 2h + info: interface fifo errors in the last 10 minutes to: sysadmin + + +# ----------------------------------------------------------------------------- +# check for packet storms + +# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +# 2. do the same for the last 10s +# 3. 
raise an alarm if the latter is 10x or 20x the first +# we assume the minimum packet storm should at least have +# 10000 packets/s, average of the last 10 seconds + +template: 1m_received_packets_rate + on: net.packets +families: * + lookup: average -1m of received + units: packets + every: 10s + info: the average number of packets received during the last minute + +template: 10s_received_packets_storm + on: net.packets +families: * + lookup: average -10s of received + calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(1000)) + crit: $this > (($status >= $WARNING)?(1000):(2000)) + info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute + to: silent + diff --git a/conf.d/health.d/netfilter.conf b/conf.d/health.d/netfilter.conf new file mode 100644 index 000000000..3dd6a67b3 --- /dev/null +++ b/conf.d/health.d/netfilter.conf @@ -0,0 +1,23 @@ + + alarm: netfilter_last_collected_secs + on: netfilter.conntrack_sockets + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + + alarm: netfilter_conntrack_full + on: netfilter.conntrack_sockets + lookup: max -10s unaligned of connections + calc: $this * 100 / $netfilter.conntrack.max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? 
(80) : (90)) + delay: down 5m multiplier 1.5 max 1h + info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size + to: sysadmin diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf index d70d6a59b..a686c3d99 100644 --- a/conf.d/health.d/nginx.conf +++ b/conf.d/health.d/nginx.conf @@ -6,8 +6,8 @@ template: nginx_last_collected_secs calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: webmaster diff --git a/conf.d/health.d/postgres.conf b/conf.d/health.d/postgres.conf new file mode 100644 index 000000000..4e0583b85 --- /dev/null +++ b/conf.d/health.d/postgres.conf @@ -0,0 +1,13 @@ + +# make sure postgres is running + +template: postgres_last_collected_secs + on: postgres.db_stat_transactions + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf index 216b82fed..d60df75b2 100644 --- a/conf.d/health.d/ram.conf +++ b/conf.d/health.d/ram.conf @@ -4,8 +4,8 @@ calc: $used * 100 / ($used + $cached + $free) units: % every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (80) : (90)) + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) delay: down 15m multiplier 1.5 max 1h info: system RAM usage to: sysadmin diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf index 3e648d85d..5f6d397ea 100644 --- a/conf.d/health.d/redis.conf +++ b/conf.d/health.d/redis.conf @@ -6,8 +6,8 @@ template: redis_last_collected_secs calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: dba diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf index 1af7b4686..2344b60ec 100644 --- a/conf.d/health.d/retroshare.conf +++ b/conf.d/health.d/retroshare.conf @@ -5,8 +5,8 @@ template: retroshare_last_collected_secs calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: sysadmin diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf index 0c3709f46..5faf9a9ee 100644 --- a/conf.d/health.d/softnet.conf +++ b/conf.d/health.d/softnet.conf @@ -1,21 +1,21 @@ # check for common /proc/net/softnet_stat errors - alarm: 1hour_netdev_backlog_exceeded + alarm: 10min_netdev_backlog_exceeded on: system.softnet_stat - lookup: sum -1h unaligned absolute of dropped + lookup: sum -10m unaligned absolute of dropped units: packets every: 1m warn: $this > 0 - delay: down 30m multiplier 1.5 max 1h - info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) + delay: down 1h multiplier 1.5 max 2h + info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) to: sysadmin - alarm: 1hour_netdev_budget_ran_outs + alarm: 10min_netdev_budget_ran_outs on: system.softnet_stat - lookup: sum -1h unaligned absolute of squeezed + lookup: sum -10m unaligned absolute of squeezed units: events every: 1m - warn: $this > 0 - delay: down 30m multiplier 1.5 max 1h - info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets) + warn: $this > (($status >= $WARNING) ? 
(0) : (10)) + delay: down 1h multiplier 1.5 max 2h + info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets) to: silent diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf index 76143c5d7..06cc9678f 100644 --- a/conf.d/health.d/squid.conf +++ b/conf.d/health.d/squid.conf @@ -6,8 +6,8 @@ template: squid_last_collected_secs calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: proxyadmin diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf index 0cfa888c4..7f57560e2 100644 --- a/conf.d/health.d/swap.conf +++ b/conf.d/health.d/swap.conf @@ -6,13 +6,13 @@ calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) units: % of RAM every: 1m - warn: $this > (($status >= $WARNING) ? (5) : (10)) - crit: $this > (($status == $CRITICAL) ? (15) : (20)) + warn: $this > (($status >= $WARNING) ? (10) : (20)) + crit: $this > (($status == $CRITICAL) ? 
(20) : (30)) delay: up 0 down 15m multiplier 1.5 max 1h info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM to: sysadmin - alarm: used_swap_space + alarm: ram_in_swap on: system.swap calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) units: % of RAM @@ -22,3 +22,14 @@ delay: up 0 down 15m multiplier 1.5 max 1h info: the swap memory used, as a percentage of the system RAM to: sysadmin + + alarm: used_swap + on: system.swap + calc: $used * 100 / ( $used + $free ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: the percentage of swap memory used + to: sysadmin diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf index 8e93c4793..daf24a1cd 100644 --- a/conf.d/health.d/tcp_resets.conf +++ b/conf.d/health.d/tcp_resets.conf @@ -5,28 +5,48 @@ calc: $now - $last_collected_t units: seconds ago every: 10s - warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) delay: up 0 down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: sysadmin # ----------------------------------------------------------------------------- +# tcp resets this host sends - alarm: 1m_ipv4_tcp_resets + alarm: 1m_ipv4_tcp_resets_sent on: ipv4.tcphandshake lookup: average -1m at -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s info: average TCP RESETS this host is sending, over the last minute - alarm: 10s_ipv4_tcp_resets + alarm: 10s_ipv4_tcp_resets_sent on: ipv4.tcphandshake lookup: average -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s - warn: $this > ((($1m_ipv4_tcp_resets < 5)?(5):($1m_ipv4_tcp_resets)) * (($status >= $WARNING) ? (1) : (4))) + warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (4))) delay: up 0 down 60m multiplier 1.2 max 2h info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed) - to: sysadmin + to: silent + +# ----------------------------------------------------------------------------- +# tcp resets this host receives + + alarm: 1m_ipv4_tcp_resets_received + on: ipv4.tcphandshake + lookup: average -1m at -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + info: average TCP RESETS this host is receiving, over the last minute + alarm: 10s_ipv4_tcp_resets_received + on: ipv4.tcphandshake + lookup: average -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? 
(1) : (4))) + delay: up 0 down 60m multiplier 1.2 max 2h + info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed) + to: silent diff --git a/conf.d/health.d/udp_errors.conf b/conf.d/health.d/udp_errors.conf new file mode 100644 index 000000000..98e955c02 --- /dev/null +++ b/conf.d/health.d/udp_errors.conf @@ -0,0 +1,40 @@ +# ----------------------------------------------------------------------------- + + alarm: ipv4_udperrors_last_collected_secs + on: ipv4.udperrors + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# ----------------------------------------------------------------------------- +# UDP receive buffer errors + + alarm: 1m_ipv4_udp_receive_buffer_errors + on: ipv4.udperrors + lookup: sum -1m unaligned absolute of RcvbufErrors + units: errors + every: 10s + warn: $this > 0 + crit: $this > 100 + info: number of UDP receive buffer errors during the last minute + delay: up 0 down 60m multiplier 1.2 max 2h + to: sysadmin + +# ----------------------------------------------------------------------------- +# UDP send buffer errors + + alarm: 1m_ipv4_udp_send_buffer_errors + on: ipv4.udperrors + lookup: sum -1m unaligned absolute of SndbufErrors + units: errors + every: 10s + warn: $this > 0 + crit: $this > 100 + info: number of UDP send buffer errors during the last minute + delay: up 0 down 60m multiplier 1.2 max 2h + to: sysadmin diff --git a/conf.d/health.d/varnish.conf b/conf.d/health.d/varnish.conf new file mode 100644 index 000000000..cca7446b4 --- /dev/null +++ b/conf.d/health.d/varnish.conf @@ -0,0 +1,9 @@ + alarm: varnish_last_collected + on: varnish.uptime 
+ calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin |