From 87649cf32bd0e14d5a903fb85b01e9f41a253540 Mon Sep 17 00:00:00 2001 From: Federico Ceratto Date: Wed, 23 Nov 2016 15:49:10 +0000 Subject: New upstream version 1.4.0+dfsg --- conf.d/health.d/apache.conf | 9 ++-- conf.d/health.d/cpu.conf | 37 ++++++++++------ conf.d/health.d/disks.conf | 95 ++++++++++++++++++++++++++++++----------- conf.d/health.d/entropy.conf | 15 ++++--- conf.d/health.d/memcached.conf | 42 ++++++++++-------- conf.d/health.d/mysql.conf | 13 ++++++ conf.d/health.d/named.conf | 8 ++-- conf.d/health.d/net.conf | 51 +++++++++++++++------- conf.d/health.d/nginx.conf | 8 ++-- conf.d/health.d/qos.conf | 2 + conf.d/health.d/ram.conf | 10 +++-- conf.d/health.d/redis.conf | 8 ++-- conf.d/health.d/retroshare.conf | 25 +++++++++++ conf.d/health.d/softnet.conf | 21 +++++++++ conf.d/health.d/squid.conf | 8 ++-- conf.d/health.d/swap.conf | 22 ++++++---- conf.d/health.d/tcp_resets.conf | 32 ++++++++++++++ 17 files changed, 299 insertions(+), 107 deletions(-) create mode 100644 conf.d/health.d/mysql.conf create mode 100644 conf.d/health.d/retroshare.conf create mode 100644 conf.d/health.d/softnet.conf create mode 100644 conf.d/health.d/tcp_resets.conf (limited to 'conf.d/health.d') diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf index 1fddbc99f..0aaf0e003 100644 --- a/conf.d/health.d/apache.conf +++ b/conf.d/health.d/apache.conf @@ -4,10 +4,11 @@ template: apache_last_collected_secs on: apache.requests calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection - + to: webmaster diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf index 9332e508a..4d79fc799 100644 --- a/conf.d/health.d/cpu.conf +++ b/conf.d/health.d/cpu.conf @@ -1,24 +1,33 @@ -template: 5min_cpu_pcent +template: 10min_cpu_usage on: system.cpu - lookup: average -5m unaligned of user,system,nice,softirq,irq,guest,guest_nice - every: 1m - warn: $this > 90 + lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice units: % - info: average cpu utilization for the last 5 minutes + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h + info: average cpu utilization for the last 10 minutes + to: sysadmin -template: 5min_iowait_cpu_pcent +template: 10min_cpu_iowait on: system.cpu - lookup: average -5m unaligned of iowait - every: 1m - warn: $this > 10 + lookup: average -10m unaligned of iowait units: % - info: average wait I/O for the last 5 minutes + every: 1m + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? (20) : (30)) + delay: down 15m multiplier 1.5 max 1h + info: average CPU wait I/O for the last 10 minutes + to: sysadmin -template: 20min_steal_cpu_pcent +template: 20min_steal_cpu on: system.cpu lookup: average -20m unaligned of steal - every: 5m - warn: $this > 10 units: % - info: average stolen CPU time for the last 20 minutes + every: 5m + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? 
(20) : (30)) + delay: down 15m multiplier 1.5 max 1h + info: average CPU steal time for the last 20 minutes + to: sysadmin diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf index c38f1a0a0..cc7a47660 100644 --- a/conf.d/health.d/disks.conf +++ b/conf.d/health.d/disks.conf @@ -1,3 +1,31 @@ +# ----------------------------------------------------------------------------- +# make sure we collect values for each disk + +# for mount points +template: disk_space_last_collected_secs + on: disk.space + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection of the mount point + to: sysadmin + +# for block devices +template: disk_last_collected_secs + on: disk.io + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection of the block device + to: sysadmin + + # ----------------------------------------------------------------------------- # low disk space @@ -5,14 +33,27 @@ # raise an alarm if the disk is low on # available disk space -template: disk_full_percent +template: disk_space_usage on: disk.space calc: $used * 100 / ($avail + $used) - every: 1m - warn: $this > 80 - crit: $this > 95 units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: up 1m down 15m multiplier 1.5 max 1h info: current disk space usage + to: sysadmin + +template: disk_inode_usage + on: disk.inodes + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? 
(75) : (80)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) + delay: up 1m down 15m multiplier 1.5 max 1h + info: current disk inode usage + to: sysadmin # ----------------------------------------------------------------------------- @@ -20,7 +61,7 @@ template: disk_full_percent # calculate the rate the disk fills # use as base, the available space change -# during the last 30 minutes +# during the last hour # this is just a calculation - it has no alarm # we will use it in the next template to find @@ -28,25 +69,27 @@ template: disk_full_percent template: disk_fill_rate on: disk.space - lookup: max -1s at -30m unaligned of avail - calc: ($this - $avail) / ($now - $after) - every: 15s - units: MB/s - info: average rate the disk fills up (positive), or frees up (negative) space, for the last 30 minutes + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: GB/hour + info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour # calculate the hours remaining # if the disk continues to fill # in this rate -template: disk_full_after_hours +template: out_of_disk_space_time on: disk.space - calc: $avail / $disk_fill_rate / 3600 - every: 10s - warn: $this > 0 and $this < 48 - crit: $this > 0 and $this < 24 + calc: $avail / $disk_fill_rate units: hours - info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last 30 minutes + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) + delay: down 15m multiplier 1.2 max 1h + info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour + to: sysadmin # ----------------------------------------------------------------------------- @@ -59,13 +102,15 @@ template: disk_full_after_hours template: 10min_disk_utilization on: disk.util lookup: average -10m unaligned + units: % every: 1m green: 90 red: 98 - warn: $this > $green - crit: $this > $red - units: % + warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) + crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h info: the percentage of time the disk was busy, during the last 10 minutes + to: sysadmin # raise an alarm if the disk backlog @@ -76,10 +121,12 @@ template: 10min_disk_utilization template: 10min_disk_backlog on: disk.backlog lookup: average -10m unaligned - every: 1m - green: 1000 - red: 2000 - warn: $this > $green - crit: $this > $red units: ms + every: 1m + green: 2000 + red: 5000 + warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) + crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h info: average of the kernel estimated disk backlog, for the last 10 minutes + to: sysadmin diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf index 6f8b6e851..d0eca8a6c 100644 --- a/conf.d/health.d/entropy.conf +++ b/conf.d/health.d/entropy.conf @@ -1,13 +1,14 @@ # check if entropy is too low # the alarm is checked every 1 minute -# and examines the last 30 minutes of data +# and examines the last hour of data - alarm: min_30min_entropy + alarm: 1hour_lowest_entropy on: system.entropy - lookup: min -30m unaligned - every: 1m - warn: $this < 200 - crit: $this < 100 + lookup: min -1h unaligned units: entries - info: minimum entries in the random numbers pool (entropy), for the last 30 minutes + every: 5m + warn: $this < (($status >= $WARNING) ? 
(200) : (100)) + delay: down 1h multiplier 1.5 max 1h + info: minimum entries in the random numbers pool in the last hour + to: silent diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf index 05ff14711..46a8ca0e5 100644 --- a/conf.d/health.d/memcached.conf +++ b/conf.d/health.d/memcached.conf @@ -4,43 +4,49 @@ template: memcached_last_collected_secs on: memcached.cache calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: dba # detect if memcached cache is full -template: cache_full_pcent +template: memcached_cache_memory_usage on: memcached.cache calc: $used * 100 / ($used + $available) - every: 10s - warn: $this > 80 - crit: $this > 90 units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ?
(80) : (90)) + delay: up 0 down 15m multiplier 1.5 max 1h info: current cache memory usage + to: dba # find the rate memcached cache is filling template: cache_fill_rate on: memcached.cache - lookup: max -1s at -30m unaligned of available - calc: ($this - $available) / ($now - $after) - every: 15s - units: KB/s - info: average rate the cache fills up (positive), or frees up (negative) space, for the last 30 minutes + lookup: min -10m at -50m unaligned of available + calc: ($this - $available) / (($now - $after) / 3600) + units: KB/hour + every: 1m + info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour # find the hours remaining until memcached cache is full -template: cache_full_after_hours +template: out_of_cache_space_time on: memcached.cache - calc: $available / $cache_fill_rate / 3600 - every: 10s - warn: $this > 0 and $this < 48 - crit: $this > 0 and $this < 24 + calc: $available / $cache_fill_rate units: hours - info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last 30 minutes + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.5 max 1h + info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour + to: dba diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf new file mode 100644 index 000000000..a2cfa3ec5 --- /dev/null +++ b/conf.d/health.d/mysql.conf @@ -0,0 +1,13 @@ + +# make sure mysql is running + +template: mysql_last_collected_secs + on: mysql.queries + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf index e46d1d330..f2eaa83c7 100644 --- a/conf.d/health.d/named.conf +++ b/conf.d/health.d/named.conf @@ -4,9 +4,11 @@ template: named_last_collected_secs on: named.global_queries calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: domainadmin diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf index f65bc4fcb..7753aa184 100644 --- a/conf.d/health.d/net.conf +++ b/conf.d/health.d/net.conf @@ -1,27 +1,48 @@ +# ----------------------------------------------------------------------------- +# make sure we collect values for each interface + +template: interface_last_collected_secs + on: net.net + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + + +# ----------------------------------------------------------------------------- # check if an interface is dropping packets -# the alarm is checked every 10 seconds -# and examines the last 30 minutes of data +# the alarm is checked every 1 minute +# and examines the last hour of data -template: 30min_packet_drops +template: 1hour_packet_drops on: net.drops - lookup: sum -30m unaligned absolute - every: 1m - crit: $this > 0 + lookup: sum -1h unaligned absolute units: packets - info: dropped packets in the last 30 minutes + every: 1m + warn: $this > 0 + delay: down 30m multiplier 1.5 max 1h + info: interface dropped packets in the last hour + to: sysadmin +# ----------------------------------------------------------------------------- + # check if an interface is having FIFO # buffer errors -# the alarm is checked every 10 seconds -# and examines the last 30 minutes of data +# the alarm is checked every 1 minute +# and examines the last hour of data -template: 30min_fifo_errors +template: 1hour_fifo_errors on: net.fifo - lookup: sum -30m unaligned absolute - every: 1m - crit: $this > 0 + lookup: sum -1h unaligned absolute units: errors - info: network interface fifo errors in the last 30 minutes - + every: 1m + warn: $this > 0 + delay: down 30m multiplier 1.5 max 1h + info: interface fifo errors in the last hour + to: sysadmin diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf index da13008e3..d70d6a59b 100644 --- a/conf.d/health.d/nginx.conf +++ b/conf.d/health.d/nginx.conf @@ -4,9 +4,11 @@ template: nginx_last_collected_secs on: nginx.requests calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? 
(0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: webmaster diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf index ac3bf8ff4..9e5939fdc 100644 --- a/conf.d/health.d/qos.conf +++ b/conf.d/health.d/qos.conf @@ -8,5 +8,7 @@ # lookup: sum -10m unaligned absolute # every: 30s # warn: $this > 0 +# delay: up 0 down 30m multiplier 1.5 max 1h # units: packets # info: dropped packets in the last 30 minutes +# to: sysadmin diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf index 1d3681128..216b82fed 100644 --- a/conf.d/health.d/ram.conf +++ b/conf.d/health.d/ram.conf @@ -1,9 +1,11 @@ - alarm: used_ram_pcent + alarm: ram_in_use on: system.ram calc: $used * 100 / ($used + $cached + $free) - every: 10s - warn: $this > 80 - crit: $this > 90 units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h info: system RAM usage + to: sysadmin diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf index 3750176c5..3e648d85d 100644 --- a/conf.d/health.d/redis.conf +++ b/conf.d/health.d/redis.conf @@ -4,9 +4,11 @@ template: redis_last_collected_secs on: redis.operations calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: dba diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf new file mode 100644 index 000000000..1af7b4686 --- /dev/null +++ b/conf.d/health.d/retroshare.conf @@ -0,0 +1,25 @@ +# make sure RetroShare is running + +template: retroshare_last_collected_secs + on: retroshare.peers + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# make sure the DHT is fine when active + +template: retroshare_dht_working + on: retroshare.dht + calc: $dht_size_all + units: peers + every: 1m + warn: $this < (($status >= $WARNING) ? (120) : (100)) + crit: $this < (($status == $CRITICAL) ? 
(10) : (1)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: Checks if the DHT has enough peers to operate + to: sysadmin diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf new file mode 100644 index 000000000..0c3709f46 --- /dev/null +++ b/conf.d/health.d/softnet.conf @@ -0,0 +1,21 @@ +# check for common /proc/net/softnet_stat errors + + alarm: 1hour_netdev_backlog_exceeded + on: system.softnet_stat + lookup: sum -1h unaligned absolute of dropped + units: packets + every: 1m + warn: $this > 0 + delay: down 30m multiplier 1.5 max 1h + info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) + to: sysadmin + + alarm: 1hour_netdev_budget_ran_outs + on: system.softnet_stat + lookup: sum -1h unaligned absolute of squeezed + units: events + every: 1m + warn: $this > 0 + delay: down 30m multiplier 1.5 max 1h + info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets) + to: silent diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf index cc5ce1c3a..76143c5d7 100644 --- a/conf.d/health.d/squid.conf +++ b/conf.d/health.d/squid.conf @@ -4,9 +4,11 @@ template: squid_last_collected_secs on: squid.clients_requests calc: $now - $last_collected_t - every: 10s - warn: $this > ( 5 * $update_every) - crit: $this > (10 * $update_every) units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
(0) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection + to: proxyadmin diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf index 552dd310a..0cfa888c4 100644 --- a/conf.d/health.d/swap.conf +++ b/conf.d/health.d/swap.conf @@ -4,17 +4,21 @@ lookup: sum -30m unaligned absolute of out # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - every: 1m - warn: $this > 1 - crit: $this > 10 units: % of RAM - info: the sum of all memory swapped out during the last 30 minutes, as a percentage of the available RAM + every: 1m + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? (15) : (20)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: the amount of memory swapped out in the last 30 minutes, as a percentage of the system RAM + to: sysadmin - alarm: pcent_of_ram_in_swap + alarm: used_swap_space on: system.swap calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - every: 10s - warn: $this > 10 - crit: $this > 50 units: % of RAM - info: the currently used swap space, as a percentage of the available RAM + every: 10s + warn: $this > (($status >= $WARNING) ? (15) : (20)) + crit: $this > (($status == $CRITICAL) ? (40) : (50)) + delay: up 0 down 15m multiplier 1.5 max 1h + info: the swap memory used, as a percentage of the system RAM + to: sysadmin diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf new file mode 100644 index 000000000..8e93c4793 --- /dev/null +++ b/conf.d/health.d/tcp_resets.conf @@ -0,0 +1,32 @@ +# ----------------------------------------------------------------------------- + + alarm: ipv4_tcphandshake_last_collected_secs + on: ipv4.tcphandshake + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ?
(0) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +# ----------------------------------------------------------------------------- + + alarm: 1m_ipv4_tcp_resets + on: ipv4.tcphandshake + lookup: average -1m at -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + info: average TCP RESETS this host is sending, over the last minute + + alarm: 10s_ipv4_tcp_resets + on: ipv4.tcphandshake + lookup: average -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + warn: $this > ((($1m_ipv4_tcp_resets < 5)?(5):($1m_ipv4_tcp_resets)) * (($status >= $WARNING) ? (1) : (4))) + delay: up 0 down 60m multiplier 1.2 max 2h + info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed) + to: sysadmin + -- cgit v1.2.3