diff options
author | Lennart Weller <lhw@ring0.de> | 2016-09-05 08:27:26 +0000 |
---|---|---|
committer | Lennart Weller <lhw@ring0.de> | 2016-09-05 08:27:26 +0000 |
commit | 58d9525d7fcacffe52eff7282b7a888dd0dcc1d0 (patch) | |
tree | 251a805eb38d4d75b2a7f44c2cc22e7ea4849513 /conf.d/health.d | |
parent | Fixes for service startup and extra config files (diff) | |
parent | Imported Upstream version 1.3.0+dfsg (diff) | |
download | netdata-58d9525d7fcacffe52eff7282b7a888dd0dcc1d0.tar.xz netdata-58d9525d7fcacffe52eff7282b7a888dd0dcc1d0.zip |
Merge tag 'upstream/1.3.0+dfsg'
Upstream version 1.3.0+dfsg
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- | conf.d/health.d/apache.conf | 13 | ||||
-rw-r--r-- | conf.d/health.d/cpu.conf | 24 | ||||
-rw-r--r-- | conf.d/health.d/disks.conf | 85 | ||||
-rw-r--r-- | conf.d/health.d/entropy.conf | 13 | ||||
-rw-r--r-- | conf.d/health.d/memcached.conf | 46 | ||||
-rw-r--r-- | conf.d/health.d/named.conf | 12 | ||||
-rw-r--r-- | conf.d/health.d/net.conf | 27 | ||||
-rw-r--r-- | conf.d/health.d/nginx.conf | 12 | ||||
-rw-r--r-- | conf.d/health.d/qos.conf | 12 | ||||
-rw-r--r-- | conf.d/health.d/ram.conf | 9 | ||||
-rw-r--r-- | conf.d/health.d/redis.conf | 12 | ||||
-rw-r--r-- | conf.d/health.d/squid.conf | 12 | ||||
-rw-r--r-- | conf.d/health.d/swap.conf | 20 |
13 files changed, 297 insertions, 0 deletions
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf new file mode 100644 index 000000000..1fddbc99f --- /dev/null +++ b/conf.d/health.d/apache.conf @@ -0,0 +1,13 @@ + +# make sure apache is running + +template: apache_last_collected_secs + on: apache.requests + calc: $now - $last_collected_t + every: 10s + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) + units: seconds ago + info: number of seconds since the last successful data collection + + diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf new file mode 100644 index 000000000..9332e508a --- /dev/null +++ b/conf.d/health.d/cpu.conf @@ -0,0 +1,24 @@ + +template: 5min_cpu_pcent + on: system.cpu + lookup: average -5m unaligned of user,system,nice,softirq,irq,guest,guest_nice + every: 1m + warn: $this > 90 + units: % + info: average cpu utilization for the last 5 minutes + +template: 5min_iowait_cpu_pcent + on: system.cpu + lookup: average -5m unaligned of iowait + every: 1m + warn: $this > 10 + units: % + info: average wait I/O for the last 5 minutes + +template: 20min_steal_cpu_pcent + on: system.cpu + lookup: average -20m unaligned of steal + every: 5m + warn: $this > 10 + units: % + info: average stolen CPU time for the last 20 minutes diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf new file mode 100644 index 000000000..c38f1a0a0 --- /dev/null +++ b/conf.d/health.d/disks.conf @@ -0,0 +1,85 @@ +# ----------------------------------------------------------------------------- +# low disk space + +# checking the latest collected values +# raise an alarm if the disk is low on +# available disk space + +template: disk_full_percent + on: disk.space + calc: $used * 100 / ($avail + $used) + every: 1m + warn: $this > 80 + crit: $this > 95 + units: % + info: current disk space usage + + +# ----------------------------------------------------------------------------- +# disk fill rate + +# calculate the rate the disk fills +# use as base, the 
available space change +# during the last 30 minutes + +# this is just a calculation - it has no alarm +# we will use it in the next template to find +# the hours remaining + +template: disk_fill_rate + on: disk.space + lookup: max -1s at -30m unaligned of avail + calc: ($this - $avail) / ($now - $after) + every: 15s + units: MB/s + info: average rate the disk fills up (positive), or frees up (negative) space, for the last 30 minutes + + +# calculate the hours remaining +# if the disk continues to fill +# in this rate + +template: disk_full_after_hours + on: disk.space + calc: $avail / $disk_fill_rate / 3600 + every: 10s + warn: $this > 0 and $this < 48 + crit: $this > 0 and $this < 24 + units: hours + info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last 30 minutes + + +# ----------------------------------------------------------------------------- +# disk congestion + +# raise an alarm if the disk is congested +# by calculating the average disk utilization +# for the last 10 minutes + +template: 10min_disk_utilization + on: disk.util + lookup: average -10m unaligned + every: 1m + green: 90 + red: 98 + warn: $this > $green + crit: $this > $red + units: % + info: the percentage of time the disk was busy, during the last 10 minutes + + +# raise an alarm if the disk backlog +# is above 1000ms (1s) per second +# for 10 minutes +# (i.e. 
the disk cannot catch up) + +template: 10min_disk_backlog + on: disk.backlog + lookup: average -10m unaligned + every: 1m + green: 1000 + red: 2000 + warn: $this > $green + crit: $this > $red + units: ms + info: average of the kernel estimated disk backlog, for the last 10 minutes diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf new file mode 100644 index 000000000..6f8b6e851 --- /dev/null +++ b/conf.d/health.d/entropy.conf @@ -0,0 +1,13 @@ + +# check if entropy is too low +# the alarm is checked every 1 minute +# and examines the last 30 minutes of data + + alarm: min_30min_entropy + on: system.entropy + lookup: min -30m unaligned + every: 1m + warn: $this < 200 + crit: $this < 100 + units: entries + info: minimum entries in the random numbers pool (entropy), for the last 30 minutes diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf new file mode 100644 index 000000000..05ff14711 --- /dev/null +++ b/conf.d/health.d/memcached.conf @@ -0,0 +1,46 @@ + +# make sure memcached is running + +template: memcached_last_collected_secs + on: memcached.cache + calc: $now - $last_collected_t + every: 10s + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) + units: seconds ago + info: number of seconds since the last successful data collection + + +# detect if memcached cache is full + +template: cache_full_pcent + on: memcached.cache + calc: $used * 100 / ($used + $available) + every: 10s + warn: $this > 80 + crit: $this > 90 + units: % + info: current cache memory usage + + +# find the rate memcached cache is filling + +template: cache_fill_rate + on: memcached.cache + lookup: max -1s at -30m unaligned of available + calc: ($this - $available) / ($now - $after) + every: 15s + units: KB/s + info: average rate the cache fills up (positive), or frees up (negative) space, for the last 30 minutes + + +# find the hours remaining until memcached cache is full + +template: cache_full_after_hours + on: 
memcached.cache + calc: $available / $cache_fill_rate / 3600 + every: 10s + warn: $this > 0 and $this < 48 + crit: $this > 0 and $this < 24 + units: hours + info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last 30 minutes diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf new file mode 100644 index 000000000..e46d1d330 --- /dev/null +++ b/conf.d/health.d/named.conf @@ -0,0 +1,12 @@ + +# make sure named is running + +template: named_last_collected_secs + on: named.global_queries + calc: $now - $last_collected_t + every: 10s + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) + units: seconds ago + info: number of seconds since the last successful data collection + diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf new file mode 100644 index 000000000..f65bc4fcb --- /dev/null +++ b/conf.d/health.d/net.conf @@ -0,0 +1,27 @@ + +# check if an interface is dropping packets +# the alarm is checked every 1 minute +# and examines the last 30 minutes of data + +template: 30min_packet_drops + on: net.drops + lookup: sum -30m unaligned absolute + every: 1m + crit: $this > 0 + units: packets + info: dropped packets in the last 30 minutes + + +# check if an interface is having FIFO +# buffer errors +# the alarm is checked every 1 minute +# and examines the last 30 minutes of data + +template: 30min_fifo_errors + on: net.fifo + lookup: sum -30m unaligned absolute + every: 1m + crit: $this > 0 + units: errors + info: network interface fifo errors in the last 30 minutes + diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf new file mode 100644 index 000000000..da13008e3 --- /dev/null +++ b/conf.d/health.d/nginx.conf @@ -0,0 +1,12 @@ + +# make sure nginx is running + +template: nginx_last_collected_secs + on: nginx.requests + calc: $now - $last_collected_t + every: 10s + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) + 
units: seconds ago + info: number of seconds since the last successful data collection + diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf new file mode 100644 index 000000000..ac3bf8ff4 --- /dev/null +++ b/conf.d/health.d/qos.conf @@ -0,0 +1,12 @@ + +# check if a QoS class is dropping packets +# the alarm is checked every 30 seconds +# and examines the last 10 minutes of data + +#template: 10min_qos_packet_drops +# on: tc.qos_dropped +# lookup: sum -10m unaligned absolute +# every: 30s +# warn: $this > 0 +# units: packets +# info: dropped packets in the last 10 minutes diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf new file mode 100644 index 000000000..1d3681128 --- /dev/null +++ b/conf.d/health.d/ram.conf @@ -0,0 +1,9 @@ + + alarm: used_ram_pcent + on: system.ram + calc: $used * 100 / ($used + $cached + $free) + every: 10s + warn: $this > 80 + crit: $this > 90 + units: % + info: system RAM usage diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf new file mode 100644 index 000000000..3750176c5 --- /dev/null +++ b/conf.d/health.d/redis.conf @@ -0,0 +1,12 @@ + +# make sure redis is running + +template: redis_last_collected_secs + on: redis.operations + calc: $now - $last_collected_t + every: 10s + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) + units: seconds ago + info: number of seconds since the last successful data collection + diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf new file mode 100644 index 000000000..cc5ce1c3a --- /dev/null +++ b/conf.d/health.d/squid.conf @@ -0,0 +1,12 @@ + +# make sure squid is running + +template: squid_last_collected_secs + on: squid.clients_requests + calc: $now - $last_collected_t + every: 10s + warn: $this > ( 5 * $update_every) + crit: $this > (10 * $update_every) + units: seconds ago + info: number of seconds since the last successful data collection + diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf new file mode 
100644 index 000000000..552dd310a --- /dev/null +++ b/conf.d/health.d/swap.conf @@ -0,0 +1,20 @@ + + alarm: 30min_ram_swapped_out + on: system.swapio + lookup: sum -30m unaligned absolute of out + # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 + calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) + every: 1m + warn: $this > 1 + crit: $this > 10 + units: % of RAM + info: the sum of all memory swapped out during the last 30 minutes, as a percentage of the available RAM + + alarm: pcent_of_ram_in_swap + on: system.swap + calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) + every: 10s + warn: $this > 10 + crit: $this > 50 + units: % of RAM + info: the currently used swap space, as a percentage of the available RAM |