diff options
author | Federico Ceratto <federico.ceratto@gmail.com> | 2017-04-30 16:09:37 +0000 |
---|---|---|
committer | Federico Ceratto <federico.ceratto@gmail.com> | 2017-04-30 16:09:37 +0000 |
commit | 51f689a8e17ff3929acd2dbf39e936d2cd3ac723 (patch) | |
tree | 92e54f543171b69dcbc639be09d11221cf96ba28 /conf.d/health.d | |
parent | New upstream version 1.5.0+dfsg (diff) | |
download | netdata-51f689a8e17ff3929acd2dbf39e936d2cd3ac723.tar.xz netdata-51f689a8e17ff3929acd2dbf39e936d2cd3ac723.zip |
New upstream version 1.6.0+dfsgupstream/1.6.0+dfsg
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- | conf.d/health.d/cpu.conf | 4 | ||||
-rw-r--r-- | conf.d/health.d/disks.conf | 32 | ||||
-rw-r--r-- | conf.d/health.d/fping.conf | 53 | ||||
-rw-r--r-- | conf.d/health.d/ipmi.conf | 20 | ||||
-rw-r--r-- | conf.d/health.d/memcached.conf | 2 | ||||
-rw-r--r-- | conf.d/health.d/mysql.conf | 4 | ||||
-rw-r--r-- | conf.d/health.d/net.conf | 18 | ||||
-rw-r--r-- | conf.d/health.d/tcp_resets.conf | 6 | ||||
-rw-r--r-- | conf.d/health.d/web_log.conf | 161 |
9 files changed, 246 insertions, 54 deletions
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf index 60f494d70..30a714097 100644 --- a/conf.d/health.d/cpu.conf +++ b/conf.d/health.d/cpu.conf @@ -1,13 +1,13 @@ template: 10min_cpu_usage on: system.cpu - lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice + lookup: average -10m unaligned of user,system,softirq,irq,guest units: % every: 1m warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: average cpu utilization for the last 10 minutes + info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal) to: sysadmin template: 10min_cpu_iowait diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf index 0549bac26..9548f9ee0 100644 --- a/conf.d/health.d/disks.conf +++ b/conf.d/health.d/disks.conf @@ -1,34 +1,4 @@ # ----------------------------------------------------------------------------- -# make sure we collect values for each disk - -# for mount points -template: disk_space_last_collected_secs - on: disk.space -families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection of the mount point - to: sysadmin - -# for block devices -template: disk_last_collected_secs - on: disk.io -families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection of the block device - to: sysadmin - - -# ----------------------------------------------------------------------------- # low disk space # checking the latest collected values @@ -88,7 +58,7 @@ families: * template: out_of_disk_space_time on: disk.space families: * - calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (0) + calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) units: hours every: 10s warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) diff --git a/conf.d/health.d/fping.conf b/conf.d/health.d/fping.conf new file mode 100644 index 000000000..69251b182 --- /dev/null +++ b/conf.d/health.d/fping.conf @@ -0,0 +1,53 @@ + +template: fping_last_collected_secs +families: * + on: fping.latency + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +template: host_reachable +families: * + on: fping.latency + calc: $average != nan + units: up/down + every: 10s + crit: $this == 0 + info: states if the remote host is reachable + delay: down 30m multiplier 1.5 max 2h + to: sysadmin + +template: host_latency +families: * + on: fping.latency + lookup: average -10s unaligned of average + units: ms + every: 10s + green: 300 + red: 1000 + warn: $this > $green OR $max > $red + crit: $this > $red + info: average round trip delay during the last 10 seconds + delay: down 30m multiplier 1.5 max 2h + to: sysadmin + +template: packet_loss +families: * + on: fping.quality + lookup: average -10m unaligned of returned + calc: 100 - $this + green: 1 + red: 10 + units: % + every: 10s + warn: $this > $green + crit: $this > $red + info: packet loss percentage + delay: down 30m multiplier 1.5 max 2h + to: sysadmin + diff --git a/conf.d/health.d/ipmi.conf b/conf.d/health.d/ipmi.conf new file mode 100644 index 000000000..c25581964 --- /dev/null +++ b/conf.d/health.d/ipmi.conf @@ -0,0 +1,20 @@ + alarm: ipmi_sensors_states + on: ipmi.sensors_states + calc: $warning + $critical + units: sensors + every: 10s + warn: $this > 0 + crit: $critical > 0 + delay: up 5m down 15m multiplier 1.5 max 1h + info: the number IPMI sensors in non-nominal state + to: sysadmin + + alarm: ipmi_events + on: ipmi.events + calc: $events + units: events + every: 10s + warn: $this > 0 + delay: up 5m down 15m multiplier 1.5 max 1h + info: the number of events in the IPMI System Event Log (SEL) + to: sysadmin diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf index 7917e36af..d248ef57a 100644 --- a/conf.d/health.d/memcached.conf +++ b/conf.d/health.d/memcached.conf @@ -42,7 +42,7 @@ template: cache_fill_rate template: out_of_cache_space_time on: memcached.cache - calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (0) + calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf) units: hours every: 10s warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf index 78773e5b5..1eeb993f0 100644 --- a/conf.d/health.d/mysql.conf +++ b/conf.d/health.d/mysql.conf @@ -49,7 +49,7 @@ template: mysql_10s_table_locks_waited template: mysql_10s_waited_locks_ratio on: mysql.table_locks - calc: ($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) + calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0 units: % every: 10s warn: $this > (($status >= $WARNING) ? (10) : (25)) @@ -65,7 +65,7 @@ template: mysql_10s_waited_locks_ratio template: mysql_replication on: mysql.slave_status calc: ($sql_running == -1 OR $io_running == -1)?0:1 - units: status + units: ok/failed every: 10s crit: $this == 0 delay: down 5m multiplier 1.5 max 1h diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf index 924acccc3..0232395ac 100644 --- a/conf.d/health.d/net.conf +++ b/conf.d/health.d/net.conf @@ -1,18 +1,3 @@ -# ----------------------------------------------------------------------------- -# make sure we collect values for each interface - -template: interface_last_collected_secs - on: net.net -families: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # ----------------------------------------------------------------------------- # dropped packets @@ -116,6 +101,7 @@ families: * units: % warn: $this > (($status >= $WARNING)?(200):(1000)) crit: $this > (($status >= $WARNING)?(1000):(2000)) +options: no-clear-notification info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute - to: silent + to: sysadmin diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf index daf24a1cd..49fb1b924 100644 --- a/conf.d/health.d/tcp_resets.conf +++ b/conf.d/health.d/tcp_resets.conf @@ -28,8 +28,9 @@ every: 10s warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (4))) delay: up 0 down 60m multiplier 1.2 max 2h +options: no-clear-notification info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed) - to: silent + to: sysadmin # ----------------------------------------------------------------------------- # tcp resets this host receives @@ -48,5 +49,6 @@ every: 10s warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (4))) delay: up 0 down 60m multiplier 1.2 max 2h +options: no-clear-notification info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed) - to: silent + to: sysadmin diff --git a/conf.d/health.d/web_log.conf b/conf.d/health.d/web_log.conf new file mode 100644 index 000000000..c668959f5 --- /dev/null +++ b/conf.d/health.d/web_log.conf @@ -0,0 +1,161 @@ + +# make sure we can collect web log data + +template: last_collected_secs + on: web_log.response_codes +families: * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + +# ----------------------------------------------------------------------------- +# high level response code alarms + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $1m_requests > 120 +# +# i.e. when there are at least 120 requests during the last minute + +template: 1m_requests + on: web_log.response_statuses +families: * + lookup: sum -1m unaligned + calc: ($this == 0)?(1):($this) + units: requests + every: 10s + info: the sum of all HTTP requests over the last minute + +template: 1m_successful + on: web_log.response_statuses +families: * + lookup: sum -1m unaligned of successful_requests + calc: $this * 100 / $1m_requests + units: % + every: 10s + warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) + crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute + to: webmaster + +template: 1m_redirects + on: web_log.response_statuses +families: * + lookup: sum -1m unaligned of redirects + calc: $this * 100 / $1m_requests + units: % + every: 10s + warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) + crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + info: the ratio of HTTP redirects (3xx except 304) over the last minute + to: webmaster + +template: 1m_bad_requests + on: web_log.response_statuses +families: * + lookup: sum -1m unaligned of bad_requests + calc: $this * 100 / $1m_requests + units: % + every: 10s + warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) + crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + info: the ratio of HTTP bad requests (4xx) over the last minute + to: webmaster + +template: 1m_internal_errors + on: web_log.response_statuses +families: * + lookup: sum -1m unaligned of server_errors + calc: $this * 100 / $1m_requests + units: % + every: 10s + warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) + crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + info: the ratio of HTTP internal server errors (5xx), over the last minute + to: webmaster + + +# ----------------------------------------------------------------------------- +# web slow + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $1m_requests > 120 +# +# i.e. when there are at least 120 requests during the last minute + +template: 10m_response_time + on: web_log.response_time +families: * + lookup: average -10m unaligned of avg + units: ms + every: 30s + info: the average time to respond to HTTP requests, over the last 10 minutes + +template: web_slow + on: web_log.response_time +families: * + lookup: average -1m unaligned of avg + units: ms + every: 10s + green: 500 + red: 1000 + warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 ) + crit: ($1m_requests > 120) ? ($this > $red && $this > ($10m_response_time * 4) ) : ( 0 ) + delay: down 15m multiplier 1.5 max 1h + info: the average time to respond to HTTP requests, over the last 1 minute + to: webmaster + +# ----------------------------------------------------------------------------- +# web too many or too few requests + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $5m_successful_old > 120 +# +# i.e. when there were at least 120 requests during the 5 minutes starting +# at -10m and ending at -5m + +template: 5m_successful_old + on: web_log.response_statuses +families: * + lookup: average -5m at -5m unaligned of successful_requests + units: requests/s + every: 30s + info: average rate of successful HTTP requests over the last 5 minutes + +template: 5m_successful + on: web_log.response_statuses +families: * + lookup: average -5m unaligned of successful_requests + units: requests/s + every: 30s + info: average successful HTTP requests over the last 5 minutes + +template: 5m_requests_ratio + on: web_log.response_codes +families: * + calc: ($5m_successful_old > 0)?($5m_successful * 100 / $5m_successful_old):(100) + units: % + every: 30s + warn: ($5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0) + crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0) + delay: down 15m multiplier 1.5 max 1h +options: no-clear-notification + info: the percentage of successful web requests over the last 5 minutes, \ + compared with the previous 5 minutes + to: webmaster + |