diff options
author | Lennart Weller <lhw@ring0.de> | 2017-07-27 09:55:47 +0000 |
---|---|---|
committer | Lennart Weller <lhw@ring0.de> | 2017-07-27 09:55:47 +0000 |
commit | a133c9c3b637b1dbe7b5b053f7e2572c1950cead (patch) | |
tree | 2207939a88e96bca329457f40a9d9d18ab659dc1 /conf.d/health.d | |
parent | New upstream version 1.6.0+dfsg (diff) | |
download | netdata-a133c9c3b637b1dbe7b5b053f7e2572c1950cead.tar.xz netdata-a133c9c3b637b1dbe7b5b053f7e2572c1950cead.zip |
New upstream version 1.7.0+dfsg [upstream/1.7.0+dfsg]
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- | conf.d/health.d/fping.conf | 2 | ||||
-rw-r--r-- | conf.d/health.d/lighttpd.conf | 14 | ||||
-rw-r--r-- | conf.d/health.d/mongodb.conf | 13 | ||||
-rw-r--r-- | conf.d/health.d/net.conf | 6 | ||||
-rw-r--r-- | conf.d/health.d/ram.conf | 9 | ||||
-rw-r--r-- | conf.d/health.d/tcp_resets.conf | 8 | ||||
-rw-r--r-- | conf.d/health.d/web_log.conf | 3 | ||||
-rw-r--r-- | conf.d/health.d/zfs.conf | 10 |
8 files changed, 55 insertions, 10 deletions
diff --git a/conf.d/health.d/fping.conf b/conf.d/health.d/fping.conf index 69251b18..43658fef 100644 --- a/conf.d/health.d/fping.conf +++ b/conf.d/health.d/fping.conf @@ -28,7 +28,7 @@ families: * lookup: average -10s unaligned of average units: ms every: 10s - green: 300 + green: 500 red: 1000 warn: $this > $green OR $max > $red crit: $this > $red diff --git a/conf.d/health.d/lighttpd.conf b/conf.d/health.d/lighttpd.conf new file mode 100644 index 00000000..915907a4 --- /dev/null +++ b/conf.d/health.d/lighttpd.conf @@ -0,0 +1,14 @@ + +# make sure lighttpd is running + +template: lighttpd_last_collected_secs + on: lighttpd.requests + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + diff --git a/conf.d/health.d/mongodb.conf b/conf.d/health.d/mongodb.conf new file mode 100644 index 00000000..a80cb311 --- /dev/null +++ b/conf.d/health.d/mongodb.conf @@ -0,0 +1,13 @@ + +# make sure mongodb is running + +template: mongodb_last_collected_secs + on: mongodb.read_operations + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf index 0232395a..bd288817 100644 --- a/conf.d/health.d/net.conf +++ b/conf.d/health.d/net.conf @@ -99,9 +99,9 @@ families: * calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) every: 10s units: % - warn: $this > (($status >= $WARNING)?(200):(1000)) - crit: $this > (($status >= $WARNING)?(1000):(2000)) + warn: $this > (($status >= $WARNING)?(200):(5000)) + crit: $this > (($status >= $WARNING)?(5000):(6000)) options: no-clear-notification - info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute + info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent) to: sysadmin diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf index d60df75b..b99e5e22 100644 --- a/conf.d/health.d/ram.conf +++ b/conf.d/health.d/ram.conf @@ -1,7 +1,14 @@ + alarm: used_ram_to_ignore + on: system.ram + calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz) + every: 10s + info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) + alarm: ram_in_use on: system.ram - calc: $used * 100 / ($used + $cached + $free) +# calc: $used * 100 / ($used + $cached + $free) + calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free) units: % every: 10s warn: $this > (($status >= $WARNING) ? 
(80) : (90)) diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf index 49fb1b92..803c88a8 100644 --- a/conf.d/health.d/tcp_resets.conf +++ b/conf.d/health.d/tcp_resets.conf @@ -26,10 +26,10 @@ lookup: average -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s - warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (4))) + warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20))) delay: up 0 down 60m multiplier 1.2 max 2h options: no-clear-notification - info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed) + info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent) to: sysadmin # ----------------------------------------------------------------------------- @@ -47,8 +47,8 @@ options: no-clear-notification lookup: average -10s unaligned absolute of AttemptFails units: tcp resets/s every: 10s - warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (4))) + warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? 
(1) : (10))) delay: up 0 down 60m multiplier 1.2 max 2h options: no-clear-notification - info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed) + info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent) to: sysadmin diff --git a/conf.d/health.d/web_log.conf b/conf.d/health.d/web_log.conf index c668959f..d1808817 100644 --- a/conf.d/health.d/web_log.conf +++ b/conf.d/health.d/web_log.conf @@ -156,6 +156,7 @@ families: * delay: down 15m multiplier 1.5 max 1h options: no-clear-notification info: the percentage of successful web requests over the last 5 minutes, \ - compared with the previous 5 minutes + compared with the previous 5 minutes \ + (clear notification for this alarm will not be sent) to: webmaster diff --git a/conf.d/health.d/zfs.conf b/conf.d/health.d/zfs.conf new file mode 100644 index 00000000..af73824e --- /dev/null +++ b/conf.d/health.d/zfs.conf @@ -0,0 +1,10 @@ + + alarm: zfs_memory_throttle + on: zfs.memory_ops + lookup: sum -10m unaligned absolute of throttled + units: events + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 2h + info: the number of times ZFS had to limit the ARC growth in the last 10 minutes + to: sysadmin |