diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2019-11-28 04:53:29 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2019-11-28 04:53:29 +0000 |
commit | 17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99 (patch) | |
tree | 3e0c96613972e8bb4afdeeb97a034806363ddfa9 /health/health.d | |
parent | Releasing debian version 1.18.1-1. (diff) | |
download | netdata-17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99.tar.xz netdata-17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99.zip |
Merging upstream version 1.19.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.d')
-rw-r--r-- | health/health.d/am2320.conf | 12 | ||||
-rw-r--r-- | health/health.d/dbengine.conf | 14 | ||||
-rw-r--r-- | health/health.d/dnsmasq_dhcp.conf | 4 | ||||
-rw-r--r-- | health/health.d/megacli.conf | 32 | ||||
-rw-r--r-- | health/health.d/mysql.conf | 12 | ||||
-rw-r--r-- | health/health.d/net.conf | 10 | ||||
-rw-r--r-- | health/health.d/pihole.conf | 90 | ||||
-rw-r--r-- | health/health.d/ram.conf | 48 | ||||
-rw-r--r-- | health/health.d/softnet.conf | 28 | ||||
-rw-r--r-- | health/health.d/tcp_listen.conf | 31 | ||||
-rw-r--r-- | health/health.d/udp_errors.conf | 14 | ||||
-rw-r--r-- | health/health.d/web_log.conf | 198 |
12 files changed, 361 insertions, 132 deletions
diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf new file mode 100644 index 000000000..ddf8b704d --- /dev/null +++ b/health/health.d/am2320.conf @@ -0,0 +1,12 @@ +# make sure am2320 is sending stats + +template: am2320_last_collected_secs + on: am2320.temperature + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster
\ No newline at end of file diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index ce6427cd2..ce9839ef1 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -23,4 +23,16 @@ lookup: sum -10m unaligned of I/O errors crit: $this > 0 delay: down 1h multiplier 1.5 max 3h info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) - to: sysadmin
\ No newline at end of file + to: sysadmin + + alarm: 10min_dbengine_global_flushing_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * +lookup: sum -10m unaligned of flushing errors + units: errors + every: 3s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of times in the last 10 minutes that the dbengine failed to completely flush data to disk, metric data will not be stored in the database, please reduce disk load or use a faster disk + to: sysadmin diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf index b7eb4e0a3..ecf3b84a8 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/health/health.d/dnsmasq_dhcp.conf @@ -1,6 +1,6 @@ - # dhcp-range utilization +# dhcp-range utilization - template: dnsmasq_dhcp_dhcp_range_utilization +template: dnsmasq_dhcp_dhcp_range_utilization on: dnsmasq_dhcp.dhcp_range_utilization every: 10s units: % diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf index 1881a7be1..73b87dcc0 100644 --- a/health/health.d/megacli.conf +++ b/health/health.d/megacli.conf @@ -1,48 +1,48 @@ alarm: adapter_state on: megacli.adapter_degraded units: is degraded - lookup: sum -10s - every: 10s + lookup: sum -10s + every: 10s crit: $this > 0 info: adapter state to: sysadmin - template: bbu_relative_charge +template: bbu_relative_charge on: megacli.bbu_relative_charge units: percent - lookup: average -10s - every: 10s + lookup: average -10s + every: 10s warn: $this <= (($status >= $WARNING) ? (85) : (80)) crit: $this <= (($status == $CRITICAL) ? 
(50) : (40)) info: BBU relative state of charge to: sysadmin - template: bbu_cycle_count +template: bbu_cycle_count on: megacli.bbu_cycle_count units: cycle count - lookup: average -10s - every: 10s + lookup: average -10s + every: 10s warn: $this >= 100 crit: $this >= 500 info: BBU cycle count to: sysadmin - alarm: pd_media_errors + alarm: pd_media_errors on: megacli.pd_media_error units: media errors - lookup: sum -10s - every: 10s + lookup: sum -10s + every: 10s warn: $this > 0 - delay: down 1m multiplier 2 max 10m + delay: down 1m multiplier 2 max 10m info: physical drive media errors to: sysadmin - alarm: pd_predictive_failures + alarm: pd_predictive_failures on: megacli.pd_predictive_failure units: predictive failures - lookup: sum -10s - every: 10s + lookup: sum -10s + every: 10s warn: $this > 0 - delay: down 1m multiplier 2 max 10m + delay: down 1m multiplier 2 max 10m info: physical drive predictive failures to: sysadmin diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index ce7b98a87..2bec56387 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -132,3 +132,15 @@ template: mysql_galera_cluster_state delay: up 30s down 5m multiplier 1.5 max 1h info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced) to: dba + + +# galera node status + +template: mysql_galera_cluster_status + on: mysql.galera_cluster_status + calc: $wsrep_cluster_status + every: 10s + crit: $mysql_galera_cluster_state != nan AND $this != 0 + delay: up 30s down 5m multiplier 1.5 max 1h + info: node and cluster status (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected) + to: dba diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 255ab9982..e43cb1691 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -161,8 +161,8 @@ families: * calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) every: 10s units: % - warn: 
$this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status >= $WARNING)?(5000):(6000)) -options: no-clear-notification - info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent) - to: sysadmin + warn: $this > (($status >= $WARNING)?(200):(5000)) + crit: $this > (($status >= $WARNING)?(5000):(6000)) + options: no-clear-notification + info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent) + to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index 4a1217239..b255d35f9 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -1,5 +1,5 @@ - # Make sure Pi-hole is responding. +# Make sure Pi-hole is responding. template: pihole_last_collected_secs on: pihole.dns_queries_total @@ -12,56 +12,54 @@ template: pihole_last_collected_secs info: number of seconds since the last successful data collection to: webmaster - # Blocked DNS queries. +# Blocked DNS queries. - template: pihole_blocked_queries - on: pihole.dns_queries_percentage - every: 10s - units: % - calc: $blocked - warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) ) - delay: up 2m down 5m - info: percentage of blocked dns queries for the last 24 hour - to: sysadmin - - - # Blocklist last update time. - # Default update interval is a week. +template: pihole_blocked_queries + on: pihole.dns_queries_percentage + every: 10s + units: % + calc: $blocked + warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) + crit: $this > ( ($status >= $CRITICAL) ? 
( 55 ) : ( 75 ) ) + delay: up 2m down 5m + info: percentage of blocked dns queries for the last 24 hour + to: sysadmin - template: pihole_blocklist_last_update - on: pihole.blocklist_last_update - every: 10s - units: seconds - calc: $ago - warn: $this > 60 * 60 * 24 * 8 - crit: $this > 60 * 60 * 24 * 8 * 2 - info: blocklist last update time - to: sysadmin +# Blocklist last update time. +# Default update interval is a week. - # Gravity file check (gravity.list). +template: pihole_blocklist_last_update + on: pihole.blocklist_last_update + every: 10s + units: seconds + calc: $ago + warn: $this > 60 * 60 * 24 * 8 + crit: $this > 60 * 60 * 24 * 8 * 2 + info: blocklist last update time + to: sysadmin - template: pihole_blocklist_gravity_file - on: pihole.blocklist_last_update - every: 10s - units: boolean - calc: $file_exists - crit: $this != 1 - delay: up 2m down 5m - info: gravity file existence - to: sysadmin +# Gravity file check (gravity.list). +template: pihole_blocklist_gravity_file + on: pihole.blocklist_last_update + every: 10s + units: boolean + calc: $file_exists + crit: $this != 1 + delay: up 2m down 5m + info: gravity file existence + to: sysadmin - # Pi-hole's ability to block unwanted domains. - # Should be enabled. The whole point of Pi-hole! +# Pi-hole's ability to block unwanted domains. +# Should be enabled. The whole point of Pi-hole! 
- template: pihole_status - on: pihole.unwanted_domains_blocking_status - every: 10s - units: boolean - calc: $enabled - warn: $this != 1 - delay: up 2m down 5m - info: unwanted domains blocking status - to: sysadmin +template: pihole_status + on: pihole.unwanted_domains_blocking_status + every: 10s + units: boolean + calc: $enabled + warn: $this != 1 + delay: up 2m down 5m + info: unwanted domains blocking status + to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 4e41bb496..15e8e8464 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -37,28 +37,28 @@ to: sysadmin ## FreeBSD -alarm: ram_in_use - on: system.ram - os: freebsd -hosts: * - calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) -units: % -every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) -delay: down 15m multiplier 1.5 max 1h - info: system RAM usage - to: sysadmin + alarm: ram_in_use + on: system.ram + os: freebsd + hosts: * + calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: system RAM usage + to: sysadmin - alarm: ram_available - on: system.ram - os: freebsd - hosts: * - calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) - units: % - every: 10s - warn: $this < (($status >= $WARNING) ? (15) : (10)) - crit: $this < (($status == $CRITICAL) ? 
(10) : ( 5)) - delay: down 15m multiplier 1.5 max 1h - info: estimated amount of RAM available for userspace processes, without causing swapping - to: sysadmin + alarm: ram_available + on: system.ram + os: freebsd + hosts: * + calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? (15) : (10)) + crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) + delay: down 15m multiplier 1.5 max 1h + info: estimated amount of RAM available for userspace processes, without causing swapping + to: sysadmin diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf index 77c804bfd..ff3648626 100644 --- a/health/health.d/softnet.conf +++ b/health/health.d/softnet.conf @@ -3,38 +3,38 @@ # check for common /proc/net/softnet_stat errors - alarm: 10min_netdev_backlog_exceeded + alarm: 1min_netdev_backlog_exceeded on: system.softnet_stat os: linux hosts: * - lookup: sum -10m unaligned absolute of dropped + lookup: average -1m unaligned absolute of dropped units: packets - every: 1m - warn: $this > 0 + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h - info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) + info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) to: sysadmin - alarm: 10min_netdev_budget_ran_outs + alarm: 1min_netdev_budget_ran_outs on: system.softnet_stat os: linux hosts: * - lookup: sum -10m unaligned absolute of squeezed + lookup: average -1m unaligned absolute of squeezed units: events - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (10)) + every: 10s + warn: $this > (($status >= $WARNING) ? 
(0) : (10)) delay: down 1h multiplier 1.5 max 2h - info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets) + info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets) to: silent alarm: 10min_netisr_backlog_exceeded on: system.softnet_stat os: freebsd hosts: * - lookup: sum -10m unaligned absolute of qdrops + lookup: average -1m unaligned absolute of qdrops units: packets - every: 1m - warn: $this > 0 + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h - info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets) + info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets) to: sysadmin diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf index 552930ab7..3b3072577 100644 --- a/health/health.d/tcp_listen.conf +++ b/health/health.d/tcp_listen.conf @@ -22,12 +22,13 @@ on: ip.tcp_accept_queue os: linux hosts: * - lookup: sum -60s unaligned absolute of ListenOverflows + lookup: average -60s unaligned absolute of ListenOverflows units: overflows every: 10s - crit: $this > 0 + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? 
(1) : (5)) delay: up 0 down 5m multiplier 1.5 max 1h - info: the number of times the TCP accept queue of the kernel overflown, during the last minute + info: the average number of times the TCP accept queue of the kernel overflown, during the last minute to: sysadmin # THIS IS TOO GENERIC @@ -36,13 +37,13 @@ on: ip.tcp_accept_queue os: linux hosts: * - lookup: sum -60s unaligned absolute of ListenDrops + lookup: average -60s unaligned absolute of ListenDrops units: drops every: 10s -# warn: $this > 0 - crit: $this > (($status == $CRITICAL) ? (0) : (150)) + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (1) : (5)) delay: up 0 down 5m multiplier 1.5 max 1h - info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received) + info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received) to: sysadmin @@ -58,12 +59,12 @@ on: ip.tcp_syn_queue os: linux hosts: * - lookup: sum -60s unaligned absolute of TCPReqQFullDrop + lookup: average -60s unaligned absolute of TCPReqQFullDrop units: drops every: 10s - warn: $this > 0 - crit: $this > (($status == $CRITICAL) ? (0) : (60)) - delay: up 0 down 5m multiplier 1.5 max 1h + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute to: sysadmin @@ -71,12 +72,12 @@ on: ip.tcp_syn_queue os: linux hosts: * - lookup: sum -60s unaligned absolute of TCPReqQFullDoCookies + lookup: average -60s unaligned absolute of TCPReqQFullDoCookies units: cookies every: 10s - warn: $this > 0 - crit: $this > (($status == $CRITICAL) ? (0) : (60)) - delay: up 0 down 5m multiplier 1.5 max 1h + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? 
(0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute to: sysadmin diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf index 5140228f5..1e47b5c8b 100644 --- a/health/health.d/udp_errors.conf +++ b/health/health.d/udp_errors.conf @@ -23,12 +23,12 @@ on: ipv4.udperrors os: linux freebsd hosts: * - lookup: sum -1m unaligned absolute of RcvbufErrors + lookup: average -1m unaligned absolute of RcvbufErrors units: errors every: 10s - warn: $this > 0 - crit: $this > (($status == $CRITICAL) ? (0) : (100)) - info: number of UDP receive buffer errors during the last minute + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (10)) + info: average number of UDP receive buffer errors during the last minute delay: up 0 down 60m multiplier 1.2 max 2h to: sysadmin @@ -39,11 +39,11 @@ on: ipv4.udperrors os: linux hosts: * - lookup: sum -1m unaligned absolute of SndbufErrors + lookup: average -1m unaligned absolute of SndbufErrors units: errors every: 10s - warn: $this > 0 - crit: $this > (($status == $CRITICAL) ? (0) : (100)) + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (10)) info: number of UDP send buffer errors during the last minute delay: up 0 down 60m multiplier 1.2 max 2h to: sysadmin diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index 031adc2ea..1aefd7b00 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -43,7 +43,7 @@ families: * warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? 
( 85 ) : ( 75 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute + info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute to: webmaster template: 1m_redirects @@ -69,7 +69,7 @@ families: * warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP bad requests (4xx) over the last minute + info: the ratio of HTTP bad requests (4xx except 401) over the last minute to: webmaster template: 1m_internal_errors @@ -191,3 +191,197 @@ options: no-clear-notification (clear notification for this alarm will not be sent) to: webmaster + + +# ---------------------------------------------------GO-VERSION--------------------------------------------------------- + +# make sure we can collect web log data + +template: web_log_last_collected_secs + on: web_log.requests +families: * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + +# unmatched lines + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $1m_total_requests > 120 +# +# i.e. 
when there are at least 120 requests during the last minute + +template: web_log_1m_total_requests + on: web_log.requests +families: * + lookup: sum -1m unaligned + calc: ($this == 0)?(1):($this) + units: requests + every: 10s + info: the sum of all HTTP requests over the last minute + +template: web_log_1m_unmatched + on: web_log.excluded_requests +families: * + lookup: sum -1m unaligned of unmatched + calc: $this * 100 / $web_log_1m_total_requests + units: % + every: 10s + warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 ) + crit: ($web_log_1m_total_requests > 120) ? ($this > 5) : ( 0 ) + delay: up 1m down 5m multiplier 1.5 max 1h + info: the ratio of unmatched lines, over the last minute + to: webmaster + +# ----------------------------------------------------------------------------- +# high level response code alarms + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $1m_requests > 120 +# +# i.e. when there are at least 120 requests during the last minute + +template: web_log_1m_requests + on: web_log.type_requests +families: * + lookup: sum -1m unaligned + calc: ($this == 0)?(1):($this) + units: requests + every: 10s + info: the sum of all HTTP requests over the last minute + +template: web_log_1m_successful + on: web_log.type_requests +families: * + lookup: sum -1m unaligned of success + calc: $this * 100 / $web_log_1m_requests + units: % + every: 10s + warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) + crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? 
( 85 ) : ( 75 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute + to: webmaster + +template: web_log_1m_redirects + on: web_log.type_requests +families: * + lookup: sum -1m unaligned of redirect + calc: $this * 100 / $web_log_1m_requests + units: % + every: 10s + warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) + crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + info: the ratio of HTTP redirects (3xx except 304) over the last minute + to: webmaster + +template: web_log_1m_bad_requests + on: web_log.type_requests +families: * + lookup: sum -1m unaligned of bad + calc: $this * 100 / $web_log_1m_requests + units: % + every: 10s + warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) + crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + info: the ratio of HTTP bad requests (4xx except 401) over the last minute + to: webmaster + +template: web_log_1m_internal_errors + on: web_log.type_requests +families: * + lookup: sum -1m unaligned of error + calc: $this * 100 / $web_log_1m_requests + units: % + every: 10s + warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) + crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + info: the ratio of HTTP internal server errors (5xx), over the last minute + to: webmaster + +# ----------------------------------------------------------------------------- +# web slow + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $1m_requests > 120 +# +# i.e. 
when there are at least 120 requests during the last minute + +template: web_log_10m_response_time + on: web_log.request_processing_time +families: * + lookup: average -10m unaligned of avg + units: ms + every: 30s + info: the average time to respond to HTTP requests, over the last 10 minutes + +template: web_log_web_slow + on: web_log.request_processing_time +families: * + lookup: average -1m unaligned of avg + units: ms + every: 10s + green: 500 + red: 1000 + warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 ) + crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 ) + delay: down 15m multiplier 1.5 max 1h + info: the average time to respond to HTTP requests, over the last 1 minute + options: no-clear-notification + to: webmaster + +# ----------------------------------------------------------------------------- +# web too many or too few requests + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $5m_successful_old > 120 +# +# i.e. when there were at least 120 requests during the 5 minutes starting +# at -10m and ending at -5m + +template: web_log_5m_successful_old + on: web_log.type_requests +families: * + lookup: average -5m at -5m unaligned of success + units: requests/s + every: 30s + info: average rate of successful HTTP requests over the last 5 minutes + +template: web_log_5m_successful + on: web_log.type_requests +families: * + lookup: average -5m unaligned of success + units: requests/s + every: 30s + info: average successful HTTP requests over the last 5 minutes + +template: web_log_5m_requests_ratio + on: web_log.type_requests +families: * + calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100) + units: % + every: 30s + warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0) + crit: ($web_log_5m_successful_old > 120) ? 
($this > 400 OR $this < 25) : (0) + delay: down 15m multiplier 1.5 max 1h +options: no-clear-notification + info: the percentage of successful web requests over the last 5 minutes, \ + compared with the previous 5 minutes \ + (clear notification for this alarm will not be sent) + to: webmaster |