summary | refs | log | tree | commit | diff | stats
path: root/health/health.d
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2019-11-28 04:53:29 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2019-11-28 04:53:29 +0000
commit: 17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99 (patch)
tree: 3e0c96613972e8bb4afdeeb97a034806363ddfa9 /health/health.d
parent: Releasing debian version 1.18.1-1. (diff)
download: netdata-17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99.tar.xz
download: netdata-17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99.zip
Merging upstream version 1.19.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.d')
-rw-r--r-- health/health.d/am2320.conf | 12
-rw-r--r-- health/health.d/dbengine.conf | 14
-rw-r--r-- health/health.d/dnsmasq_dhcp.conf | 4
-rw-r--r-- health/health.d/megacli.conf | 32
-rw-r--r-- health/health.d/mysql.conf | 12
-rw-r--r-- health/health.d/net.conf | 10
-rw-r--r-- health/health.d/pihole.conf | 90
-rw-r--r-- health/health.d/ram.conf | 48
-rw-r--r-- health/health.d/softnet.conf | 28
-rw-r--r-- health/health.d/tcp_listen.conf | 31
-rw-r--r-- health/health.d/udp_errors.conf | 14
-rw-r--r-- health/health.d/web_log.conf | 198
12 files changed, 361 insertions, 132 deletions
diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf
new file mode 100644
index 000000000..ddf8b704d
--- /dev/null
+++ b/health/health.d/am2320.conf
@@ -0,0 +1,12 @@
+# make sure am2320 is sending stats
+
+template: am2320_last_collected_secs
+ on: am2320.temperature
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster \ No newline at end of file
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index ce6427cd2..ce9839ef1 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -23,4 +23,16 @@ lookup: sum -10m unaligned of I/O errors
crit: $this > 0
delay: down 1h multiplier 1.5 max 3h
info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
- to: sysadmin \ No newline at end of file
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_flushing_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of flushing errors
+ units: errors
+ every: 3s
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of times in the last 10 minutes that the dbengine failed to completely flush data to disk, metric data will not be stored in the database, please reduce disk load or use a faster disk
+ to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index b7eb4e0a3..ecf3b84a8 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -1,6 +1,6 @@
- # dhcp-range utilization
+# dhcp-range utilization
- template: dnsmasq_dhcp_dhcp_range_utilization
+template: dnsmasq_dhcp_dhcp_range_utilization
on: dnsmasq_dhcp.dhcp_range_utilization
every: 10s
units: %
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 1881a7be1..73b87dcc0 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,48 +1,48 @@
alarm: adapter_state
on: megacli.adapter_degraded
units: is degraded
- lookup: sum -10s
- every: 10s
+ lookup: sum -10s
+ every: 10s
crit: $this > 0
info: adapter state
to: sysadmin
- template: bbu_relative_charge
+template: bbu_relative_charge
on: megacli.bbu_relative_charge
units: percent
- lookup: average -10s
- every: 10s
+ lookup: average -10s
+ every: 10s
warn: $this <= (($status >= $WARNING) ? (85) : (80))
crit: $this <= (($status == $CRITICAL) ? (50) : (40))
info: BBU relative state of charge
to: sysadmin
- template: bbu_cycle_count
+template: bbu_cycle_count
on: megacli.bbu_cycle_count
units: cycle count
- lookup: average -10s
- every: 10s
+ lookup: average -10s
+ every: 10s
warn: $this >= 100
crit: $this >= 500
info: BBU cycle count
to: sysadmin
- alarm: pd_media_errors
+ alarm: pd_media_errors
on: megacli.pd_media_error
units: media errors
- lookup: sum -10s
- every: 10s
+ lookup: sum -10s
+ every: 10s
warn: $this > 0
- delay: down 1m multiplier 2 max 10m
+ delay: down 1m multiplier 2 max 10m
info: physical drive media errors
to: sysadmin
- alarm: pd_predictive_failures
+ alarm: pd_predictive_failures
on: megacli.pd_predictive_failure
units: predictive failures
- lookup: sum -10s
- every: 10s
+ lookup: sum -10s
+ every: 10s
warn: $this > 0
- delay: down 1m multiplier 2 max 10m
+ delay: down 1m multiplier 2 max 10m
info: physical drive predictive failures
to: sysadmin
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index ce7b98a87..2bec56387 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -132,3 +132,15 @@ template: mysql_galera_cluster_state
delay: up 30s down 5m multiplier 1.5 max 1h
info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced)
to: dba
+
+
+# galera node status
+
+template: mysql_galera_cluster_status
+ on: mysql.galera_cluster_status
+ calc: $wsrep_cluster_status
+ every: 10s
+ crit: $mysql_galera_cluster_state != nan AND $this != 0
+ delay: up 30s down 5m multiplier 1.5 max 1h
+ info: node and cluster status (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected)
+ to: dba
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 255ab9982..e43cb1691 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -161,8 +161,8 @@ families: *
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
every: 10s
units: %
- warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status >= $WARNING)?(5000):(6000))
-options: no-clear-notification
- info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
- to: sysadmin
+ warn: $this > (($status >= $WARNING)?(200):(5000))
+ crit: $this > (($status >= $WARNING)?(5000):(6000))
+ options: no-clear-notification
+ info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+ to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 4a1217239..b255d35f9 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -1,5 +1,5 @@
- # Make sure Pi-hole is responding.
+# Make sure Pi-hole is responding.
template: pihole_last_collected_secs
on: pihole.dns_queries_total
@@ -12,56 +12,54 @@ template: pihole_last_collected_secs
info: number of seconds since the last successful data collection
to: webmaster
- # Blocked DNS queries.
+# Blocked DNS queries.
- template: pihole_blocked_queries
- on: pihole.dns_queries_percentage
- every: 10s
- units: %
- calc: $blocked
- warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
- crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
- delay: up 2m down 5m
- info: percentage of blocked dns queries for the last 24 hour
- to: sysadmin
-
-
- # Blocklist last update time.
- # Default update interval is a week.
+template: pihole_blocked_queries
+ on: pihole.dns_queries_percentage
+ every: 10s
+ units: %
+ calc: $blocked
+ warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+ crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+ delay: up 2m down 5m
+ info: percentage of blocked dns queries for the last 24 hour
+ to: sysadmin
- template: pihole_blocklist_last_update
- on: pihole.blocklist_last_update
- every: 10s
- units: seconds
- calc: $ago
- warn: $this > 60 * 60 * 24 * 8
- crit: $this > 60 * 60 * 24 * 8 * 2
- info: blocklist last update time
- to: sysadmin
+# Blocklist last update time.
+# Default update interval is a week.
- # Gravity file check (gravity.list).
+template: pihole_blocklist_last_update
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: seconds
+ calc: $ago
+ warn: $this > 60 * 60 * 24 * 8
+ crit: $this > 60 * 60 * 24 * 8 * 2
+ info: blocklist last update time
+ to: sysadmin
- template: pihole_blocklist_gravity_file
- on: pihole.blocklist_last_update
- every: 10s
- units: boolean
- calc: $file_exists
- crit: $this != 1
- delay: up 2m down 5m
- info: gravity file existence
- to: sysadmin
+# Gravity file check (gravity.list).
+template: pihole_blocklist_gravity_file
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: boolean
+ calc: $file_exists
+ crit: $this != 1
+ delay: up 2m down 5m
+ info: gravity file existence
+ to: sysadmin
- # Pi-hole's ability to block unwanted domains.
- # Should be enabled. The whole point of Pi-hole!
+# Pi-hole's ability to block unwanted domains.
+# Should be enabled. The whole point of Pi-hole!
- template: pihole_status
- on: pihole.unwanted_domains_blocking_status
- every: 10s
- units: boolean
- calc: $enabled
- warn: $this != 1
- delay: up 2m down 5m
- info: unwanted domains blocking status
- to: sysadmin
+template: pihole_status
+ on: pihole.unwanted_domains_blocking_status
+ every: 10s
+ units: boolean
+ calc: $enabled
+ warn: $this != 1
+ delay: up 2m down 5m
+ info: unwanted domains blocking status
+ to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 4e41bb496..15e8e8464 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -37,28 +37,28 @@
to: sysadmin
## FreeBSD
-alarm: ram_in_use
- on: system.ram
- os: freebsd
-hosts: *
- calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
-units: %
-every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
-delay: down 15m multiplier 1.5 max 1h
- info: system RAM usage
- to: sysadmin
+ alarm: ram_in_use
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: system RAM usage
+ to: sysadmin
- alarm: ram_available
- on: system.ram
- os: freebsd
- hosts: *
- calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
- units: %
- every: 10s
- warn: $this < (($status >= $WARNING) ? (15) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
- delay: down 15m multiplier 1.5 max 1h
- info: estimated amount of RAM available for userspace processes, without causing swapping
- to: sysadmin
+ alarm: ram_available
+ on: system.ram
+ os: freebsd
+ hosts: *
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+ units: %
+ every: 10s
+ warn: $this < (($status >= $WARNING) ? (15) : (10))
+ crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated amount of RAM available for userspace processes, without causing swapping
+ to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index 77c804bfd..ff3648626 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -3,38 +3,38 @@
# check for common /proc/net/softnet_stat errors
- alarm: 10min_netdev_backlog_exceeded
+ alarm: 1min_netdev_backlog_exceeded
on: system.softnet_stat
os: linux
hosts: *
- lookup: sum -10m unaligned absolute of dropped
+ lookup: average -1m unaligned absolute of dropped
units: packets
- every: 1m
- warn: $this > 0
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+ info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
to: sysadmin
- alarm: 10min_netdev_budget_ran_outs
+ alarm: 1min_netdev_budget_ran_outs
on: system.softnet_stat
os: linux
hosts: *
- lookup: sum -10m unaligned absolute of squeezed
+ lookup: average -1m unaligned absolute of squeezed
units: events
- every: 1m
- warn: $this > (($status >= $WARNING) ? (0) : (10))
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+ info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
to: silent
alarm: 10min_netisr_backlog_exceeded
on: system.softnet_stat
os: freebsd
hosts: *
- lookup: sum -10m unaligned absolute of qdrops
+ lookup: average -1m unaligned absolute of qdrops
units: packets
- every: 1m
- warn: $this > 0
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : (10))
delay: down 1h multiplier 1.5 max 2h
- info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+ info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 552930ab7..3b3072577 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -22,12 +22,13 @@
on: ip.tcp_accept_queue
os: linux
hosts: *
- lookup: sum -60s unaligned absolute of ListenOverflows
+ lookup: average -60s unaligned absolute of ListenOverflows
units: overflows
every: 10s
- crit: $this > 0
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+ info: the average number of times the TCP accept queue of the kernel overflown, during the last minute
to: sysadmin
# THIS IS TOO GENERIC
@@ -36,13 +37,13 @@
on: ip.tcp_accept_queue
os: linux
hosts: *
- lookup: sum -60s unaligned absolute of ListenDrops
+ lookup: average -60s unaligned absolute of ListenDrops
units: drops
every: 10s
-# warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (150))
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
- info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+ info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
to: sysadmin
@@ -58,12 +59,12 @@
on: ip.tcp_syn_queue
os: linux
hosts: *
- lookup: sum -60s unaligned absolute of TCPReqQFullDrop
+ lookup: average -60s unaligned absolute of TCPReqQFullDrop
units: drops
every: 10s
- warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (60))
- delay: up 0 down 5m multiplier 1.5 max 1h
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (5))
+ delay: up 10 down 5m multiplier 1.5 max 1h
info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute
to: sysadmin
@@ -71,12 +72,12 @@
on: ip.tcp_syn_queue
os: linux
hosts: *
- lookup: sum -60s unaligned absolute of TCPReqQFullDoCookies
+ lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
units: cookies
every: 10s
- warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (60))
- delay: up 0 down 5m multiplier 1.5 max 1h
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (5))
+ delay: up 10 down 5m multiplier 1.5 max 1h
info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute
to: sysadmin
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 5140228f5..1e47b5c8b 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -23,12 +23,12 @@
on: ipv4.udperrors
os: linux freebsd
hosts: *
- lookup: sum -1m unaligned absolute of RcvbufErrors
+ lookup: average -1m unaligned absolute of RcvbufErrors
units: errors
every: 10s
- warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (100))
- info: number of UDP receive buffer errors during the last minute
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (10))
+ info: average number of UDP receive buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin
@@ -39,11 +39,11 @@
on: ipv4.udperrors
os: linux
hosts: *
- lookup: sum -1m unaligned absolute of SndbufErrors
+ lookup: average -1m unaligned absolute of SndbufErrors
units: errors
every: 10s
- warn: $this > 0
- crit: $this > (($status == $CRITICAL) ? (0) : (100))
+ warn: $this > 1
+ crit: $this > (($status == $CRITICAL) ? (0) : (10))
info: number of UDP send buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 031adc2ea..1aefd7b00 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -43,7 +43,7 @@ families: *
warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute
+ info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
to: webmaster
template: 1m_redirects
@@ -69,7 +69,7 @@ families: *
warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
delay: up 2m down 15m multiplier 1.5 max 1h
- info: the ratio of HTTP bad requests (4xx) over the last minute
+ info: the ratio of HTTP bad requests (4xx except 401) over the last minute
to: webmaster
template: 1m_internal_errors
@@ -191,3 +191,197 @@ options: no-clear-notification
(clear notification for this alarm will not be sent)
to: webmaster
+
+
+# ---------------------------------------------------GO-VERSION---------------------------------------------------------
+
+# make sure we can collect web log data
+
+template: web_log_last_collected_secs
+ on: web_log.requests
+families: *
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+# unmatched lines
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $web_log_1m_total_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: web_log_1m_total_requests
+ on: web_log.requests
+families: *
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: the sum of all HTTP requests over the last minute
+
+template: web_log_1m_unmatched
+ on: web_log.excluded_requests
+families: *
+ lookup: sum -1m unaligned of unmatched
+ calc: $this * 100 / $web_log_1m_total_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
+ crit: ($web_log_1m_total_requests > 120) ? ($this > 5) : ( 0 )
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ info: the ratio of unmatched lines, over the last minute
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# high level response code alarms
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $web_log_1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: web_log_1m_requests
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned
+ calc: ($this == 0)?(1):($this)
+ units: requests
+ every: 10s
+ info: the sum of all HTTP requests over the last minute
+
+template: web_log_1m_successful
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned of success
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
+ to: webmaster
+
+template: web_log_1m_redirects
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned of redirect
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP redirects (3xx except 304) over the last minute
+ to: webmaster
+
+template: web_log_1m_bad_requests
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned of bad
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP bad requests (4xx except 401) over the last minute
+ to: webmaster
+
+template: web_log_1m_internal_errors
+ on: web_log.type_requests
+families: *
+ lookup: sum -1m unaligned of error
+ calc: $this * 100 / $web_log_1m_requests
+ units: %
+ every: 10s
+ warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+ delay: up 2m down 15m multiplier 1.5 max 1h
+ info: the ratio of HTTP internal server errors (5xx), over the last minute
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# web slow
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $web_log_1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: web_log_10m_response_time
+ on: web_log.request_processing_time
+families: *
+ lookup: average -10m unaligned of avg
+ units: ms
+ every: 30s
+ info: the average time to respond to HTTP requests, over the last 10 minutes
+
+template: web_log_web_slow
+ on: web_log.request_processing_time
+families: *
+ lookup: average -1m unaligned of avg
+ units: ms
+ every: 10s
+ green: 500
+ red: 1000
+ warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
+ crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
+ delay: down 15m multiplier 1.5 max 1h
+ info: the average time to respond to HTTP requests, over the last 1 minute
+ options: no-clear-notification
+ to: webmaster
+
+# -----------------------------------------------------------------------------
+# web too many or too few requests
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+# $web_log_5m_successful_old > 120
+#
+# i.e. when the average rate of successful requests was above 120 during the
+# 5 minutes starting at -10m and ending at -5m
+
+template: web_log_5m_successful_old
+ on: web_log.type_requests
+families: *
+ lookup: average -5m at -5m unaligned of success
+ units: requests/s
+ every: 30s
+ info: average rate of successful HTTP requests over the 5 minutes ending 5 minutes ago
+
+template: web_log_5m_successful
+ on: web_log.type_requests
+families: *
+ lookup: average -5m unaligned of success
+ units: requests/s
+ every: 30s
+ info: average successful HTTP requests over the last 5 minutes
+
+template: web_log_5m_requests_ratio
+ on: web_log.type_requests
+families: *
+ calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
+ units: %
+ every: 30s
+ warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+ crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+ delay: down 15m multiplier 1.5 max 1h
+options: no-clear-notification
+ info: the percentage of successful web requests over the last 5 minutes, \
+ compared with the previous 5 minutes \
+ (clear notification for this alarm will not be sent)
+ to: webmaster