Merging upstream version 1.19.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2019-11-28 04:53:29 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2019-11-28 04:53:29 +0000
commit: 17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99 (patch)
tree: 3e0c96613972e8bb4afdeeb97a034806363ddfa9 /health/health.d
parent: Releasing debian version 1.18.1-1. (diff)
download: netdata-17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99.tar.xz
netdata-17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99.zip
12 files changed, 361 insertions, 132 deletions
diff --git a/health/health.d/am2320.conf b/health/health.d/am2320.conf
new file mode 100644
index 000000000..ddf8b704d
--- /dev/null
+++ b/health/health.d/am2320.conf
@@ -0,0 +1,12 @@
+# make sure am2320 is sending stats
+
+template: am2320_last_collected_secs
+      on: am2320.temperature
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+\ No newline at end of file
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index ce6427cd2..ce9839ef1 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -23,4 +23,16 @@ lookup: sum -10m unaligned of I/O errors
   crit: $this > 0
  delay: down 1h multiplier 1.5 max 3h
   info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
-    to: sysadmin
-\ No newline at end of file
+    to: sysadmin
+
+ alarm: 10min_dbengine_global_flushing_errors
+    on: netdata.dbengine_global_errors
+    os: linux freebsd macos
+ hosts: *
+lookup: sum -10m unaligned of flushing errors
+ units: errors
+ every: 3s
+  crit: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+  info: number of times in the last 10 minutes that the dbengine failed to completely flush data to disk, metric data will not be stored in the database, please reduce disk load or use a faster disk
+    to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index b7eb4e0a3..ecf3b84a8 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -1,6 +1,6 @@
- # dhcp-range utilization
+# dhcp-range utilization
 
- template: dnsmasq_dhcp_dhcp_range_utilization
+template: dnsmasq_dhcp_dhcp_range_utilization
       on: dnsmasq_dhcp.dhcp_range_utilization
    every: 10s
    units: %
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 1881a7be1..73b87dcc0 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,48 +1,48 @@
    alarm: adapter_state
       on: megacli.adapter_degraded
    units: is degraded
-    lookup: sum -10s
-    every: 10s
+  lookup: sum -10s
+   every: 10s
     crit: $this > 0
     info: adapter state
       to: sysadmin
 
-   template: bbu_relative_charge
+template: bbu_relative_charge
       on: megacli.bbu_relative_charge
    units: percent
-    lookup: average -10s
-    every: 10s
+  lookup: average -10s
+   every: 10s
     warn: $this <= (($status >= $WARNING)  ? (85) : (80))
     crit: $this <= (($status == $CRITICAL)  ? (50) : (40))
     info: BBU relative state of charge
       to: sysadmin
 
-   template: bbu_cycle_count
+template: bbu_cycle_count
       on: megacli.bbu_cycle_count
    units: cycle count
-    lookup: average -10s
-    every: 10s
+  lookup: average -10s
+   every: 10s
     warn: $this >= 100
     crit: $this >= 500
     info: BBU cycle count
       to: sysadmin
 
-    alarm: pd_media_errors
+   alarm: pd_media_errors
       on: megacli.pd_media_error
    units: media errors
-    lookup: sum -10s
-    every: 10s
+  lookup: sum -10s
+   every: 10s
     warn: $this > 0
-    delay: down 1m multiplier 2 max 10m
+   delay: down 1m multiplier 2 max 10m
     info: physical drive media errors
       to: sysadmin
 
-    alarm: pd_predictive_failures
+   alarm: pd_predictive_failures
       on: megacli.pd_predictive_failure
    units: predictive failures
-    lookup: sum -10s
-    every: 10s
+  lookup: sum -10s
+   every: 10s
     warn: $this > 0
-    delay: down 1m multiplier 2 max 10m
+   delay: down 1m multiplier 2 max 10m
     info: physical drive predictive failures
       to: sysadmin
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index ce7b98a87..2bec56387 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -132,3 +132,15 @@ template: mysql_galera_cluster_state
    delay: up 30s down 5m multiplier 1.5 max 1h
     info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced)
       to: dba
+
+
+# galera node status
+
+template: mysql_galera_cluster_status
+      on: mysql.galera_cluster_status
+    calc: $wsrep_cluster_status
+   every: 10s
+    crit: $mysql_galera_cluster_state != nan AND $this != 0
+   delay: up 30s down 5m multiplier 1.5 max 1h
+    info: node and cluster status (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected)
+      to: dba
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 255ab9982..e43cb1691 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -161,8 +161,8 @@ families: *
     calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
    every: 10s
    units: %
-   warn: $this > (($status >= $WARNING)?(200):(5000))
-   crit: $this > (($status >= $WARNING)?(5000):(6000))
-options: no-clear-notification
-   info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
-     to: sysadmin
+    warn: $this > (($status >= $WARNING)?(200):(5000))
+    crit: $this > (($status >= $WARNING)?(5000):(6000))
+ options: no-clear-notification
+    info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+      to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index 4a1217239..b255d35f9 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -1,5 +1,5 @@
 
- # Make sure Pi-hole is responding.
+# Make sure Pi-hole is responding.
 
 template: pihole_last_collected_secs
       on: pihole.dns_queries_total
@@ -12,56 +12,54 @@ template: pihole_last_collected_secs
     info: number of seconds since the last successful data collection
       to: webmaster
 
-  # Blocked DNS queries.
+# Blocked DNS queries.
 
- template: pihole_blocked_queries
-       on: pihole.dns_queries_percentage
-    every: 10s
-    units: %
-     calc: $blocked
-     warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
-     crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
-    delay: up 2m down 5m
-     info: percentage of blocked dns queries for the last 24 hour
-       to: sysadmin
-
-
-  # Blocklist last update time.
-  # Default update interval is a week.
+template: pihole_blocked_queries
+      on: pihole.dns_queries_percentage
+   every: 10s
+   units: %
+    calc: $blocked
+    warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+    crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+   delay: up 2m down 5m
+    info: percentage of blocked dns queries for the last 24 hours
+      to: sysadmin
 
- template: pihole_blocklist_last_update
-       on: pihole.blocklist_last_update
-    every: 10s
-    units: seconds
-     calc: $ago
-     warn: $this > 60 * 60 * 24 * 8
-     crit: $this > 60 * 60 * 24 * 8 * 2
-     info: blocklist last update time
-       to: sysadmin
 
+# Blocklist last update time.
+# Default update interval is a week.
 
-  # Gravity file check (gravity.list).
+template: pihole_blocklist_last_update
+      on: pihole.blocklist_last_update
+   every: 10s
+   units: seconds
+    calc: $ago
+    warn: $this > 60 * 60 * 24 * 8
+    crit: $this > 60 * 60 * 24 * 8 * 2
+    info: blocklist last update time
+      to: sysadmin
 
- template: pihole_blocklist_gravity_file
-       on: pihole.blocklist_last_update
-    every: 10s
-    units: boolean
-     calc: $file_exists
-     crit: $this != 1
-    delay: up 2m down 5m
-     info: gravity file existence
-       to: sysadmin
+# Gravity file check (gravity.list).
 
+template: pihole_blocklist_gravity_file
+      on: pihole.blocklist_last_update
+   every: 10s
+   units: boolean
+    calc: $file_exists
+    crit: $this != 1
+   delay: up 2m down 5m
+    info: gravity file existence
+      to: sysadmin
 
-  # Pi-hole's ability to block unwanted domains.
-  # Should be enabled. The whole point of Pi-hole!
+# Pi-hole's ability to block unwanted domains.
+# Should be enabled. The whole point of Pi-hole!
 
- template: pihole_status
-       on: pihole.unwanted_domains_blocking_status
-    every: 10s
-    units: boolean
-     calc: $enabled
-     warn: $this != 1
-    delay: up 2m down 5m
-     info: unwanted domains blocking status
-       to: sysadmin
+template: pihole_status
+      on: pihole.unwanted_domains_blocking_status
+   every: 10s
+   units: boolean
+    calc: $enabled
+    warn: $this != 1
+   delay: up 2m down 5m
+    info: unwanted domains blocking status
+      to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 4e41bb496..15e8e8464 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -37,28 +37,28 @@
       to: sysadmin
 
 ## FreeBSD
-alarm: ram_in_use
-   on: system.ram
-   os: freebsd
-hosts: *
- calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
-units: %
-every: 10s
- warn: $this > (($status >= $WARNING)  ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
-delay: down 15m multiplier 1.5 max 1h
- info: system RAM usage
-   to: sysadmin
+   alarm: ram_in_use
+      on: system.ram
+      os: freebsd
+   hosts: *
+    calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: system RAM usage
+      to: sysadmin
 
- alarm: ram_available
-    on: system.ram
-    os: freebsd
- hosts: *
-  calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
- units: %
- every: 10s
-  warn: $this < (($status >= $WARNING)  ? (15) : (10))
-  crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
- delay: down 15m multiplier 1.5 max 1h
-  info: estimated amount of RAM available for userspace processes, without causing swapping
-    to: sysadmin
+   alarm: ram_available
+      on: system.ram
+      os: freebsd
+   hosts: *
+    calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
+   units: %
+   every: 10s
+    warn: $this < (($status >= $WARNING)  ? (15) : (10))
+    crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+   delay: down 15m multiplier 1.5 max 1h
+    info: estimated amount of RAM available for userspace processes, without causing swapping
+      to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index 77c804bfd..ff3648626 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -3,38 +3,38 @@
 
 # check for common /proc/net/softnet_stat errors
 
-   alarm: 10min_netdev_backlog_exceeded
+   alarm: 1min_netdev_backlog_exceeded
       on: system.softnet_stat
       os: linux
    hosts: *
-  lookup: sum -10m unaligned absolute of dropped
+  lookup: average -1m unaligned absolute of dropped
    units: packets
-   every: 1m
-    warn: $this > 0
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+    info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
       to: sysadmin
 
-   alarm: 10min_netdev_budget_ran_outs
+   alarm: 1min_netdev_budget_ran_outs
       on: system.softnet_stat
       os: linux
    hosts: *
-  lookup: sum -10m unaligned absolute of squeezed
+  lookup: average -1m unaligned absolute of squeezed
    units: events
-   every: 1m
-    warn: $this > (($status >= $WARNING)  ? (0) : (10))
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+    info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
       to: silent
 
    alarm: 10min_netisr_backlog_exceeded
       on: system.softnet_stat
       os: freebsd
    hosts: *
-   lookup: sum -10m unaligned absolute of qdrops
+  lookup: average -1m unaligned absolute of qdrops
    units: packets
-   every: 1m
-    warn: $this > 0
+   every: 10s
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: number of drops in the last 10min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+    info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
       to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 552930ab7..3b3072577 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -22,12 +22,13 @@
       on: ip.tcp_accept_queue
       os: linux
    hosts: *
-  lookup: sum -60s unaligned absolute of ListenOverflows
+  lookup: average -60s unaligned absolute of ListenOverflows
    units: overflows
    every: 10s
-    crit: $this > 0
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+    info: the average number of times the TCP accept queue of the kernel overflowed, during the last minute
       to: sysadmin
 
 # THIS IS TOO GENERIC
@@ -36,13 +37,13 @@
       on: ip.tcp_accept_queue
       os: linux
    hosts: *
-  lookup: sum -60s unaligned absolute of ListenDrops
+  lookup: average -60s unaligned absolute of ListenDrops
    units: drops
    every: 10s
-#    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (150))
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+    info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
       to: sysadmin
 
 
@@ -58,12 +59,12 @@
       on: ip.tcp_syn_queue
       os: linux
    hosts: *
-  lookup: sum -60s unaligned absolute of TCPReqQFullDrop
+  lookup: average -60s unaligned absolute of TCPReqQFullDrop
    units: drops
    every: 10s
-    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (60))
-   delay: up 0 down 5m multiplier 1.5 max 1h
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (0) : (5))
+   delay: up 10 down 5m multiplier 1.5 max 1h
     info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute
       to: sysadmin
 
@@ -71,12 +72,12 @@
       on: ip.tcp_syn_queue
       os: linux
    hosts: *
-  lookup: sum -60s unaligned absolute of TCPReqQFullDoCookies
+  lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
    units: cookies
    every: 10s
-    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (60))
-   delay: up 0 down 5m multiplier 1.5 max 1h
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (0) : (5))
+   delay: up 10 down 5m multiplier 1.5 max 1h
     info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute
       to: sysadmin
 
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 5140228f5..1e47b5c8b 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -23,12 +23,12 @@
       on: ipv4.udperrors
       os: linux freebsd
    hosts: *
-  lookup: sum -1m unaligned absolute of RcvbufErrors
+  lookup: average -1m unaligned absolute of RcvbufErrors
    units: errors
    every: 10s
-    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (100))
-    info: number of UDP receive buffer errors during the last minute
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (0) : (10))
+    info: average number of UDP receive buffer errors during the last minute
    delay: up 0 down 60m multiplier 1.2 max 2h
       to: sysadmin
 
@@ -39,11 +39,11 @@
       on: ipv4.udperrors
       os: linux
    hosts: *
-  lookup: sum -1m unaligned absolute of SndbufErrors
+  lookup: average -1m unaligned absolute of SndbufErrors
    units: errors
    every: 10s
-    warn: $this > 0
-    crit: $this > (($status == $CRITICAL) ? (0) : (100))
+    warn: $this > 1
+    crit: $this > (($status == $CRITICAL) ? (0) : (10))
     info: number of UDP send buffer errors during the last minute
    delay: up 0 down 60m multiplier 1.2 max 2h
       to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 031adc2ea..1aefd7b00 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -43,7 +43,7 @@ families: *
     warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
     crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of successful HTTP responses (1xx, 2xx, 304) over the last minute
+    info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
       to: webmaster
 
 template: 1m_redirects
@@ -69,7 +69,7 @@ families: *
     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
     crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP bad requests (4xx) over the last minute
+    info: the ratio of HTTP bad requests (4xx except 401) over the last minute
       to: webmaster
 
 template: 1m_internal_errors
@@ -191,3 +191,197 @@ options: no-clear-notification
           (clear notification for this alarm will not be sent)
       to: webmaster
 
+
+
+# ---------------------------------------------------GO-VERSION---------------------------------------------------------
+
+# make sure we can collect web log data
+
+template: web_log_last_collected_secs
+      on: web_log.requests
+families: *
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
+# unmatched lines
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $web_log_1m_total_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: web_log_1m_total_requests
+      on: web_log.requests
+families: *
+  lookup: sum -1m unaligned
+    calc: ($this == 0)?(1):($this)
+   units: requests
+   every: 10s
+    info: the sum of all HTTP requests over the last minute
+
+template: web_log_1m_unmatched
+      on: web_log.excluded_requests
+families: *
+  lookup: sum -1m unaligned of unmatched
+    calc: $this * 100 / $web_log_1m_total_requests
+   units: %
+   every: 10s
+    warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
+    crit: ($web_log_1m_total_requests > 120) ? ($this > 5) : ( 0 )
+   delay: up 1m down 5m multiplier 1.5 max 1h
+    info: the ratio of unmatched lines, over the last minute
+      to: webmaster
+
+# -----------------------------------------------------------------------------
+# high level response code alarms
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $web_log_1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: web_log_1m_requests
+      on: web_log.type_requests
+families: *
+  lookup: sum -1m unaligned
+    calc: ($this == 0)?(1):($this)
+   units: requests
+   every: 10s
+    info: the sum of all HTTP requests over the last minute
+
+template: web_log_1m_successful
+      on: web_log.type_requests
+families: *
+  lookup: sum -1m unaligned of success
+    calc: $this * 100 / $web_log_1m_requests
+   units: %
+   every: 10s
+    warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+    crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+   delay: up 2m down 15m multiplier 1.5 max 1h
+    info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
+      to: webmaster
+
+template: web_log_1m_redirects
+      on: web_log.type_requests
+families: *
+  lookup: sum -1m unaligned of redirect
+    calc: $this * 100 / $web_log_1m_requests
+   units: %
+   every: 10s
+    warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
+    crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
+   delay: up 2m down 15m multiplier 1.5 max 1h
+    info: the ratio of HTTP redirects (3xx except 304) over the last minute
+      to: webmaster
+
+template: web_log_1m_bad_requests
+      on: web_log.type_requests
+families: *
+  lookup: sum -1m unaligned of bad
+    calc: $this * 100 / $web_log_1m_requests
+   units: %
+   every: 10s
+    warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
+    crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
+   delay: up 2m down 15m multiplier 1.5 max 1h
+    info: the ratio of HTTP bad requests (4xx except 401) over the last minute
+      to: webmaster
+
+template: web_log_1m_internal_errors
+      on: web_log.type_requests
+families: *
+  lookup: sum -1m unaligned of error
+    calc: $this * 100 / $web_log_1m_requests
+   units: %
+   every: 10s
+    warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
+    crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+   delay: up 2m down 15m multiplier 1.5 max 1h
+    info: the ratio of HTTP internal server errors (5xx), over the last minute
+      to: webmaster
+
+# -----------------------------------------------------------------------------
+# web slow
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $web_log_1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+template: web_log_10m_response_time
+      on: web_log.request_processing_time
+families: *
+  lookup: average -10m unaligned of avg
+   units: ms
+   every: 30s
+    info: the average time to respond to HTTP requests, over the last 10 minutes
+
+template: web_log_web_slow
+      on: web_log.request_processing_time
+families: *
+  lookup: average -1m unaligned of avg
+   units: ms
+   every: 10s
+   green: 500
+     red: 1000
+    warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
+    crit: ($web_log_1m_requests > 120) ? ($this > $red   && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
+   delay: down 15m multiplier 1.5 max 1h
+    info: the average time to respond to HTTP requests, over the last 1 minute
+ options: no-clear-notification
+      to: webmaster
+
+# -----------------------------------------------------------------------------
+# web too many or too few requests
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $web_log_5m_successful_old > 120
+#
+# i.e. when there were at least 120 requests during the 5 minutes starting
+#      at -10m and ending at -5m
+
+template: web_log_5m_successful_old
+      on: web_log.type_requests
+families: *
+  lookup: average -5m at -5m unaligned of success
+   units: requests/s
+   every: 30s
+    info: average rate of successful HTTP requests over the last 5 minutes
+
+template: web_log_5m_successful
+      on: web_log.type_requests
+families: *
+  lookup: average -5m unaligned of success
+   units: requests/s
+   every: 30s
+    info: average successful HTTP requests over the last 5 minutes
+
+template: web_log_5m_requests_ratio
+      on: web_log.type_requests
+families: *
+    calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
+   units: %
+   every: 30s
+    warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+    crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+   delay: down 15m multiplier 1.5 max 1h
+options: no-clear-notification
+    info: the percentage of successful web requests over the last 5 minutes, \
+          compared with the previous 5 minutes \
+          (clear notification for this alarm will not be sent)
+      to: webmaster
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2019-11-28 04:53:29 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2019-11-28 04:53:29 +0000
commit	17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99 (patch)
tree	3e0c96613972e8bb4afdeeb97a034806363ddfa9 /health/health.d
parent	Releasing debian version 1.18.1-1. (diff)
download	netdata-17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99.tar.xz netdata-17c93e2be4ad7b3af0cd6878bdd5d8a4a3e6da99.zip