Merge tag 'upstream/1.5.0+dfsg'

Upstream version 1.5.0+dfsg
author: Lennart Weller <lhw@ring0.de> 2017-01-24 15:21:16 +0000
committer: Lennart Weller <lhw@ring0.de> 2017-01-24 15:21:16 +0000
commit: ef0c127e7f95d2db2715b9e99fe758eebc7dabd3 (patch)
tree: ea5d62342aba06f376f3be63aab898503b56f3ec /conf.d/health.d
parent: update watch file and files-exclude (diff)
parent: New upstream version 1.5.0+dfsg (diff)
download: netdata-ef0c127e7f95d2db2715b9e99fe758eebc7dabd3.tar.xz
netdata-ef0c127e7f95d2db2715b9e99fe758eebc7dabd3.zip
29 files changed, 517 insertions, 67 deletions
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf
index 0aaf0e00..0c98b877 100644
--- a/conf.d/health.d/apache.conf
+++ b/conf.d/health.d/apache.conf
@@ -6,8 +6,8 @@ template: apache_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: webmaster
diff --git a/conf.d/health.d/backend.conf b/conf.d/health.d/backend.conf
new file mode 100644
index 00000000..9c193e7b
--- /dev/null
+++ b/conf.d/health.d/backend.conf
@@ -0,0 +1,45 @@
+
+# make sure we are sending data to backend
+
+   alarm: backend_last_buffering
+      on: netdata.backend_metrics
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful buffering of backend data
+      to: dba
+
+   alarm: backend_metrics_sent
+      on: netdata.backend_metrics
+   units: %
+    calc: abs($sent) * 100 / abs($buffered)
+   every: 10s
+    warn: $this != 100
+   delay: down 5m multiplier 1.5 max 1h
+    info: percentage of metrics sent to the backend server
+      to: dba
+
+   alarm: backend_metrics_lost
+      on: netdata.backend_metrics
+   units: metrics
+    calc: abs($lost)
+   every: 10s
+    crit: $this != 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of metrics lost due to repeating failures to contact the backend server
+      to: dba
+
+# this chart has been removed from netdata
+#   alarm: backend_slow
+#      on: netdata.backend_latency
+#   units: %
+#    calc: $latency * 100 / ($update_every * 1000)
+#   every: 10s
+#    warn: $this > 50
+#    crit: $this > 100
+#   delay: down 5m multiplier 1.5 max 1h
+#    info: the percentage of time between iterations needed by the backend to process the data sent by netdata
+#      to: dba
diff --git a/conf.d/health.d/bind_rndc.conf b/conf.d/health.d/bind_rndc.conf
new file mode 100644
index 00000000..028bc9d0
--- /dev/null
+++ b/conf.d/health.d/bind_rndc.conf
@@ -0,0 +1,9 @@
+ alarm: bind_rndc_stats_file_size
+      on: bind_rndc.stats_size
+   units: megabytes
+   every: 60
+    calc: $stats_size
+    warn: $this > 512
+    crit: $this > 1024
+    info: Bind stats file is very large! Consider to create logrotate conf file for it!
+      to: sysadmin
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
index 4d79fc79..60f494d7 100644
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -4,8 +4,8 @@ template: 10min_cpu_usage
   lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice
    units: %
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    warn: $this > (($status >= $WARNING)  ? (75) : (85))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
     info: average cpu utilization for the last 10 minutes
       to: sysadmin
@@ -15,8 +15,8 @@ template: 10min_cpu_iowait
   lookup: average -10m unaligned of iowait
    units: %
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-    crit: $this > (($status == $CRITICAL) ? (20) : (30))
+    warn: $this > (($status >= $WARNING)  ? (20) : (40))
+    crit: $this > (($status == $CRITICAL) ? (40) : (50))
    delay: down 15m multiplier 1.5 max 1h
     info: average CPU wait I/O for the last 10 minutes
       to: sysadmin
@@ -28,6 +28,6 @@ template: 20min_steal_cpu
    every: 5m
     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
     crit: $this > (($status == $CRITICAL) ? (20) : (30))
-   delay: down 15m multiplier 1.5 max 1h
+   delay: down 1h multiplier 1.5 max 2h
     info: average CPU steal time for the last 20 minutes
       to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
index cc7a4766..0549bac2 100644
--- a/conf.d/health.d/disks.conf
+++ b/conf.d/health.d/disks.conf
@@ -4,11 +4,12 @@
 # for mount points
 template: disk_space_last_collected_secs
       on: disk.space
+families: *
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection of the mount point
       to: sysadmin
@@ -16,11 +17,12 @@ template: disk_space_last_collected_secs
 # for block devices
 template: disk_last_collected_secs
       on: disk.io
+families: *
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection of the block device
       to: sysadmin
@@ -35,22 +37,24 @@ template: disk_last_collected_secs
 
 template: disk_space_usage
       on: disk.space
+families: *
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
-    warn: $this > (($status >= $WARNING ) ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    warn: $this > (($status >= $WARNING ) ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: up 1m down 15m multiplier 1.5 max 1h
     info: current disk space usage
       to: sysadmin
 
 template: disk_inode_usage
       on: disk.inodes
+families: *
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (75) : (80))
-    crit: $this > (($status == $CRITICAL) ? (90) : (95))
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: up 1m down 15m multiplier 1.5 max 1h
     info: current disk inode usage
       to: sysadmin
@@ -69,6 +73,7 @@ template: disk_inode_usage
 
 template: disk_fill_rate
       on: disk.space
+families: *
   lookup: min -10m at -50m unaligned of avail
     calc: ($this - $avail) / (($now - $after) / 3600)
    every: 1m
@@ -82,7 +87,8 @@ template: disk_fill_rate
 
 template: out_of_disk_space_time
       on: disk.space
-    calc: $avail / $disk_fill_rate
+families: *
+    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (0)
    units: hours
    every: 10s
     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
@@ -101,6 +107,7 @@ template: out_of_disk_space_time
 
 template: 10min_disk_utilization
       on: disk.util
+families: *
   lookup: average -10m unaligned
    units: %
    every: 1m
@@ -120,6 +127,7 @@ template: 10min_disk_utilization
 
 template: 10min_disk_backlog
       on: disk.backlog
+families: *
   lookup: average -10m unaligned
    units: ms
    every: 1m
diff --git a/conf.d/health.d/elasticsearch.conf b/conf.d/health.d/elasticsearch.conf
new file mode 100644
index 00000000..dffd4096
--- /dev/null
+++ b/conf.d/health.d/elasticsearch.conf
@@ -0,0 +1,9 @@
+   alarm: elasticsearch_last_collected
+      on: elasticsearch_local.cluster_health_status
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    info: number of seconds since the last successful data collection
+      to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
index d0eca8a6..5dd8af50 100644
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -3,12 +3,12 @@
 # the alarm is checked every 1 minute
 # and examines the last hour of data
 
-   alarm: 1hour_lowest_entropy
+   alarm: lowest_entropy
       on: system.entropy
-  lookup: min -1h unaligned
+  lookup: min -10m unaligned
    units: entries
    every: 5m
     warn: $this < (($status >= $WARNING) ? (200) : (100))
-   delay: down 1h multiplier 1.5 max 1h
-    info: minimum entries in the random numbers pool in the last 30 minutes
+   delay: down 1h multiplier 1.5 max 2h
+    info: minimum entries in the random numbers pool in the last 10 minutes
       to: silent
diff --git a/conf.d/health.d/haproxy.conf b/conf.d/health.d/haproxy.conf
new file mode 100644
index 00000000..e49c70d4
--- /dev/null
+++ b/conf.d/health.d/haproxy.conf
@@ -0,0 +1,27 @@
+template: haproxy_backend_server_status
+      on: haproxy_hs.down
+   units: failed servers
+   every: 10s
+  lookup: average -10s
+    crit: $this > 0
+    info: number of failed haproxy backend servers
+      to: sysadmin
+
+template: haproxy_backend_status
+      on: haproxy_hb.down
+   units: failed backend
+   every: 10s
+  lookup: average -10s
+    crit: $this > 0
+    info: number of failed haproxy backends
+      to: sysadmin
+
+template: haproxy_last_collected
+      on: haproxy_hb.down
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    info: number of seconds since the last successful data collection
+      to: sysadmin
diff --git a/conf.d/health.d/ipc.conf b/conf.d/health.d/ipc.conf
new file mode 100644
index 00000000..ee7c4bad
--- /dev/null
+++ b/conf.d/health.d/ipc.conf
@@ -0,0 +1,22 @@
+
+   alarm: semaphores_used
+      on: system.ipc_semaphores
+    calc: $semaphores * 100 / $ipc.semaphores.max
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (70) : (90))
+   delay: down 5m multiplier 1.5 max 1h
+    info: the percentage of IPC semaphores used
+      to: sysadmin
+
+   alarm: semaphore_arrays_used
+      on: system.ipc_semaphore_arrays
+    calc: $arrays * 100 / $ipc.semaphores.arrays.max
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (70) : (90))
+   delay: down 5m multiplier 1.5 max 1h
+    info: the percentage of IPC semaphore arrays used
+      to: sysadmin
diff --git a/conf.d/health.d/ipfs.conf b/conf.d/health.d/ipfs.conf
new file mode 100644
index 00000000..3f77572d
--- /dev/null
+++ b/conf.d/health.d/ipfs.conf
@@ -0,0 +1,11 @@
+
+template: ipfs_datastore_usage
+      on: ipfs.repo_size
+    calc: $size * 100 / $avail
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: ipfs Datastore close to running out of space
+      to: sysadmin
diff --git a/conf.d/health.d/isc_dhcpd.conf b/conf.d/health.d/isc_dhcpd.conf
new file mode 100644
index 00000000..4345619a
--- /dev/null
+++ b/conf.d/health.d/isc_dhcpd.conf
@@ -0,0 +1,10 @@
+ alarm: isc_dhcpd_parse_time
+      on: isc_dhcpd.parse_time
+   units: ms
+   every: 60
+    calc: $ptime
+    warn: $this > 100
+    crit: $this > 250
+   delay: up 2m down 5m
+    info: Parsing too slow! It can slow down your server. Check dhcpd.leases file size.
+      to: sysadmin
diff --git a/conf.d/health.d/mdstat.conf b/conf.d/health.d/mdstat.conf
new file mode 100644
index 00000000..c9e7d20d
--- /dev/null
+++ b/conf.d/health.d/mdstat.conf
@@ -0,0 +1,18 @@
+template: mdstat_disks
+      on: md.disks
+   units: failed devices
+   every: 10s
+    calc: $total - $inuse
+    crit: $this > 0
+    info: Array is degraded!
+      to: sysadmin
+
+template: mdstat_last_collected
+      on: md.disks
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    info: number of seconds since the last successful data collection
+      to: sysadmin
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf
index 46a8ca0e..7917e36a 100644
--- a/conf.d/health.d/memcached.conf
+++ b/conf.d/health.d/memcached.conf
@@ -6,8 +6,8 @@ template: memcached_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: dba
@@ -42,7 +42,7 @@ template: cache_fill_rate
 
 template: out_of_cache_space_time
       on: memcached.cache
-    calc: $available / $cache_fill_rate
+    calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (0)
    units: hours
    every: 10s
     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf
new file mode 100644
index 00000000..3c904f6b
--- /dev/null
+++ b/conf.d/health.d/memory.conf
@@ -0,0 +1,30 @@
+
+   alarm: 1hour_ecc_memory_correctable
+      on: mem.ecc_ce
+  lookup: sum -10m unaligned
+   units: errors
+   every: 1m
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 1h
+    info: number of ECC correctable errors during the last 10 minutes
+      to: sysadmin
+
+   alarm: 1hour_ecc_memory_uncorrectable
+      on: mem.ecc_ue
+  lookup: sum -10m unaligned
+   units: errors
+   every: 1m
+    crit: $this > 0
+   delay: down 1h multiplier 1.5 max 1h
+    info: number of ECC uncorrectable errors during the last 10 minutes
+      to: sysadmin
+
+   alarm: 1hour_memory_hw_corrupted
+      on: mem.hwcorrupt
+    calc: $HardwareCorrupted
+   units: MB
+   every: 10s
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 1h
+    info: amount of memory corrupted due to a hardware failure
+      to: sysadmin
diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf
index a2cfa3ec..78773e5b 100644
--- a/conf.d/health.d/mysql.conf
+++ b/conf.d/health.d/mysql.conf
@@ -6,8 +6,80 @@ template: mysql_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: dba
+
+
+# -----------------------------------------------------------------------------
+# slow queries
+
+template: mysql_10s_slow_queries
+      on: mysql.queries
+  lookup: sum -10s of slow_queries
+   units: slow queries
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (10) : (20))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of mysql slow queries over the last 10 seconds
+      to: dba
+
+
+# -----------------------------------------------------------------------------
+# lock waits
+
+template: mysql_10s_table_locks_immediate
+      on: mysql.table_locks
+  lookup: sum -10s absolute of immediate
+   units: immediate locks
+   every: 10s
+    info: number of table immediate locks over the last 10 seconds
+      to: dba
+
+template: mysql_10s_table_locks_waited
+      on: mysql.table_locks
+  lookup: sum -10s absolute of waited
+   units: waited locks
+   every: 10s
+    info: number of table waited locks over the last 10 seconds
+      to: dba
+
+template: mysql_10s_waited_locks_ratio
+      on: mysql.table_locks
+    calc: (($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : (0)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (10) : (25))
+    crit: $this > (($status == $CRITICAL) ? (25) : (50))
+   delay: down 30m multiplier 1.5 max 1h
+    info: the ratio of mysql waited table locks, for the last 10 seconds
+      to: dba
+
+
+# -----------------------------------------------------------------------------
+# replication
+
+template: mysql_replication
+      on: mysql.slave_status
+    calc: ($sql_running == -1 OR $io_running == -1)?0:1
+   units: status
+   every: 10s
+    crit: $this == 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: checks if mysql replication has stopped
+      to: dba
+
+template: mysql_replication_lag
+      on: mysql.slave_behind
+    calc: $seconds
+   units: seconds
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (10) : (30))
+   delay: down 15m multiplier 1.5 max 1h
+    info: the number of seconds mysql replication is behind this master
+      to: dba
+
diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf
index f2eaa83c..4fc65c8e 100644
--- a/conf.d/health.d/named.conf
+++ b/conf.d/health.d/named.conf
@@ -6,8 +6,8 @@ template: named_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: domainadmin
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index 7753aa18..924acccc 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -3,46 +3,119 @@
 
 template: interface_last_collected_secs
       on: net.net
+families: *
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: sysadmin
 
 
 # -----------------------------------------------------------------------------
+# dropped packets
 
 # check if an interface is dropping packets
 # the alarm is checked every 1 minute
-# and examines the last hour of data
+# and examines the last 10 minutes of data
 
-template: 1hour_packet_drops
+template: inbound_packets_dropped
       on: net.drops
-  lookup: sum -1h unaligned absolute
+families: *
+  lookup: sum -10m unaligned absolute of inbound
    units: packets
    every: 1m
     warn: $this > 0
-   delay: down 30m multiplier 1.5 max 1h
-    info: interface dropped packets in the last hour
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface inbound dropped packets in the last 10 minutes
+      to: sysadmin
+
+template: outbound_packets_dropped
+      on: net.drops
+families: *
+  lookup: sum -10m unaligned absolute of outbound
+   units: packets
+   every: 1m
+    warn: $this > 0
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface outbound dropped packets in the last 10 minutes
+      to: sysadmin
+
+template: inbound_packets_dropped_ratio
+      on: net.packets
+families: *
+  lookup: sum -10m unaligned absolute of received
+    calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
+   units: %
+   every: 1m
+    warn: $this > 0.5
+    crit: $this > 3
+   delay: down 1h multiplier 1.5 max 2h
+    info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
+      to: sysadmin
+
+template: outbound_packets_dropped_ratio
+      on: net.packets
+families: *
+  lookup: sum -10m unaligned absolute of sent
+    calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
+   units: %
+   every: 1m
+    warn: $this > 0.5
+    crit: $this > 3
+   delay: down 1h multiplier 1.5 max 2h
+    info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
       to: sysadmin
 
 
 # -----------------------------------------------------------------------------
+# FIFO errors
 
 # check if an interface is having FIFO
 # buffer errors
 # the alarm is checked every 1 minute
-# and examines the last hour of data
+# and examines the last 10 minutes of data
 
-template: 1hour_fifo_errors
+template: 10min_fifo_errors
       on: net.fifo
-  lookup: sum -1h unaligned absolute
+families: *
+  lookup: sum -10m unaligned absolute
    units: errors
    every: 1m
     warn: $this > 0
-   delay: down 30m multiplier 1.5 max 1h
-    info: interface fifo errors in the last hour
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface fifo errors in the last 10 minutes
       to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is 10x or 20x the first
+# we assume the minimum packet storm should at least have
+# 10000 packets/s, average of the last 10 seconds
+
+template: 1m_received_packets_rate
+      on: net.packets
+families: *
+  lookup: average -1m of received
+   units: packets
+   every: 10s
+    info: the average number of packets received during the last minute
+
+template: 10s_received_packets_storm
+      on: net.packets
+families: *
+  lookup: average -10s of received
+    calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
+   every: 10s
+   units: %
+    warn: $this > (($status >= $WARNING)?(200):(1000))
+    crit: $this > (($status == $CRITICAL)?(1000):(2000))
+    info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute
+      to: silent
+
diff --git a/conf.d/health.d/netfilter.conf b/conf.d/health.d/netfilter.conf
new file mode 100644
index 00000000..3dd6a67b
--- /dev/null
+++ b/conf.d/health.d/netfilter.conf
@@ -0,0 +1,23 @@
+
+   alarm: netfilter_last_collected_secs
+      on: netfilter.conntrack_sockets
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+   alarm: netfilter_conntrack_full
+      on: netfilter.conntrack_sockets
+  lookup: max -10s unaligned of connections
+    calc: $this * 100 / $netfilter.conntrack.max
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 5m multiplier 1.5 max 1h
+    info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
+      to: sysadmin
diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf
index d70d6a59..a686c3d9 100644
--- a/conf.d/health.d/nginx.conf
+++ b/conf.d/health.d/nginx.conf
@@ -6,8 +6,8 @@ template: nginx_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: webmaster
diff --git a/conf.d/health.d/postgres.conf b/conf.d/health.d/postgres.conf
new file mode 100644
index 00000000..4e0583b8
--- /dev/null
+++ b/conf.d/health.d/postgres.conf
@@ -0,0 +1,13 @@
+
+# make sure postgres is running
+
+template: postgres_last_collected_secs
+      on: postgres.db_stat_transactions
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index 216b82fe..d60df75b 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -4,8 +4,8 @@
     calc: $used * 100 / ($used + $cached + $free)
    units: %
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
     info: system RAM usage
       to: sysadmin
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
index 3e648d85..5f6d397e 100644
--- a/conf.d/health.d/redis.conf
+++ b/conf.d/health.d/redis.conf
@@ -6,8 +6,8 @@ template: redis_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: dba
diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf
index 1af7b468..2344b60e 100644
--- a/conf.d/health.d/retroshare.conf
+++ b/conf.d/health.d/retroshare.conf
@@ -5,8 +5,8 @@ template: retroshare_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: sysadmin
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
index 0c3709f4..5faf9a9e 100644
--- a/conf.d/health.d/softnet.conf
+++ b/conf.d/health.d/softnet.conf
@@ -1,21 +1,21 @@
 # check for common /proc/net/softnet_stat errors
 
-   alarm: 1hour_netdev_backlog_exceeded
+   alarm: 10min_netdev_backlog_exceeded
       on: system.softnet_stat
-  lookup: sum -1h unaligned absolute of dropped
+  lookup: sum -10m unaligned absolute of dropped
    units: packets
    every: 1m
     warn: $this > 0
-   delay: down 30m multiplier 1.5 max 1h
-    info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+   delay: down 1h multiplier 1.5 max 2h
+    info: number of packets dropped in the last 10min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
       to: sysadmin
 
-   alarm: 1hour_netdev_budget_ran_outs
+   alarm: 10min_netdev_budget_ran_outs
       on: system.softnet_stat
-  lookup: sum -1h unaligned absolute of squeezed
+  lookup: sum -10m unaligned absolute of squeezed
    units: events
    every: 1m
-    warn: $this > 0
-   delay: down 30m multiplier 1.5 max 1h
-    info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
+    warn: $this > (($status >= $WARNING)  ? (0) : (10))
+   delay: down 1h multiplier 1.5 max 2h
+    info: number of times, during the last 10min, ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
       to: silent
diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf
index 76143c5d..06cc9678 100644
--- a/conf.d/health.d/squid.conf
+++ b/conf.d/health.d/squid.conf
@@ -6,8 +6,8 @@ template: squid_last_collected_secs
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: proxyadmin
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
index 0cfa888c..7f57560e 100644
--- a/conf.d/health.d/swap.conf
+++ b/conf.d/health.d/swap.conf
@@ -6,13 +6,13 @@
     calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
    units: % of RAM
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
-    crit: $this > (($status == $CRITICAL) ? (15) : (20))
+    warn: $this > (($status >= $WARNING)  ? (10) : (20))
+    crit: $this > (($status == $CRITICAL) ? (20) : (30))
    delay: up 0 down 15m multiplier 1.5 max 1h
     info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
       to: sysadmin
 
-   alarm: used_swap_space
+   alarm: ram_in_swap
       on: system.swap
     calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
    units: % of RAM
@@ -22,3 +22,14 @@
    delay: up 0 down 15m multiplier 1.5 max 1h
     info: the swap memory used, as a percentage of the system RAM
       to: sysadmin
+
+   alarm: used_swap
+      on: system.swap
+    calc: $used * 100 / ( $used + $free )
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: the percentage of swap memory used
+      to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
index 8e93c479..daf24a1c 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/conf.d/health.d/tcp_resets.conf
@@ -5,28 +5,48 @@
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: up 0 down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: sysadmin
 
 # -----------------------------------------------------------------------------
+# tcp resets this host sends
 
-   alarm: 1m_ipv4_tcp_resets
+   alarm: 1m_ipv4_tcp_resets_sent
       on: ipv4.tcphandshake
   lookup: average -1m at -10s unaligned absolute of OutRsts
    units: tcp resets/s
    every: 10s
     info: average TCP RESETS this host is sending, over the last minute
 
-   alarm: 10s_ipv4_tcp_resets
+   alarm: 10s_ipv4_tcp_resets_sent
       on: ipv4.tcphandshake
   lookup: average -10s unaligned absolute of OutRsts
    units: tcp resets/s
    every: 10s
-    warn: $this > ((($1m_ipv4_tcp_resets < 5)?(5):($1m_ipv4_tcp_resets)) * (($status >= $WARNING)  ? (1) : (4)))
+    warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (4)))
    delay: up 0 down 60m multiplier 1.2 max 2h
     info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed)
-      to: sysadmin
+      to: silent
+
+# -----------------------------------------------------------------------------
+# tcp resets this host receives
+
+   alarm: 1m_ipv4_tcp_resets_received
+      on: ipv4.tcphandshake
+  lookup: average -1m at -10s unaligned absolute of AttemptFails
+   units: tcp resets/s
+   every: 10s
+    info: average TCP RESETS this host is receiving, over the last minute
 
+   alarm: 10s_ipv4_tcp_resets_received
+      on: ipv4.tcphandshake
+  lookup: average -10s unaligned absolute of AttemptFails
+   units: tcp resets/s
+   every: 10s
+    warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (4)))
+   delay: up 0 down 60m multiplier 1.2 max 2h
+    info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs has crashed)
+      to: silent
diff --git a/conf.d/health.d/udp_errors.conf b/conf.d/health.d/udp_errors.conf
new file mode 100644
index 00000000..98e955c0
--- /dev/null
+++ b/conf.d/health.d/udp_errors.conf
@@ -0,0 +1,40 @@
+# -----------------------------------------------------------------------------
+
+   alarm: ipv4_udperrors_last_collected_secs
+      on: ipv4.udperrors
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# -----------------------------------------------------------------------------
+# UDP receive buffer errors
+
+   alarm: 1m_ipv4_udp_receive_buffer_errors
+      on: ipv4.udperrors
+  lookup: sum -1m unaligned absolute of RcvbufErrors
+   units: errors
+   every: 10s
+    warn: $this > 0
+    crit: $this > 100
+    info: number of UDP receive buffer errors during the last minute
+   delay: up 0 down 60m multiplier 1.2 max 2h
+      to: sysadmin
+
+# -----------------------------------------------------------------------------
+# UDP send buffer errors
+
+   alarm: 1m_ipv4_udp_send_buffer_errors
+      on: ipv4.udperrors
+  lookup: sum -1m unaligned absolute of SndbufErrors
+   units: errors
+   every: 10s
+    warn: $this > 0
+    crit: $this > 100
+    info: number of UDP send buffer errors during the last minute
+   delay: up 0 down 60m multiplier 1.2 max 2h
+      to: sysadmin
diff --git a/conf.d/health.d/varnish.conf b/conf.d/health.d/varnish.conf
new file mode 100644
index 00000000..cca7446b
--- /dev/null
+++ b/conf.d/health.d/varnish.conf
@@ -0,0 +1,9 @@
+   alarm: varnish_last_collected
+      on: varnish.uptime
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    info: number of seconds since the last successful data collection
+      to: sysadmin
author	Lennart Weller <lhw@ring0.de>	2017-01-24 15:21:16 +0000
committer	Lennart Weller <lhw@ring0.de>	2017-01-24 15:21:16 +0000
commit	ef0c127e7f95d2db2715b9e99fe758eebc7dabd3 (patch)
tree	ea5d62342aba06f376f3be63aab898503b56f3ec /conf.d/health.d
parent	update watch file and files-exclude (diff)
parent	New upstream version 1.5.0+dfsg (diff)
download	netdata-ef0c127e7f95d2db2715b9e99fe758eebc7dabd3.tar.xz netdata-ef0c127e7f95d2db2715b9e99fe758eebc7dabd3.zip