Merge tag 'upstream/1.4.0+dfsg'

Upstream version 1.4.0+dfsg
author: Federico Ceratto <federico.ceratto@gmail.com> 2016-11-23 15:49:14 +0000
committer: Federico Ceratto <federico.ceratto@gmail.com> 2016-11-23 15:49:14 +0000
commit: 68141d9dac0c08e51d257feef16a79086dd8a2df (patch)
tree: f4a0f5d31ed2194b5991130754b297b9c8c076e6 /conf.d/health.d
parent: Release v. 1.3.0+dfsg-1 to Unstable (diff)
parent: New upstream version 1.4.0+dfsg (diff)
download: netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.tar.xz netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.zip
17 files changed, 299 insertions, 107 deletions
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf
index 1fddbc99..0aaf0e00 100644
--- a/conf.d/health.d/apache.conf
+++ b/conf.d/health.d/apache.conf
@@ -4,10 +4,11 @@
 template: apache_last_collected_secs
       on: apache.requests
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
-
+      to: webmaster
 
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
index 9332e508..4d79fc79 100644
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -1,24 +1,33 @@
 
-template: 5min_cpu_pcent
+template: 10min_cpu_usage
       on: system.cpu
-  lookup: average -5m unaligned of user,system,nice,softirq,irq,guest,guest_nice
-   every: 1m
-    warn: $this > 90
+  lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice
    units: %
-    info: average cpu utilization for the last 5 minutes
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average cpu utilization for the last 10 minutes
+      to: sysadmin
 
-template: 5min_iowait_cpu_pcent
+template: 10min_cpu_iowait
       on: system.cpu
-  lookup: average -5m unaligned of iowait
-   every: 1m
-    warn: $this > 10
+  lookup: average -10m unaligned of iowait
    units: %
-    info: average wait I/O for the last 5 minutes
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (20) : (30))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average CPU wait I/O for the last 10 minutes
+      to: sysadmin
 
-template: 20min_steal_cpu_pcent
+template: 20min_steal_cpu
       on: system.cpu
   lookup: average -20m unaligned of steal
-   every: 5m
-    warn: $this > 10
    units: %
-    info: average stolen CPU time for the last 20 minutes
+   every: 5m
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (20) : (30))
+   delay: down 15m multiplier 1.5 max 1h
+    info: average CPU steal time for the last 20 minutes
+      to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
index c38f1a0a..cc7a4766 100644
--- a/conf.d/health.d/disks.conf
+++ b/conf.d/health.d/disks.conf
@@ -1,18 +1,59 @@
 # -----------------------------------------------------------------------------
+# make sure we collect values for each disk
+
+# for mount points
+template: disk_space_last_collected_secs
+      on: disk.space
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection of the mount point
+      to: sysadmin
+
+# for block devices
+template: disk_last_collected_secs
+      on: disk.io
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection of the block device
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
 # low disk space
 
 # checking the latest collected values
 # raise an alarm if the disk is low on
 # available disk space
 
-template: disk_full_percent
+template: disk_space_usage
       on: disk.space
     calc: $used * 100 / ($avail + $used)
-   every: 1m
-    warn: $this > 80
-    crit: $this > 95
    units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING ) ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: up 1m down 15m multiplier 1.5 max 1h
     info: current disk space usage
+      to: sysadmin
+
+template: disk_inode_usage
+      on: disk.inodes
+    calc: $used * 100 / ($avail + $used)
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (75) : (80))
+    crit: $this > (($status == $CRITICAL) ? (90) : (95))
+   delay: up 1m down 15m multiplier 1.5 max 1h
+    info: current disk inode usage
+      to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -20,7 +61,7 @@ template: disk_full_percent
 
 # calculate the rate the disk fills
 # use as base, the available space change
-# during the last 30 minutes
+# during the last hour
 
 # this is just a calculation - it has no alarm
 # we will use it in the next template to find
@@ -28,25 +69,27 @@ template: disk_full_percent
 
 template: disk_fill_rate
       on: disk.space
-  lookup: max -1s at -30m unaligned of avail
-    calc: ($this - $avail) / ($now - $after)
-   every: 15s
-   units: MB/s
-    info: average rate the disk fills up (positive), or frees up (negative) space, for the last 30 minutes
+  lookup: min -10m at -50m unaligned of avail
+    calc: ($this - $avail) / (($now - $after) / 3600)
+   every: 1m
+   units: GB/hour
+    info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
 
 
 # calculate the hours remaining
 # if the disk continues to fill
 # in this rate
 
-template: disk_full_after_hours
+template: out_of_disk_space_time
       on: disk.space
-    calc: $avail / $disk_fill_rate / 3600
-   every: 10s
-    warn: $this > 0 and $this < 48
-    crit: $this > 0 and $this < 24
+    calc: $avail / $disk_fill_rate
    units: hours
-    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last 30 minutes
+   every: 10s
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.2 max 1h
+    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+      to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -59,13 +102,15 @@ template: disk_full_after_hours
 template: 10min_disk_utilization
       on: disk.util
   lookup: average -10m unaligned
+   units: %
    every: 1m
    green: 90
      red: 98
-    warn: $this > $green
-    crit: $this > $red
-   units: %
+    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+   delay: down 15m multiplier 1.2 max 1h
     info: the percentage of time the disk was busy, during the last 10 minutes
+      to: sysadmin
 
 
 # raise an alarm if the disk backlog
@@ -76,10 +121,12 @@ template: 10min_disk_utilization
 template: 10min_disk_backlog
       on: disk.backlog
   lookup: average -10m unaligned
-   every: 1m
-   green: 1000
-     red: 2000
-    warn: $this > $green
-    crit: $this > $red
    units: ms
+   every: 1m
+   green: 2000
+     red: 5000
+    warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
+    crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
+   delay: down 15m multiplier 1.2 max 1h
     info: average of the kernel estimated disk backlog, for the last 10 minutes
+      to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
index 6f8b6e85..d0eca8a6 100644
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -1,13 +1,14 @@
 
 # check if entropy is too low
 # the alarm is checked every 1 minute
-# and examines the last 30 minutes of data
+# and examines the last hour of data
 
-   alarm: min_30min_entropy
+   alarm: 1hour_lowest_entropy
       on: system.entropy
-  lookup: min -30m unaligned
-   every: 1m
-    warn: $this < 200
-    crit: $this < 100
+  lookup: min -1h unaligned
    units: entries
-    info: minimum entries in the random numbers pool (entropy), for the last 30 minutes
+   every: 5m
+    warn: $this < (($status >= $WARNING) ? (200) : (100))
+   delay: down 1h multiplier 1.5 max 1h
+    info: minimum entries in the random numbers pool in the last hour
+      to: silent
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf
index 05ff1471..46a8ca0e 100644
--- a/conf.d/health.d/memcached.conf
+++ b/conf.d/health.d/memcached.conf
@@ -4,43 +4,49 @@
 template: memcached_last_collected_secs
       on: memcached.cache
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: dba
 
 
 # detect if memcached cache is full
 
-template: cache_full_pcent
+template: memcached_cache_memory_usage
       on: memcached.cache
     calc: $used * 100 / ($used + $available)
-   every: 10s
-    warn: $this > 80
-    crit: $this > 90
    units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: up 0 down 15m multiplier 1.5 max 1h
     info: current cache memory usage
+      to: dba
 
 
 # find the rate memcached cache is filling
 
 template: cache_fill_rate
       on: memcached.cache
-  lookup: max -1s at -30m unaligned of available
-    calc: ($this - $available) / ($now - $after)
-   every: 15s
-   units: KB/s
-    info: average rate the cache fills up (positive), or frees up (negative) space, for the last 30 minutes
+  lookup: min -10m at -50m unaligned of available
+    calc: ($this - $available) / (($now - $after) / 3600)
+   units: KB/hour
+   every: 1m
+    info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
 
 
 # find the hours remaining until memcached cache is full
 
-template: cache_full_after_hours
+template: out_of_cache_space_time
       on: memcached.cache
-    calc: $available / $cache_fill_rate / 3600
-   every: 10s
-    warn: $this > 0 and $this < 48
-    crit: $this > 0 and $this < 24
+    calc: $available / $cache_fill_rate
    units: hours
-    info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last 30 minutes
+   every: 10s
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.5 max 1h
+    info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
+      to: dba
diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf
new file mode 100644
index 00000000..a2cfa3ec
--- /dev/null
+++ b/conf.d/health.d/mysql.conf
@@ -0,0 +1,13 @@
+
+# make sure mysql is running
+
+template: mysql_last_collected_secs
+      on: mysql.queries
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf
index e46d1d33..f2eaa83c 100644
--- a/conf.d/health.d/named.conf
+++ b/conf.d/health.d/named.conf
@@ -4,9 +4,11 @@
 template: named_last_collected_secs
       on: named.global_queries
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: domainadmin
 
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index f65bc4fc..7753aa18 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -1,27 +1,48 @@
+# -----------------------------------------------------------------------------
+# make sure we collect values for each interface
+
+template: interface_last_collected_secs
+      on: net.net
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
 
 # check if an interface is dropping packets
-# the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# the alarm is checked every 1 minute
+# and examines the last hour of data
 
-template: 30min_packet_drops
+template: 1hour_packet_drops
       on: net.drops
-  lookup: sum -30m unaligned absolute
-   every: 1m
-    crit: $this > 0
+  lookup: sum -1h unaligned absolute
    units: packets
-    info: dropped packets in the last 30 minutes
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: interface dropped packets in the last hour
+      to: sysadmin
 
 
+# -----------------------------------------------------------------------------
+
 # check if an interface is having FIFO
 # buffer errors
-# the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# the alarm is checked every 1 minute
+# and examines the last hour of data
 
-template: 30min_fifo_errors
+template: 1hour_fifo_errors
       on: net.fifo
-  lookup: sum -30m unaligned absolute
-   every: 1m
-    crit: $this > 0
+  lookup: sum -1h unaligned absolute
    units: errors
-    info: network interface fifo errors in the last 30 minutes
-
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: interface fifo errors in the last hour
+      to: sysadmin
diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf
index da13008e..d70d6a59 100644
--- a/conf.d/health.d/nginx.conf
+++ b/conf.d/health.d/nginx.conf
@@ -4,9 +4,11 @@
 template: nginx_last_collected_secs
       on: nginx.requests
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: webmaster
 
diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf
index ac3bf8ff..9e5939fd 100644
--- a/conf.d/health.d/qos.conf
+++ b/conf.d/health.d/qos.conf
@@ -8,5 +8,7 @@
 #  lookup: sum -10m unaligned absolute
 #   every: 30s
 #    warn: $this > 0
+#   delay: up 0 down 30m multiplier 1.5 max 1h
 #   units: packets
 #    info: dropped packets in the last 30 minutes
+#      to: sysadmin
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index 1d368112..216b82fe 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -1,9 +1,11 @@
 
-   alarm: used_ram_pcent
+   alarm: ram_in_use
       on: system.ram
     calc: $used * 100 / ($used + $cached + $free)
-   every: 10s
-    warn: $this > 80
-    crit: $this > 90
    units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 15m multiplier 1.5 max 1h
     info: system RAM usage
+      to: sysadmin
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
index 3750176c..3e648d85 100644
--- a/conf.d/health.d/redis.conf
+++ b/conf.d/health.d/redis.conf
@@ -4,9 +4,11 @@
 template: redis_last_collected_secs
       on: redis.operations
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: dba
 
diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf
new file mode 100644
index 00000000..1af7b468
--- /dev/null
+++ b/conf.d/health.d/retroshare.conf
@@ -0,0 +1,25 @@
+# make sure RetroShare is running
+
+template: retroshare_last_collected_secs
+      on: retroshare.peers
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# make sure the DHT is fine when active
+
+template: retroshare_dht_working
+      on: retroshare.dht
+    calc: $dht_size_all
+   units: peers
+   every: 1m
+    warn: $this < (($status >= $WARNING)  ? (120) : (100))
+    crit: $this < (($status == $CRITICAL) ? (10)  : (1))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: Checks if the DHT has enough peers to operate
+      to: sysadmin
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
new file mode 100644
index 00000000..0c3709f4
--- /dev/null
+++ b/conf.d/health.d/softnet.conf
@@ -0,0 +1,21 @@
+# check for common /proc/net/softnet_stat errors
+
+   alarm: 1hour_netdev_backlog_exceeded
+      on: system.softnet_stat
+  lookup: sum -1h unaligned absolute of dropped
+   units: packets
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+      to: sysadmin
+
+   alarm: 1hour_netdev_budget_ran_outs
+      on: system.softnet_stat
+  lookup: sum -1h unaligned absolute of squeezed
+   units: events
+   every: 1m
+    warn: $this > 0
+   delay: down 30m multiplier 1.5 max 1h
+    info: number of times ksoftirq ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
+      to: silent
diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf
index cc5ce1c3..76143c5d 100644
--- a/conf.d/health.d/squid.conf
+++ b/conf.d/health.d/squid.conf
@@ -4,9 +4,11 @@
 template: squid_last_collected_secs
       on: squid.clients_requests
     calc: $now - $last_collected_t
-   every: 10s
-    warn: $this > ( 5 * $update_every)
-    crit: $this > (10 * $update_every)
    units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
+      to: proxyadmin
 
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
index 552dd310..0cfa888c 100644
--- a/conf.d/health.d/swap.conf
+++ b/conf.d/health.d/swap.conf
@@ -4,17 +4,21 @@
   lookup: sum -30m unaligned absolute of out
           # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024
     calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
-   every: 1m
-    warn: $this > 1
-    crit: $this > 10
    units: % of RAM
-    info: the sum of all memory swapped out during the last 30 minutes, as a percentage of the available RAM
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+    crit: $this > (($status == $CRITICAL) ? (15) : (20))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: the amount of memory swapped out in the last 30 minutes, as a percentage of the system RAM
+      to: sysadmin
 
-   alarm: pcent_of_ram_in_swap
+   alarm: used_swap_space
       on: system.swap
     calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
-   every: 10s
-    warn: $this > 10
-    crit: $this > 50
    units: % of RAM
-    info: the currently used swap space, as a percentage of the available RAM
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (15) : (20))
+    crit: $this > (($status == $CRITICAL) ? (40) : (50))
+   delay: up 0 down 15m multiplier 1.5 max 1h
+    info: the swap memory used, as a percentage of the system RAM
+      to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
new file mode 100644
index 00000000..8e93c479
--- /dev/null
+++ b/conf.d/health.d/tcp_resets.conf
@@ -0,0 +1,32 @@
+# -----------------------------------------------------------------------------
+
+   alarm: ipv4_tcphandshake_last_collected_secs
+      on: ipv4.tcphandshake
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (0) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+# -----------------------------------------------------------------------------
+
+   alarm: 1m_ipv4_tcp_resets
+      on: ipv4.tcphandshake
+  lookup: average -1m at -10s unaligned absolute of OutRsts
+   units: tcp resets/s
+   every: 10s
+    info: average TCP RESETS this host is sending, over the last minute
+
+   alarm: 10s_ipv4_tcp_resets
+      on: ipv4.tcphandshake
+  lookup: average -10s unaligned absolute of OutRsts
+   units: tcp resets/s
+   every: 10s
+    warn: $this > ((($1m_ipv4_tcp_resets < 5)?(5):($1m_ipv4_tcp_resets)) * (($status >= $WARNING)  ? (1) : (4)))
+   delay: up 0 down 60m multiplier 1.2 max 2h
+    info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed)
+      to: sysadmin
+
author	Federico Ceratto <federico.ceratto@gmail.com>	2016-11-23 15:49:14 +0000
committer	Federico Ceratto <federico.ceratto@gmail.com>	2016-11-23 15:49:14 +0000
commit	68141d9dac0c08e51d257feef16a79086dd8a2df (patch)
tree	f4a0f5d31ed2194b5991130754b297b9c8c076e6 /conf.d/health.d
parent	Release v. 1.3.0+dfsg-1 to Unstable (diff)
parent	New upstream version 1.4.0+dfsg (diff)
download	netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.tar.xz netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.zip