summary | refs | log | tree | commit | diff | stats
path: root/conf.d/health.d
diff options
context:
space:
mode:
author: Federico Ceratto <federico.ceratto@gmail.com> 2016-11-23 15:49:14 +0000
committer: Federico Ceratto <federico.ceratto@gmail.com> 2016-11-23 15:49:14 +0000
commit: 68141d9dac0c08e51d257feef16a79086dd8a2df (patch)
tree: f4a0f5d31ed2194b5991130754b297b9c8c076e6 /conf.d/health.d
parent: Release v. 1.3.0+dfsg-1 to Unstable (diff)
parent: New upstream version 1.4.0+dfsg (diff)
download: netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.tar.xz
download: netdata-68141d9dac0c08e51d257feef16a79086dd8a2df.zip
Merge tag 'upstream/1.4.0+dfsg'
Upstream version 1.4.0+dfsg
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- conf.d/health.d/apache.conf 9
-rw-r--r-- conf.d/health.d/cpu.conf 37
-rw-r--r-- conf.d/health.d/disks.conf 95
-rw-r--r-- conf.d/health.d/entropy.conf 15
-rw-r--r-- conf.d/health.d/memcached.conf 42
-rw-r--r-- conf.d/health.d/mysql.conf 13
-rw-r--r-- conf.d/health.d/named.conf 8
-rw-r--r-- conf.d/health.d/net.conf 51
-rw-r--r-- conf.d/health.d/nginx.conf 8
-rw-r--r-- conf.d/health.d/qos.conf 2
-rw-r--r-- conf.d/health.d/ram.conf 10
-rw-r--r-- conf.d/health.d/redis.conf 8
-rw-r--r-- conf.d/health.d/retroshare.conf 25
-rw-r--r-- conf.d/health.d/softnet.conf 21
-rw-r--r-- conf.d/health.d/squid.conf 8
-rw-r--r-- conf.d/health.d/swap.conf 22
-rw-r--r-- conf.d/health.d/tcp_resets.conf 32
17 files changed, 299 insertions, 107 deletions
diff --git a/conf.d/health.d/apache.conf b/conf.d/health.d/apache.conf
index 1fddbc99..0aaf0e00 100644
--- a/conf.d/health.d/apache.conf
+++ b/conf.d/health.d/apache.conf
@@ -4,10 +4,11 @@
template: apache_last_collected_secs
on: apache.requests
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
-
+ to: webmaster
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
index 9332e508..4d79fc79 100644
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -1,24 +1,33 @@
-template: 5min_cpu_pcent
+template: 10min_cpu_usage
on: system.cpu
- lookup: average -5m unaligned of user,system,nice,softirq,irq,guest,guest_nice
- every: 1m
- warn: $this > 90
+ lookup: average -10m unaligned of user,system,nice,softirq,irq,guest,guest_nice
units: %
- info: average cpu utilization for the last 5 minutes
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average cpu utilization for the last 10 minutes
+ to: sysadmin
-template: 5min_iowait_cpu_pcent
+template: 10min_cpu_iowait
on: system.cpu
- lookup: average -5m unaligned of iowait
- every: 1m
- warn: $this > 10
+ lookup: average -10m unaligned of iowait
units: %
- info: average wait I/O for the last 5 minutes
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (20) : (30))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average CPU wait I/O for the last 10 minutes
+ to: sysadmin
-template: 20min_steal_cpu_pcent
+template: 20min_steal_cpu
on: system.cpu
lookup: average -20m unaligned of steal
- every: 5m
- warn: $this > 10
units: %
- info: average stolen CPU time for the last 20 minutes
+ every: 5m
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (20) : (30))
+ delay: down 15m multiplier 1.5 max 1h
+ info: average CPU steal time for the last 20 minutes
+ to: sysadmin
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
index c38f1a0a..cc7a4766 100644
--- a/conf.d/health.d/disks.conf
+++ b/conf.d/health.d/disks.conf
@@ -1,18 +1,59 @@
# -----------------------------------------------------------------------------
+# make sure we collect values for each disk
+
+# for mount points
+template: disk_space_last_collected_secs
+ on: disk.space
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection of the mount point
+ to: sysadmin
+
+# for block devices
+template: disk_last_collected_secs
+ on: disk.io
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection of the block device
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
# low disk space
# checking the latest collected values
# raise an alarm if the disk is low on
# available disk space
-template: disk_full_percent
+template: disk_space_usage
on: disk.space
calc: $used * 100 / ($avail + $used)
- every: 1m
- warn: $this > 80
- crit: $this > 95
units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING ) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: up 1m down 15m multiplier 1.5 max 1h
info: current disk space usage
+ to: sysadmin
+
+template: disk_inode_usage
+ on: disk.inodes
+ calc: $used * 100 / ($avail + $used)
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (80))
+ crit: $this > (($status == $CRITICAL) ? (90) : (95))
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: current disk inode usage
+ to: sysadmin
# -----------------------------------------------------------------------------
@@ -20,7 +61,7 @@ template: disk_full_percent
# calculate the rate the disk fills
# use as base, the available space change
-# during the last 30 minutes
+# during the last hour
# this is just a calculation - it has no alarm
# we will use it in the next template to find
@@ -28,25 +69,27 @@ template: disk_full_percent
template: disk_fill_rate
on: disk.space
- lookup: max -1s at -30m unaligned of avail
- calc: ($this - $avail) / ($now - $after)
- every: 15s
- units: MB/s
- info: average rate the disk fills up (positive), or frees up (negative) space, for the last 30 minutes
+ lookup: min -10m at -50m unaligned of avail
+ calc: ($this - $avail) / (($now - $after) / 3600)
+ every: 1m
+ units: GB/hour
+ info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
# calculate the hours remaining
# if the disk continues to fill
# in this rate
-template: disk_full_after_hours
+template: out_of_disk_space_time
on: disk.space
- calc: $avail / $disk_fill_rate / 3600
- every: 10s
- warn: $this > 0 and $this < 48
- crit: $this > 0 and $this < 24
+ calc: $avail / $disk_fill_rate
units: hours
- info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last 30 minutes
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.2 max 1h
+ info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+ to: sysadmin
# -----------------------------------------------------------------------------
@@ -59,13 +102,15 @@ template: disk_full_after_hours
template: 10min_disk_utilization
on: disk.util
lookup: average -10m unaligned
+ units: %
every: 1m
green: 90
red: 98
- warn: $this > $green
- crit: $this > $red
- units: %
+ warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
+ crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ delay: down 15m multiplier 1.2 max 1h
info: the percentage of time the disk was busy, during the last 10 minutes
+ to: sysadmin
# raise an alarm if the disk backlog
@@ -76,10 +121,12 @@ template: 10min_disk_utilization
template: 10min_disk_backlog
on: disk.backlog
lookup: average -10m unaligned
- every: 1m
- green: 1000
- red: 2000
- warn: $this > $green
- crit: $this > $red
units: ms
+ every: 1m
+ green: 2000
+ red: 5000
+ warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1))
+ crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1))
+ delay: down 15m multiplier 1.2 max 1h
info: average of the kernel estimated disk backlog, for the last 10 minutes
+ to: sysadmin
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
index 6f8b6e85..d0eca8a6 100644
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -1,13 +1,14 @@
# check if entropy is too low
-# the alarm is checked every 1 minute
+# the alarm is checked every 5 minutes
-# and examines the last 30 minutes of data
+# and examines the last hour of data
- alarm: min_30min_entropy
+ alarm: 1hour_lowest_entropy
on: system.entropy
- lookup: min -30m unaligned
- every: 1m
- warn: $this < 200
- crit: $this < 100
+ lookup: min -1h unaligned
units: entries
- info: minimum entries in the random numbers pool (entropy), for the last 30 minutes
+ every: 5m
+ warn: $this < (($status >= $WARNING) ? (200) : (100))
+ delay: down 1h multiplier 1.5 max 1h
+ info: minimum entries in the random numbers pool in the last hour
+ to: silent
diff --git a/conf.d/health.d/memcached.conf b/conf.d/health.d/memcached.conf
index 05ff1471..46a8ca0e 100644
--- a/conf.d/health.d/memcached.conf
+++ b/conf.d/health.d/memcached.conf
@@ -4,43 +4,49 @@
template: memcached_last_collected_secs
on: memcached.cache
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: dba
# detect if memcached cache is full
-template: cache_full_pcent
+template: memcached_cache_memory_usage
on: memcached.cache
calc: $used * 100 / ($used + $available)
- every: 10s
- warn: $this > 80
- crit: $this > 90
units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: up 0 down 15m multiplier 1.5 max 1h
info: current cache memory usage
+ to: dba
# find the rate memcached cache is filling
template: cache_fill_rate
on: memcached.cache
- lookup: max -1s at -30m unaligned of available
- calc: ($this - $available) / ($now - $after)
- every: 15s
- units: KB/s
- info: average rate the cache fills up (positive), or frees up (negative) space, for the last 30 minutes
+ lookup: min -10m at -50m unaligned of available
+ calc: ($this - $available) / (($now - $after) / 3600)
+ units: KB/hour
+ every: 1m
+ info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
# find the hours remaining until memcached cache is full
-template: cache_full_after_hours
+template: out_of_cache_space_time
on: memcached.cache
- calc: $available / $cache_fill_rate / 3600
- every: 10s
- warn: $this > 0 and $this < 48
- crit: $this > 0 and $this < 24
+ calc: $available / $cache_fill_rate
units: hours
- info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last 30 minutes
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.5 max 1h
+ info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
+ to: dba
diff --git a/conf.d/health.d/mysql.conf b/conf.d/health.d/mysql.conf
new file mode 100644
index 00000000..a2cfa3ec
--- /dev/null
+++ b/conf.d/health.d/mysql.conf
@@ -0,0 +1,13 @@
+
+# make sure mysql is running
+
+template: mysql_last_collected_secs
+ on: mysql.queries
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
diff --git a/conf.d/health.d/named.conf b/conf.d/health.d/named.conf
index e46d1d33..f2eaa83c 100644
--- a/conf.d/health.d/named.conf
+++ b/conf.d/health.d/named.conf
@@ -4,9 +4,11 @@
template: named_last_collected_secs
on: named.global_queries
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: domainadmin
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index f65bc4fc..7753aa18 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -1,27 +1,48 @@
+# -----------------------------------------------------------------------------
+# make sure we collect values for each interface
+
+template: interface_last_collected_secs
+ on: net.net
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
# check if an interface is dropping packets
-# the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# the alarm is checked every 1 minute
+# and examines the last hour of data
-template: 30min_packet_drops
+template: 1hour_packet_drops
on: net.drops
- lookup: sum -30m unaligned absolute
- every: 1m
- crit: $this > 0
+ lookup: sum -1h unaligned absolute
units: packets
- info: dropped packets in the last 30 minutes
+ every: 1m
+ warn: $this > 0
+ delay: down 30m multiplier 1.5 max 1h
+ info: interface dropped packets in the last hour
+ to: sysadmin
+# -----------------------------------------------------------------------------
+
# check if an interface is having FIFO
# buffer errors
-# the alarm is checked every 10 seconds
-# and examines the last 30 minutes of data
+# the alarm is checked every 1 minute
+# and examines the last hour of data
-template: 30min_fifo_errors
+template: 1hour_fifo_errors
on: net.fifo
- lookup: sum -30m unaligned absolute
- every: 1m
- crit: $this > 0
+ lookup: sum -1h unaligned absolute
units: errors
- info: network interface fifo errors in the last 30 minutes
-
+ every: 1m
+ warn: $this > 0
+ delay: down 30m multiplier 1.5 max 1h
+ info: interface fifo errors in the last hour
+ to: sysadmin
diff --git a/conf.d/health.d/nginx.conf b/conf.d/health.d/nginx.conf
index da13008e..d70d6a59 100644
--- a/conf.d/health.d/nginx.conf
+++ b/conf.d/health.d/nginx.conf
@@ -4,9 +4,11 @@
template: nginx_last_collected_secs
on: nginx.requests
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: webmaster
diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf
index ac3bf8ff..9e5939fd 100644
--- a/conf.d/health.d/qos.conf
+++ b/conf.d/health.d/qos.conf
@@ -8,5 +8,7 @@
# lookup: sum -10m unaligned absolute
# every: 30s
# warn: $this > 0
+# delay: up 0 down 30m multiplier 1.5 max 1h
# units: packets
# info: dropped packets in the last 30 minutes
+# to: sysadmin
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index 1d368112..216b82fe 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -1,9 +1,11 @@
- alarm: used_ram_pcent
+ alarm: ram_in_use
on: system.ram
calc: $used * 100 / ($used + $cached + $free)
- every: 10s
- warn: $this > 80
- crit: $this > 90
units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
info: system RAM usage
+ to: sysadmin
diff --git a/conf.d/health.d/redis.conf b/conf.d/health.d/redis.conf
index 3750176c..3e648d85 100644
--- a/conf.d/health.d/redis.conf
+++ b/conf.d/health.d/redis.conf
@@ -4,9 +4,11 @@
template: redis_last_collected_secs
on: redis.operations
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: dba
diff --git a/conf.d/health.d/retroshare.conf b/conf.d/health.d/retroshare.conf
new file mode 100644
index 00000000..1af7b468
--- /dev/null
+++ b/conf.d/health.d/retroshare.conf
@@ -0,0 +1,25 @@
+# make sure RetroShare is running
+
+template: retroshare_last_collected_secs
+ on: retroshare.peers
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# make sure the DHT is fine when active
+
+template: retroshare_dht_working
+ on: retroshare.dht
+ calc: $dht_size_all
+ units: peers
+ every: 1m
+ warn: $this < (($status >= $WARNING) ? (120) : (100))
+ crit: $this < (($status == $CRITICAL) ? (10) : (1))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: Checks if the DHT has enough peers to operate
+ to: sysadmin
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
new file mode 100644
index 00000000..0c3709f4
--- /dev/null
+++ b/conf.d/health.d/softnet.conf
@@ -0,0 +1,21 @@
+# check for common /proc/net/softnet_stat errors
+
+ alarm: 1hour_netdev_backlog_exceeded
+ on: system.softnet_stat
+ lookup: sum -1h unaligned absolute of dropped
+ units: packets
+ every: 1m
+ warn: $this > 0
+ delay: down 30m multiplier 1.5 max 1h
+ info: number of packets dropped because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+ to: sysadmin
+
+ alarm: 1hour_netdev_budget_ran_outs
+ on: system.softnet_stat
+ lookup: sum -1h unaligned absolute of squeezed
+ units: events
+ every: 1m
+ warn: $this > 0
+ delay: down 30m multiplier 1.5 max 1h
+ info: number of times ksoftirqd ran out of sysctl net.core.netdev_budget or time slice, with work remaining (this can be a cause for dropped packets)
+ to: silent
diff --git a/conf.d/health.d/squid.conf b/conf.d/health.d/squid.conf
index cc5ce1c3..76143c5d 100644
--- a/conf.d/health.d/squid.conf
+++ b/conf.d/health.d/squid.conf
@@ -4,9 +4,11 @@
template: squid_last_collected_secs
on: squid.clients_requests
calc: $now - $last_collected_t
- every: 10s
- warn: $this > ( 5 * $update_every)
- crit: $this > (10 * $update_every)
units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
+ to: proxyadmin
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
index 552dd310..0cfa888c 100644
--- a/conf.d/health.d/swap.conf
+++ b/conf.d/health.d/swap.conf
@@ -4,17 +4,21 @@
lookup: sum -30m unaligned absolute of out
# we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024
calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
- every: 1m
- warn: $this > 1
- crit: $this > 10
units: % of RAM
- info: the sum of all memory swapped out during the last 30 minutes, as a percentage of the available RAM
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (5) : (10))
+ crit: $this > (($status == $CRITICAL) ? (15) : (20))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
+ to: sysadmin
- alarm: pcent_of_ram_in_swap
+ alarm: used_swap_space
on: system.swap
calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
- every: 10s
- warn: $this > 10
- crit: $this > 50
units: % of RAM
- info: the currently used swap space, as a percentage of the available RAM
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (15) : (20))
+ crit: $this > (($status == $CRITICAL) ? (40) : (50))
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: the swap memory used, as a percentage of the system RAM
+ to: sysadmin
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
new file mode 100644
index 00000000..8e93c479
--- /dev/null
+++ b/conf.d/health.d/tcp_resets.conf
@@ -0,0 +1,32 @@
+# -----------------------------------------------------------------------------
+
+ alarm: ipv4_tcphandshake_last_collected_secs
+ on: ipv4.tcphandshake
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (0) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? (0) : (60 * $update_every))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
+
+ alarm: 1m_ipv4_tcp_resets
+ on: ipv4.tcphandshake
+ lookup: average -1m at -10s unaligned absolute of OutRsts
+ units: tcp resets/s
+ every: 10s
+ info: average TCP RESETS this host is sending, over the last minute
+
+ alarm: 10s_ipv4_tcp_resets
+ on: ipv4.tcphandshake
+ lookup: average -10s unaligned absolute of OutRsts
+ units: tcp resets/s
+ every: 10s
+ warn: $this > ((($1m_ipv4_tcp_resets < 5)?(5):($1m_ipv4_tcp_resets)) * (($status >= $WARNING) ? (1) : (4)))
+ delay: up 0 down 60m multiplier 1.2 max 2h
+ info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed)
+ to: sysadmin
+