New upstream version 1.11.0+dfsg (upstream/1.11.0+dfsg)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:19:29 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:20:17 +0000
commit: a64a253794ac64cb40befee54db53bde17dd0d49 (patch)
tree: c1024acc5f6e508814b944d99f112259bb28b1be /health/health.d
parent: New upstream version 1.10.0+dfsg (diff)
download: netdata-a64a253794ac64cb40befee54db53bde17dd0d49.tar.xz
netdata-a64a253794ac64cb40befee54db53bde17dd0d49.zip
57 files changed, 428 insertions, 17 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
new file mode 100644
index 000000000..a1301ce8a
--- /dev/null
+++ b/health/health.d/adaptec_raid.conf
@@ -0,0 +1,24 @@
+
+# logical device status check
+
+template: adapter_raid_ld_status
+      on: adapter_raid.ld_status
+  lookup: max -5s
+   units: bool
+   every: 10s
+    crit: $this > 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: at least 1 logical device is failed or degraded
+      to: sysadmin
+
+# physical device state check
+
+template: adapter_raid_pd_state
+      on: adapter_raid.pd_state
+  lookup: max -5s
+   units: bool
+   every: 10s
+    crit: $this > 0
+   delay: down 5m multiplier 1.5 max 1h
+    info: at least 1 physical device is not in online state
+      to: sysadmin
diff --git a/conf.d/health.d/apache.conf b/health/health.d/apache.conf
index 0c98b8778..0c98b8778 100644
--- a/conf.d/health.d/apache.conf
+++ b/health/health.d/apache.conf
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
new file mode 100644
index 000000000..4f86037ba
--- /dev/null
+++ b/health/health.d/apcupsd.conf
@@ -0,0 +1,40 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: 10min_ups_load
+      on: apcupsd.load
+      os: *
+   hosts: *
+  lookup: average -10m unaligned of percentage
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (70) : (80))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: down 10m multiplier 1.5 max 1h
+    info: average UPS load for the last 10 minutes
+      to: sitemgr
+
+# Discussion in https://github.com/netdata/netdata/pull/3928:
+# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
+template: ups_charge
+      on: apcupsd.charge
+      os: *
+   hosts: *
+  lookup: average -60s unaligned of charge
+   units: %
+   every: 60s
+    warn: $this < 100
+    crit: $this < (($status == $CRITICAL) ? (60) : (50))
+   delay: down 10m multiplier 1.5 max 1h
+    info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors
+      to: sitemgr
+
+template: apcupsd_last_collected_secs
+      on: apcupsd.load
+    calc: $now - $last_collected_t
+   every: 10s
+   units: seconds ago
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sitemgr
diff --git a/conf.d/health.d/backend.conf b/health/health.d/backend.conf
index 7af100d8f..7af100d8f 100644
--- a/conf.d/health.d/backend.conf
+++ b/health/health.d/backend.conf
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
new file mode 100644
index 000000000..f0da9ac5e
--- /dev/null
+++ b/health/health.d/bcache.conf
@@ -0,0 +1,22 @@
+
+template: bcache_cache_errors
+      on: disk.bcache_cache_read_races
+  lookup: sum -10m unaligned absolute
+   units: errors
+   every: 1m
+    warn: $this > 0
+    crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) )
+   delay: down 1h multiplier 1.5 max 2h
+    info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing)
+      to: sysadmin
+
+template: bcache_cache_dirty
+      on: disk.bcache_cache_alloc
+    calc: $dirty + $metadata + $undefined
+   units: %
+   every: 1m
+    warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
+    crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small)
+      to: sysadmin
diff --git a/conf.d/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 30dc27328..30dc27328 100644
--- a/conf.d/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
diff --git a/conf.d/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 4145e77cd..4145e77cd 100644
--- a/conf.d/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
new file mode 100644
index 000000000..43c588db6
--- /dev/null
+++ b/health/health.d/boinc.conf
@@ -0,0 +1,62 @@
+# Alarms for various BOINC issues.
+
+# Warn on any compute errors encountered.
+template: boinc_compute_errors
+      on: boinc.states
+      os: *
+   hosts: *
+families: *
+  lookup: average -10m unaligned of comperror
+   units: tasks
+   every: 1m
+    warn: $this > 0
+    crit: $this > 1
+   delay: up 1m down 5m multiplier 1.5 max 1h
+    info: the total number of compute errors over the past 10 minutes
+      to: sysadmin
+
+# Warn on lots of upload errors
+template: boinc_upload_errors
+      on: boinc.states
+      os: *
+   hosts: *
+families: *
+  lookup: average -10m unaligned of upload_failed
+   units: tasks
+   every: 1m
+    warn: $this > 0
+    crit: $this > 1
+   delay: up 1m down 5m multiplier 1.5 max 1h
+    info: the average number of failed uploads over the past 10 minutes
+      to: sysadmin
+
+# Warn on the task queue being empty
+template: boinc_total_tasks
+      on: boinc.tasks
+      os: *
+   hosts: *
+families: *
+  lookup: average -10m unaligned of total
+   units: tasks
+   every: 1m
+    warn: $this < 1
+    crit: $this < 0.1
+   delay: up 5m down 10m multiplier 1.5 max 1h
+    info: the total number of locally available tasks
+      to: sysadmin
+
+# Warn on no active tasks with a non-empty queue
+template: boinc_active_tasks
+      on: boinc.tasks
+      os: *
+   hosts: *
+families: *
+  lookup: average -10m unaligned of active
+    calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
+   units: tasks
+   every: 1m
+    warn: $this < 1
+    crit: $this < 0.1
+   delay: up 5m down 10m multiplier 1.5 max 1h
+    info: the total number of active tasks
+      to: sysadmin
diff --git a/conf.d/health.d/btrfs.conf b/health/health.d/btrfs.conf
index b27aa544f..b27aa544f 100644
--- a/conf.d/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
diff --git a/conf.d/health.d/ceph.conf b/health/health.d/ceph.conf
index de16f7b6f..de16f7b6f 100644
--- a/conf.d/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
diff --git a/conf.d/health.d/couchdb.conf b/health/health.d/couchdb.conf
index 4a2895280..4a2895280 100644
--- a/conf.d/health.d/couchdb.conf
+++ b/health/health.d/couchdb.conf
diff --git a/conf.d/health.d/cpu.conf b/health/health.d/cpu.conf
index fa8189856..fa8189856 100644
--- a/conf.d/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
diff --git a/conf.d/health.d/disks.conf b/health/health.d/disks.conf
index 26f85848a..26f85848a 100644
--- a/conf.d/health.d/disks.conf
+++ b/health/health.d/disks.conf
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
new file mode 100644
index 000000000..729906cdb
--- /dev/null
+++ b/health/health.d/dockerd.conf
@@ -0,0 +1,8 @@
+template: docker_unhealthy_containers
+      on: docker.unhealthy_containers
+   units: unhealthy containers
+   every: 10s
+  lookup: average -10s
+    crit: $this > 0
+    info: number of unhealthy containers
+      to: sysadmin
diff --git a/conf.d/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index dffd40965..dffd40965 100644
--- a/conf.d/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
diff --git a/conf.d/health.d/entropy.conf b/health/health.d/entropy.conf
index 66d44ec13..66d44ec13 100644
--- a/conf.d/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
diff --git a/conf.d/health.d/fping.conf b/health/health.d/fping.conf
index 43658fef6..43658fef6 100644
--- a/conf.d/health.d/fping.conf
+++ b/health/health.d/fping.conf
diff --git a/conf.d/health.d/fronius.conf b/health/health.d/fronius.conf
index cdf6c8fcb..cdf6c8fcb 100644
--- a/conf.d/health.d/fronius.conf
+++ b/health/health.d/fronius.conf
diff --git a/conf.d/health.d/haproxy.conf b/health/health.d/haproxy.conf
index e49c70d48..e49c70d48 100644
--- a/conf.d/health.d/haproxy.conf
+++ b/health/health.d/haproxy.conf
diff --git a/conf.d/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index 0ddf35eab..0ddf35eab 100644
--- a/conf.d/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
diff --git a/conf.d/health.d/ipc.conf b/health/health.d/ipc.conf
index 03cf264d8..989d6e912 100644
--- a/conf.d/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -5,7 +5,7 @@
       on: system.ipc_semaphores
       os: linux
    hosts: *
-    calc: $semaphores * 100 / $ipc.semaphores.max
+    calc: $semaphores * 100 / $ipc_semaphores_max
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
@@ -18,7 +18,7 @@
       on: system.ipc_semaphore_arrays
       os: linux
    hosts: *
-    calc: $arrays * 100 / $ipc.semaphores.arrays.max
+    calc: $arrays * 100 / $ipc_semaphores_arrays_max
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
diff --git a/conf.d/health.d/ipfs.conf b/health/health.d/ipfs.conf
index 3f77572d6..3f77572d6 100644
--- a/conf.d/health.d/ipfs.conf
+++ b/health/health.d/ipfs.conf
diff --git a/conf.d/health.d/ipmi.conf b/health/health.d/ipmi.conf
index c25581964..c25581964 100644
--- a/conf.d/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
diff --git a/conf.d/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf
index 8054656ff..8054656ff 100644
--- a/conf.d/health.d/isc_dhcpd.conf
+++ b/health/health.d/isc_dhcpd.conf
diff --git a/conf.d/health.d/lighttpd.conf b/health/health.d/lighttpd.conf
index 915907a4a..915907a4a 100644
--- a/conf.d/health.d/lighttpd.conf
+++ b/health/health.d/lighttpd.conf
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
new file mode 100644
index 000000000..27a172a14
--- /dev/null
+++ b/health/health.d/linux_power_supply.conf
@@ -0,0 +1,12 @@
+# Alert on low battery capacity.
+
+template: linux_power_supply_capacity
+      on: power_supply.capacity
+    calc: $capacity
+   units: %
+   every: 10s
+    warn: $this < 10
+    crit: $this < 5
+   delay: up 0 down 5m multiplier 1.2 max 1h
+    info: the percentage remaining capacity of the power supply
+      to: sysadmin
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
new file mode 100644
index 000000000..ee0c54b8e
--- /dev/null
+++ b/health/health.d/load.conf
@@ -0,0 +1,56 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# Calculate the base trigger point for the load average alarms.
+# This is the maximum number of CPUs in the system over the past 1
+# minute, with a special case for a single CPU of setting the trigger at 2.
+   alarm: load_trigger
+      on: system.load
+      os: linux
+   hosts: *
+    calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
+   units: cpus
+   every: 1m
+    info: trigger point for load average alarms
+
+# Send alarms if the load average is unusually high.
+# These intentionally _do not_ calculate the average over the sampled
+# time period because the values being checked already are averages.
+   alarm: load_average_15
+      on: system.load
+      os: linux
+   hosts: *
+  lookup: max -1m unaligned of load15
+   units: load
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (1.75 * $load_trigger) : (2 * $load_trigger))
+    crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+   delay: down 15m multiplier 1.5 max 1h
+    info: fifteen-minute load average
+      to: sysadmin
+
+   alarm: load_average_5
+      on: system.load
+      os: linux
+   hosts: *
+  lookup: max -1m unaligned of load5
+   units: load
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (3.5 * $load_trigger) : (4 * $load_trigger))
+    crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger))
+   delay: down 15m multiplier 1.5 max 1h
+    info: five-minute load average
+      to: sysadmin
+
+   alarm: load_average_1
+      on: system.load
+      os: linux
+   hosts: *
+  lookup: max -1m unaligned of load1
+   units: load
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (7 * $load_trigger) : (8 * $load_trigger))
+    crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger))
+   delay: down 15m multiplier 1.5 max 1h
+    info: one-minute load average
+      to: sysadmin
diff --git a/conf.d/health.d/mdstat.conf b/health/health.d/mdstat.conf
index c9e7d20db..0f5f2837e 100644
--- a/conf.d/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -1,3 +1,13 @@
+template: mdstat_last_collected
+      on: md.disks
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
 template: mdstat_disks
       on: md.disks
    units: failed devices
@@ -7,12 +17,11 @@ template: mdstat_disks
     info: Array is degraded!
       to: sysadmin
 
-template: mdstat_last_collected
-      on: md.disks
-    calc: $now - $last_collected_t
-   units: seconds ago
+template: mdstat_mismatch_cnt
+      on: md.mismatch_cnt
+   units: unsynchronized blocks
+    calc: $count
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-    info: number of seconds since the last successful data collection
+    crit: $this > 0
+    info: Mismatch count!
       to: sysadmin
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
new file mode 100644
index 000000000..1881a7be1
--- /dev/null
+++ b/health/health.d/megacli.conf
@@ -0,0 +1,48 @@
+   alarm: adapter_state
+      on: megacli.adapter_degraded
+   units: is degraded
+    lookup: sum -10s
+    every: 10s
+    crit: $this > 0
+    info: adapter state
+      to: sysadmin
+
+   template: bbu_relative_charge
+      on: megacli.bbu_relative_charge
+   units: percent
+    lookup: average -10s
+    every: 10s
+    warn: $this <= (($status >= $WARNING)  ? (85) : (80))
+    crit: $this <= (($status == $CRITICAL)  ? (50) : (40))
+    info: BBU relative state of charge
+      to: sysadmin
+
+   template: bbu_cycle_count
+      on: megacli.bbu_cycle_count
+   units: cycle count
+    lookup: average -10s
+    every: 10s
+    warn: $this >= 100
+    crit: $this >= 500
+    info: BBU cycle count
+      to: sysadmin
+
+    alarm: pd_media_errors
+      on: megacli.pd_media_error
+   units: media errors
+    lookup: sum -10s
+    every: 10s
+    warn: $this > 0
+    delay: down 1m multiplier 2 max 10m
+    info: physical drive media errors
+      to: sysadmin
+
+    alarm: pd_predictive_failures
+      on: megacli.pd_predictive_failure
+   units: predictive failures
+    lookup: sum -10s
+    every: 10s
+    warn: $this > 0
+    delay: down 1m multiplier 2 max 10m
+    info: physical drive predictive failures
+      to: sysadmin
diff --git a/conf.d/health.d/memcached.conf b/health/health.d/memcached.conf
index d248ef57a..d248ef57a 100644
--- a/conf.d/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
diff --git a/conf.d/health.d/memory.conf b/health/health.d/memory.conf
index 4a0e6e522..4a0e6e522 100644
--- a/conf.d/health.d/memory.conf
+++ b/health/health.d/memory.conf
diff --git a/conf.d/health.d/mongodb.conf b/health/health.d/mongodb.conf
index a80cb3112..a80cb3112 100644
--- a/conf.d/health.d/mongodb.conf
+++ b/health/health.d/mongodb.conf
diff --git a/conf.d/health.d/mysql.conf b/health/health.d/mysql.conf
index 1eeb993f0..39c401915 100644
--- a/conf.d/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -60,6 +60,21 @@ template: mysql_10s_waited_locks_ratio
 
 
 # -----------------------------------------------------------------------------
+# connections
+
+template: mysql_connections
+      on: mysql.connections_active
+    calc: $active * 100 / $limit
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (60) : (70))
+    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+   delay: down 15m multiplier 1.5 max 1h
+    info: the ratio of current active connections vs the maximum possible number of connections
+      to: dba
+
+
+# -----------------------------------------------------------------------------
 # replication
 
 template: mysql_replication
diff --git a/conf.d/health.d/named.conf b/health/health.d/named.conf
index 4fc65c8ee..4fc65c8ee 100644
--- a/conf.d/health.d/named.conf
+++ b/health/health.d/named.conf
diff --git a/conf.d/health.d/net.conf b/health/health.d/net.conf
index 22a88927d..489016dd5 100644
--- a/conf.d/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -2,6 +2,39 @@
 # you can disable an alarm notification by setting the 'to' line to: silent
 
 # -----------------------------------------------------------------------------
+# net traffic overflow
+
+ template: 1m_received_traffic_overflow
+       on: net.net
+       os: linux
+    hosts: *
+ families: *
+   lookup: average -1m unaligned absolute of received
+     calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (90))
+    delay: down 1m multiplier 1.5 max 1h
+     info: interface received bandwidth usage over net device speed max
+       to: sysadmin
+
+ template: 1m_sent_traffic_overflow
+       on: net.net
+       os: linux
+    hosts: *
+ families: *
+   lookup: average -1m unaligned absolute of sent
+     calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (90))
+    delay: down 1m multiplier 1.5 max 1h
+     info: interface sent bandwidth usage over net device speed max
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
 # dropped packets
 
 # check if an interface is dropping packets
@@ -101,7 +134,7 @@ template: 1m_received_packets_rate
       os: linux freebsd
    hosts: *
 families: *
-  lookup: average -1m of received
+  lookup: average -1m unaligned of received
    units: packets
    every: 10s
     info: the average number of packets received during the last minute
@@ -111,7 +144,7 @@ template: 10s_received_packets_storm
       os: linux freebsd
    hosts: *
 families: *
-  lookup: average -10s of received
+  lookup: average -10s unaligned of received
     calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
    every: 10s
    units: %
diff --git a/conf.d/health.d/netfilter.conf b/health/health.d/netfilter.conf
index fa1732b33..1d07752cc 100644
--- a/conf.d/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@@ -19,7 +19,7 @@
       os: linux
    hosts: *
   lookup: max -10s unaligned of connections
-    calc: $this * 100 / $netfilter.conntrack.max
+    calc: $this * 100 / $netfilter_conntrack_max
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
diff --git a/conf.d/health.d/nginx.conf b/health/health.d/nginx.conf
index a686c3d99..a686c3d99 100644
--- a/conf.d/health.d/nginx.conf
+++ b/health/health.d/nginx.conf
diff --git a/conf.d/health.d/nginx_plus.conf b/health/health.d/nginx_plus.conf
index 5a171a76d..5a171a76d 100644
--- a/conf.d/health.d/nginx_plus.conf
+++ b/health/health.d/nginx_plus.conf
diff --git a/conf.d/health.d/portcheck.conf b/health/health.d/portcheck.conf
index f42b63d30..f42b63d30 100644
--- a/conf.d/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
diff --git a/conf.d/health.d/postgres.conf b/health/health.d/postgres.conf
index 4e0583b85..4e0583b85 100644
--- a/conf.d/health.d/postgres.conf
+++ b/health/health.d/postgres.conf
diff --git a/conf.d/health.d/qos.conf b/health/health.d/qos.conf
index 7290d15ff..7290d15ff 100644
--- a/conf.d/health.d/qos.conf
+++ b/health/health.d/qos.conf
diff --git a/conf.d/health.d/ram.conf b/health/health.d/ram.conf
index b6dc5f945..4e437322c 100644
--- a/conf.d/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -3,7 +3,7 @@
 
    alarm: used_ram_to_ignore
       on: system.ram
-      os: linux
+      os: linux freebsd
    hosts: *
     calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
    every: 10s
@@ -41,7 +41,7 @@ alarm: ram_in_use
    on: system.ram
    os: freebsd
 hosts: *
- calc: (($active + $wired) - $used_ram_to_ignore) * 100 / (($active + $wired) - $used_ram_to_ignore + $cached + $free)
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
 units: %
 every: 10s
  warn: $this > (($status >= $WARNING)  ? (80) : (90))
@@ -54,7 +54,7 @@ delay: down 15m multiplier 1.5 max 1h
     on: system.ram
     os: freebsd
  hosts: *
-  calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $buffers)
+  calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
  units: %
  every: 10s
   warn: $this < (($status >= $WARNING)  ? ( 5) : (10))
diff --git a/conf.d/health.d/redis.conf b/health/health.d/redis.conf
index c08a884a6..c08a884a6 100644
--- a/conf.d/health.d/redis.conf
+++ b/health/health.d/redis.conf
diff --git a/conf.d/health.d/retroshare.conf b/health/health.d/retroshare.conf
index 2344b60ec..2344b60ec 100644
--- a/conf.d/health.d/retroshare.conf
+++ b/health/health.d/retroshare.conf
diff --git a/conf.d/health.d/softnet.conf b/health/health.d/softnet.conf
index 77c804bfd..77c804bfd 100644
--- a/conf.d/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
diff --git a/conf.d/health.d/squid.conf b/health/health.d/squid.conf
index 06cc9678f..06cc9678f 100644
--- a/conf.d/health.d/squid.conf
+++ b/health/health.d/squid.conf
diff --git a/conf.d/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf
index e0361eb20..e0361eb20 100644
--- a/conf.d/health.d/stiebeleltron.conf
+++ b/health/health.d/stiebeleltron.conf
diff --git a/conf.d/health.d/swap.conf b/health/health.d/swap.conf
index f920b0807..f920b0807 100644
--- a/conf.d/health.d/swap.conf
+++ b/health/health.d/swap.conf
diff --git a/conf.d/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
index 7aa9a9800..7aa9a9800 100644
--- a/conf.d/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
new file mode 100644
index 000000000..552930ab7
--- /dev/null
+++ b/health/health.d/tcp_listen.conf
@@ -0,0 +1,82 @@
+#
+# There are two queues involved when incoming TCP connections are handled
+# (both at the kernel):
+#
+# SYN queue
+# The SYN queue tracks TCP handshakes until connections are fully established.
+# It overflows when too many incoming TCP connection requests hang in the
+# half-open state and the server is not configured to fall back to SYN cookies.
+# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends
+# lots of SYN packets and never completes the handshakes).
+#
+# Accept queue
+# The accept queue holds fully established TCP connections waiting to be handled
+# by the listening application. It overflows when the server application fails
+# to accept new connections at the rate they are coming in.
+#
+#
+# -----------------------------------------------------------------------------
+# tcp accept queue (at the kernel)
+
+   alarm: 1m_tcp_accept_queue_overflows
+      on: ip.tcp_accept_queue
+      os: linux
+   hosts: *
+  lookup: sum -60s unaligned absolute of ListenOverflows
+   units: overflows
+   every: 10s
+    crit: $this > 0
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+      to: sysadmin
+
+# THIS IS TOO GENERIC
+# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
+   alarm: 1m_tcp_accept_queue_drops
+      on: ip.tcp_accept_queue
+      os: linux
+   hosts: *
+  lookup: sum -60s unaligned absolute of ListenDrops
+   units: drops
+   every: 10s
+#    warn: $this > 0
+    crit: $this > (($status == $CRITICAL) ? (0) : (150))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+      to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# tcp SYN queue (at the kernel)
+
+# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or
+# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are
+# enabled or not. In both cases this probably indicates a SYN flood attack,
+# so a notification should probably be sent.
+
+   alarm: 1m_tcp_syn_queue_drops
+      on: ip.tcp_syn_queue
+      os: linux
+   hosts: *
+  lookup: sum -60s unaligned absolute of TCPReqQFullDrop
+   units: drops
+   every: 10s
+    warn: $this > 0
+    crit: $this > (($status == $CRITICAL) ? (0) : (60))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute
+      to: sysadmin
+
+   alarm: 1m_tcp_syn_queue_cookies
+      on: ip.tcp_syn_queue
+      os: linux
+   hosts: *
+  lookup: sum -60s unaligned absolute of TCPReqQFullDoCookies
+   units: cookies
+   every: 10s
+    warn: $this > 0
+    crit: $this > (($status == $CRITICAL) ? (0) : (60))
+   delay: up 0 down 5m multiplier 1.5 max 1h
+    info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute
+      to: sysadmin
+
diff --git a/conf.d/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 6927d5765..6927d5765 100644
--- a/conf.d/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
diff --git a/conf.d/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index 280d6590f..280d6590f 100644
--- a/conf.d/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
diff --git a/conf.d/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 91dad3c6a..91dad3c6a 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
diff --git a/conf.d/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 382b39658..5140228f5 100644
--- a/conf.d/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -27,7 +27,7 @@
    units: errors
    every: 10s
     warn: $this > 0
-    crit: $this > 100
+    crit: $this > (($status == $CRITICAL) ? (0) : (100))
     info: number of UDP receive buffer errors during the last minute
    delay: up 0 down 60m multiplier 1.2 max 2h
       to: sysadmin
@@ -43,7 +43,7 @@
    units: errors
    every: 10s
     warn: $this > 0
-    crit: $this > 100
+    crit: $this > (($status == $CRITICAL) ? (0) : (100))
     info: number of UDP send buffer errors during the last minute
    delay: up 0 down 60m multiplier 1.2 max 2h
       to: sysadmin
diff --git a/conf.d/health.d/varnish.conf b/health/health.d/varnish.conf
index cca7446b4..cca7446b4 100644
--- a/conf.d/health.d/varnish.conf
+++ b/health/health.d/varnish.conf
diff --git a/conf.d/health.d/web_log.conf b/health/health.d/web_log.conf
index d8be88b47..d8be88b47 100644
--- a/conf.d/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
diff --git a/conf.d/health.d/zfs.conf b/health/health.d/zfs.conf
index af73824e6..af73824e6 100644
--- a/conf.d/health.d/zfs.conf
+++ b/health/health.d/zfs.conf
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2018-11-07 12:19:29 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2018-11-07 12:20:17 +0000
commit	a64a253794ac64cb40befee54db53bde17dd0d49 (patch)
tree	c1024acc5f6e508814b944d99f112259bb28b1be /health/health.d
parent	New upstream version 1.10.0+dfsg (diff)
download	netdata-a64a253794ac64cb40befee54db53bde17dd0d49.tar.xz netdata-a64a253794ac64cb40befee54db53bde17dd0d49.zip