summary refs log tree commit diff stats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--health/health.d/adaptec_raid.conf24
-rw-r--r--health/health.d/apache.conf (renamed from conf.d/health.d/apache.conf)0
-rw-r--r--health/health.d/apcupsd.conf40
-rw-r--r--health/health.d/backend.conf (renamed from conf.d/health.d/backend.conf)0
-rw-r--r--health/health.d/bcache.conf22
-rw-r--r--health/health.d/beanstalkd.conf (renamed from conf.d/health.d/beanstalkd.conf)0
-rw-r--r--health/health.d/bind_rndc.conf (renamed from conf.d/health.d/bind_rndc.conf)0
-rw-r--r--health/health.d/boinc.conf62
-rw-r--r--health/health.d/btrfs.conf (renamed from conf.d/health.d/btrfs.conf)0
-rw-r--r--health/health.d/ceph.conf (renamed from conf.d/health.d/ceph.conf)0
-rw-r--r--health/health.d/couchdb.conf (renamed from conf.d/health.d/couchdb.conf)0
-rw-r--r--health/health.d/cpu.conf (renamed from conf.d/health.d/cpu.conf)0
-rw-r--r--health/health.d/disks.conf (renamed from conf.d/health.d/disks.conf)0
-rw-r--r--health/health.d/dockerd.conf8
-rw-r--r--health/health.d/elasticsearch.conf (renamed from conf.d/health.d/elasticsearch.conf)0
-rw-r--r--health/health.d/entropy.conf (renamed from conf.d/health.d/entropy.conf)0
-rw-r--r--health/health.d/fping.conf (renamed from conf.d/health.d/fping.conf)0
-rw-r--r--health/health.d/fronius.conf (renamed from conf.d/health.d/fronius.conf)0
-rw-r--r--health/health.d/haproxy.conf (renamed from conf.d/health.d/haproxy.conf)0
-rw-r--r--health/health.d/httpcheck.conf (renamed from conf.d/health.d/httpcheck.conf)0
-rw-r--r--health/health.d/ipc.conf (renamed from conf.d/health.d/ipc.conf)4
-rw-r--r--health/health.d/ipfs.conf (renamed from conf.d/health.d/ipfs.conf)0
-rw-r--r--health/health.d/ipmi.conf (renamed from conf.d/health.d/ipmi.conf)0
-rw-r--r--health/health.d/isc_dhcpd.conf (renamed from conf.d/health.d/isc_dhcpd.conf)0
-rw-r--r--health/health.d/lighttpd.conf (renamed from conf.d/health.d/lighttpd.conf)0
-rw-r--r--health/health.d/linux_power_supply.conf12
-rw-r--r--health/health.d/load.conf56
-rw-r--r--health/health.d/mdstat.conf (renamed from conf.d/health.d/mdstat.conf)23
-rw-r--r--health/health.d/megacli.conf48
-rw-r--r--health/health.d/memcached.conf (renamed from conf.d/health.d/memcached.conf)0
-rw-r--r--health/health.d/memory.conf (renamed from conf.d/health.d/memory.conf)0
-rw-r--r--health/health.d/mongodb.conf (renamed from conf.d/health.d/mongodb.conf)0
-rw-r--r--health/health.d/mysql.conf (renamed from conf.d/health.d/mysql.conf)15
-rw-r--r--health/health.d/named.conf (renamed from conf.d/health.d/named.conf)0
-rw-r--r--health/health.d/net.conf (renamed from conf.d/health.d/net.conf)37
-rw-r--r--health/health.d/netfilter.conf (renamed from conf.d/health.d/netfilter.conf)2
-rw-r--r--health/health.d/nginx.conf (renamed from conf.d/health.d/nginx.conf)0
-rw-r--r--health/health.d/nginx_plus.conf (renamed from conf.d/health.d/nginx_plus.conf)0
-rw-r--r--health/health.d/portcheck.conf (renamed from conf.d/health.d/portcheck.conf)0
-rw-r--r--health/health.d/postgres.conf (renamed from conf.d/health.d/postgres.conf)0
-rw-r--r--health/health.d/qos.conf (renamed from conf.d/health.d/qos.conf)0
-rw-r--r--health/health.d/ram.conf (renamed from conf.d/health.d/ram.conf)6
-rw-r--r--health/health.d/redis.conf (renamed from conf.d/health.d/redis.conf)0
-rw-r--r--health/health.d/retroshare.conf (renamed from conf.d/health.d/retroshare.conf)0
-rw-r--r--health/health.d/softnet.conf (renamed from conf.d/health.d/softnet.conf)0
-rw-r--r--health/health.d/squid.conf (renamed from conf.d/health.d/squid.conf)0
-rw-r--r--health/health.d/stiebeleltron.conf (renamed from conf.d/health.d/stiebeleltron.conf)0
-rw-r--r--health/health.d/swap.conf (renamed from conf.d/health.d/swap.conf)0
-rw-r--r--health/health.d/tcp_conn.conf (renamed from conf.d/health.d/tcp_conn.conf)0
-rw-r--r--health/health.d/tcp_listen.conf82
-rw-r--r--health/health.d/tcp_mem.conf (renamed from conf.d/health.d/tcp_mem.conf)0
-rw-r--r--health/health.d/tcp_orphans.conf (renamed from conf.d/health.d/tcp_orphans.conf)0
-rw-r--r--health/health.d/tcp_resets.conf (renamed from conf.d/health.d/tcp_resets.conf)0
-rw-r--r--health/health.d/udp_errors.conf (renamed from conf.d/health.d/udp_errors.conf)4
-rw-r--r--health/health.d/varnish.conf (renamed from conf.d/health.d/varnish.conf)0
-rw-r--r--health/health.d/web_log.conf (renamed from conf.d/health.d/web_log.conf)0
-rw-r--r--health/health.d/zfs.conf (renamed from conf.d/health.d/zfs.conf)0
57 files changed, 428 insertions, 17 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
new file mode 100644
index 000000000..a1301ce8a
--- /dev/null
+++ b/health/health.d/adaptec_raid.conf
@@ -0,0 +1,24 @@
+
+# logical device status check
+
+template: adapter_raid_ld_status
+ on: adapter_raid.ld_status
+ lookup: max -5s
+ units: bool
+ every: 10s
+ crit: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: at least 1 logical device is failed or degraded
+ to: sysadmin
+
+# physical device state check
+
+template: adapter_raid_pd_state
+ on: adapter_raid.pd_state
+ lookup: max -5s
+ units: bool
+ every: 10s
+ crit: $this > 0
+ delay: down 5m multiplier 1.5 max 1h
+ info: at least 1 physical device is not in online state
+ to: sysadmin
diff --git a/conf.d/health.d/apache.conf b/health/health.d/apache.conf
index 0c98b8778..0c98b8778 100644
--- a/conf.d/health.d/apache.conf
+++ b/health/health.d/apache.conf
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
new file mode 100644
index 000000000..4f86037ba
--- /dev/null
+++ b/health/health.d/apcupsd.conf
@@ -0,0 +1,40 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+template: 10min_ups_load
+ on: apcupsd.load
+ os: *
+ hosts: *
+ lookup: average -10m unaligned of percentage
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 10m multiplier 1.5 max 1h
+ info: average UPS load for the last 10 minutes
+ to: sitemgr
+
+# Discussion in https://github.com/netdata/netdata/pull/3928:
+# Fire the alarm as soon as the UPS goes on battery (charge drops below 100%) and clear it only when fully charged.
+template: ups_charge
+ on: apcupsd.charge
+ os: *
+ hosts: *
+ lookup: average -60s unaligned of charge
+ units: %
+ every: 60s
+ warn: $this < 100
+ crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ delay: down 10m multiplier 1.5 max 1h
+ info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors
+ to: sitemgr
+
+template: apcupsd_last_collected_secs
+ on: apcupsd.load
+ calc: $now - $last_collected_t
+ every: 10s
+ units: seconds ago
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sitemgr
diff --git a/conf.d/health.d/backend.conf b/health/health.d/backend.conf
index 7af100d8f..7af100d8f 100644
--- a/conf.d/health.d/backend.conf
+++ b/health/health.d/backend.conf
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
new file mode 100644
index 000000000..f0da9ac5e
--- /dev/null
+++ b/health/health.d/bcache.conf
@@ -0,0 +1,22 @@
+
+template: bcache_cache_errors
+ on: disk.bcache_cache_read_races
+ lookup: sum -10m unaligned absolute
+ units: errors
+ every: 1m
+ warn: $this > 0
+ crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) )
+ delay: down 1h multiplier 1.5 max 2h
+ info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing)
+ to: sysadmin
+
+template: bcache_cache_dirty
+ on: disk.bcache_cache_alloc
+ calc: $dirty + $metadata + $undefined
+ units: %
+ every: 1m
+ warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
+ crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+ delay: up 1m down 1h multiplier 1.5 max 2h
+ info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small)
+ to: sysadmin
diff --git a/conf.d/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 30dc27328..30dc27328 100644
--- a/conf.d/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
diff --git a/conf.d/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 4145e77cd..4145e77cd 100644
--- a/conf.d/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
new file mode 100644
index 000000000..43c588db6
--- /dev/null
+++ b/health/health.d/boinc.conf
@@ -0,0 +1,62 @@
+# Alarms for various BOINC issues.
+
+# Warn on any compute errors encountered.
+template: boinc_compute_errors
+ on: boinc.states
+ os: *
+ hosts: *
+families: *
+ lookup: average -10m unaligned of comperror
+ units: tasks
+ every: 1m
+ warn: $this > 0
+ crit: $this > 1
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ info: the total number of compute errors over the past 10 minutes
+ to: sysadmin
+
+# Warn on lots of upload errors
+template: boinc_upload_errors
+ on: boinc.states
+ os: *
+ hosts: *
+families: *
+ lookup: average -10m unaligned of upload_failed
+ units: tasks
+ every: 1m
+ warn: $this > 0
+ crit: $this > 1
+ delay: up 1m down 5m multiplier 1.5 max 1h
+ info: the average number of failed uploads over the past 10 minutes
+ to: sysadmin
+
+# Warn on the task queue being empty
+template: boinc_total_tasks
+ on: boinc.tasks
+ os: *
+ hosts: *
+families: *
+ lookup: average -10m unaligned of total
+ units: tasks
+ every: 1m
+ warn: $this < 1
+ crit: $this < 0.1
+ delay: up 5m down 10m multiplier 1.5 max 1h
+ info: the total number of locally available tasks
+ to: sysadmin
+
+# Warn on no active tasks with a non-empty queue
+template: boinc_active_tasks
+ on: boinc.tasks
+ os: *
+ hosts: *
+families: *
+ lookup: average -10m unaligned of active
+ calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
+ units: tasks
+ every: 1m
+ warn: $this < 1
+ crit: $this < 0.1
+ delay: up 5m down 10m multiplier 1.5 max 1h
+ info: the total number of active tasks
+ to: sysadmin
diff --git a/conf.d/health.d/btrfs.conf b/health/health.d/btrfs.conf
index b27aa544f..b27aa544f 100644
--- a/conf.d/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
diff --git a/conf.d/health.d/ceph.conf b/health/health.d/ceph.conf
index de16f7b6f..de16f7b6f 100644
--- a/conf.d/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
diff --git a/conf.d/health.d/couchdb.conf b/health/health.d/couchdb.conf
index 4a2895280..4a2895280 100644
--- a/conf.d/health.d/couchdb.conf
+++ b/health/health.d/couchdb.conf
diff --git a/conf.d/health.d/cpu.conf b/health/health.d/cpu.conf
index fa8189856..fa8189856 100644
--- a/conf.d/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
diff --git a/conf.d/health.d/disks.conf b/health/health.d/disks.conf
index 26f85848a..26f85848a 100644
--- a/conf.d/health.d/disks.conf
+++ b/health/health.d/disks.conf
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
new file mode 100644
index 000000000..729906cdb
--- /dev/null
+++ b/health/health.d/dockerd.conf
@@ -0,0 +1,8 @@
+template: docker_unhealthy_containers
+ on: docker.unhealthy_containers
+ units: unhealthy containers
+ every: 10s
+ lookup: average -10s
+ crit: $this > 0
+ info: number of unhealthy containers
+ to: sysadmin
diff --git a/conf.d/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index dffd40965..dffd40965 100644
--- a/conf.d/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
diff --git a/conf.d/health.d/entropy.conf b/health/health.d/entropy.conf
index 66d44ec13..66d44ec13 100644
--- a/conf.d/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
diff --git a/conf.d/health.d/fping.conf b/health/health.d/fping.conf
index 43658fef6..43658fef6 100644
--- a/conf.d/health.d/fping.conf
+++ b/health/health.d/fping.conf
diff --git a/conf.d/health.d/fronius.conf b/health/health.d/fronius.conf
index cdf6c8fcb..cdf6c8fcb 100644
--- a/conf.d/health.d/fronius.conf
+++ b/health/health.d/fronius.conf
diff --git a/conf.d/health.d/haproxy.conf b/health/health.d/haproxy.conf
index e49c70d48..e49c70d48 100644
--- a/conf.d/health.d/haproxy.conf
+++ b/health/health.d/haproxy.conf
diff --git a/conf.d/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index 0ddf35eab..0ddf35eab 100644
--- a/conf.d/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
diff --git a/conf.d/health.d/ipc.conf b/health/health.d/ipc.conf
index 03cf264d8..989d6e912 100644
--- a/conf.d/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -5,7 +5,7 @@
on: system.ipc_semaphores
os: linux
hosts: *
- calc: $semaphores * 100 / $ipc.semaphores.max
+ calc: $semaphores * 100 / $ipc_semaphores_max
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
@@ -18,7 +18,7 @@
on: system.ipc_semaphore_arrays
os: linux
hosts: *
- calc: $arrays * 100 / $ipc.semaphores.arrays.max
+ calc: $arrays * 100 / $ipc_semaphores_arrays_max
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
diff --git a/conf.d/health.d/ipfs.conf b/health/health.d/ipfs.conf
index 3f77572d6..3f77572d6 100644
--- a/conf.d/health.d/ipfs.conf
+++ b/health/health.d/ipfs.conf
diff --git a/conf.d/health.d/ipmi.conf b/health/health.d/ipmi.conf
index c25581964..c25581964 100644
--- a/conf.d/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
diff --git a/conf.d/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf
index 8054656ff..8054656ff 100644
--- a/conf.d/health.d/isc_dhcpd.conf
+++ b/health/health.d/isc_dhcpd.conf
diff --git a/conf.d/health.d/lighttpd.conf b/health/health.d/lighttpd.conf
index 915907a4a..915907a4a 100644
--- a/conf.d/health.d/lighttpd.conf
+++ b/health/health.d/lighttpd.conf
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
new file mode 100644
index 000000000..27a172a14
--- /dev/null
+++ b/health/health.d/linux_power_supply.conf
@@ -0,0 +1,12 @@
+# Alert on low battery capacity.
+
+template: linux_power_supply_capacity
+ on: power_supply.capacity
+ calc: $capacity
+ units: %
+ every: 10s
+ warn: $this < 10
+ crit: $this < 5
+ delay: up 0 down 5m multiplier 1.2 max 1h
+ info: the percentage remaining capacity of the power supply
+ to: sysadmin
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
new file mode 100644
index 000000000..ee0c54b8e
--- /dev/null
+++ b/health/health.d/load.conf
@@ -0,0 +1,56 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# Calculate the base trigger point for the load average alarms.
+# This is the maximum number of CPUs in the system over the past 1
+# minute, with a special case for a single CPU, where the trigger is set to 2.
+ alarm: load_trigger
+ on: system.load
+ os: linux
+ hosts: *
+ calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
+ units: cpus
+ every: 1m
+ info: trigger point for load average alarms
+
+# Send alarms if the load average is unusually high.
+# These intentionally _do not_ calculate the average over the sampled
+# time period because the values being checked already are averages.
+ alarm: load_average_15
+ on: system.load
+ os: linux
+ hosts: *
+ lookup: max -1m unaligned of load15
+ units: load
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (1.75 * $load_trigger) : (2 * $load_trigger))
+ crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+ delay: down 15m multiplier 1.5 max 1h
+ info: fifteen-minute load average
+ to: sysadmin
+
+ alarm: load_average_5
+ on: system.load
+ os: linux
+ hosts: *
+ lookup: max -1m unaligned of load5
+ units: load
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+ crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger))
+ delay: down 15m multiplier 1.5 max 1h
+ info: five-minute load average
+ to: sysadmin
+
+ alarm: load_average_1
+ on: system.load
+ os: linux
+ hosts: *
+ lookup: max -1m unaligned of load1
+ units: load
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (7 * $load_trigger) : (8 * $load_trigger))
+ crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger))
+ delay: down 15m multiplier 1.5 max 1h
+ info: one-minute load average
+ to: sysadmin
diff --git a/conf.d/health.d/mdstat.conf b/health/health.d/mdstat.conf
index c9e7d20db..0f5f2837e 100644
--- a/conf.d/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -1,3 +1,13 @@
+template: mdstat_last_collected
+ on: md.disks
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
template: mdstat_disks
on: md.disks
units: failed devices
@@ -7,12 +17,11 @@ template: mdstat_disks
info: Array is degraded!
to: sysadmin
-template: mdstat_last_collected
- on: md.disks
- calc: $now - $last_collected_t
- units: seconds ago
+template: mdstat_mismatch_cnt
+ on: md.mismatch_cnt
+ units: unsynchronized blocks
+ calc: $count
every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
+ crit: $this > 0
+ info: Mismatch count!
to: sysadmin
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
new file mode 100644
index 000000000..1881a7be1
--- /dev/null
+++ b/health/health.d/megacli.conf
@@ -0,0 +1,48 @@
+ alarm: adapter_state
+ on: megacli.adapter_degraded
+ units: is degraded
+ lookup: sum -10s
+ every: 10s
+ crit: $this > 0
+ info: adapter state
+ to: sysadmin
+
+ template: bbu_relative_charge
+ on: megacli.bbu_relative_charge
+ units: percent
+ lookup: average -10s
+ every: 10s
+ warn: $this <= (($status >= $WARNING) ? (85) : (80))
+ crit: $this <= (($status == $CRITICAL) ? (50) : (40))
+ info: BBU relative state of charge
+ to: sysadmin
+
+ template: bbu_cycle_count
+ on: megacli.bbu_cycle_count
+ units: cycle count
+ lookup: average -10s
+ every: 10s
+ warn: $this >= 100
+ crit: $this >= 500
+ info: BBU cycle count
+ to: sysadmin
+
+ alarm: pd_media_errors
+ on: megacli.pd_media_error
+ units: media errors
+ lookup: sum -10s
+ every: 10s
+ warn: $this > 0
+ delay: down 1m multiplier 2 max 10m
+ info: physical drive media errors
+ to: sysadmin
+
+ alarm: pd_predictive_failures
+ on: megacli.pd_predictive_failure
+ units: predictive failures
+ lookup: sum -10s
+ every: 10s
+ warn: $this > 0
+ delay: down 1m multiplier 2 max 10m
+ info: physical drive predictive failures
+ to: sysadmin
diff --git a/conf.d/health.d/memcached.conf b/health/health.d/memcached.conf
index d248ef57a..d248ef57a 100644
--- a/conf.d/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
diff --git a/conf.d/health.d/memory.conf b/health/health.d/memory.conf
index 4a0e6e522..4a0e6e522 100644
--- a/conf.d/health.d/memory.conf
+++ b/health/health.d/memory.conf
diff --git a/conf.d/health.d/mongodb.conf b/health/health.d/mongodb.conf
index a80cb3112..a80cb3112 100644
--- a/conf.d/health.d/mongodb.conf
+++ b/health/health.d/mongodb.conf
diff --git a/conf.d/health.d/mysql.conf b/health/health.d/mysql.conf
index 1eeb993f0..39c401915 100644
--- a/conf.d/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -60,6 +60,21 @@ template: mysql_10s_waited_locks_ratio
# -----------------------------------------------------------------------------
+# connections
+
+template: mysql_connections
+ on: mysql.connections_active
+ calc: $active * 100 / $limit
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (60) : (70))
+ crit: $this > (($status == $CRITICAL) ? (80) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ info: the ratio of current active connections vs the maximum possible number of connections
+ to: dba
+
+
+# -----------------------------------------------------------------------------
# replication
template: mysql_replication
diff --git a/conf.d/health.d/named.conf b/health/health.d/named.conf
index 4fc65c8ee..4fc65c8ee 100644
--- a/conf.d/health.d/named.conf
+++ b/health/health.d/named.conf
diff --git a/conf.d/health.d/net.conf b/health/health.d/net.conf
index 22a88927d..489016dd5 100644
--- a/conf.d/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -2,6 +2,39 @@
# you can disable an alarm notification by setting the 'to' line to: silent
# -----------------------------------------------------------------------------
+# net traffic overflow
+
+ template: 1m_received_traffic_overflow
+ on: net.net
+ os: linux
+ hosts: *
+ families: *
+ lookup: average -1m unaligned absolute of received
+ calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
+ delay: down 1m multiplier 1.5 max 1h
+ info: interface received bandwidth usage over net device speed max
+ to: sysadmin
+
+ template: 1m_sent_traffic_overflow
+ on: net.net
+ os: linux
+ hosts: *
+ families: *
+ lookup: average -1m unaligned absolute of sent
+ calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan )
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
+ delay: down 1m multiplier 1.5 max 1h
+ info: interface sent bandwidth usage over net device speed max
+ to: sysadmin
+
+# -----------------------------------------------------------------------------
# dropped packets
# check if an interface is dropping packets
@@ -101,7 +134,7 @@ template: 1m_received_packets_rate
os: linux freebsd
hosts: *
families: *
- lookup: average -1m of received
+ lookup: average -1m unaligned of received
units: packets
every: 10s
info: the average number of packets received during the last minute
@@ -111,7 +144,7 @@ template: 10s_received_packets_storm
os: linux freebsd
hosts: *
families: *
- lookup: average -10s of received
+ lookup: average -10s unaligned of received
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
every: 10s
units: %
diff --git a/conf.d/health.d/netfilter.conf b/health/health.d/netfilter.conf
index fa1732b33..1d07752cc 100644
--- a/conf.d/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@@ -19,7 +19,7 @@
os: linux
hosts: *
lookup: max -10s unaligned of connections
- calc: $this * 100 / $netfilter.conntrack.max
+ calc: $this * 100 / $netfilter_conntrack_max
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
diff --git a/conf.d/health.d/nginx.conf b/health/health.d/nginx.conf
index a686c3d99..a686c3d99 100644
--- a/conf.d/health.d/nginx.conf
+++ b/health/health.d/nginx.conf
diff --git a/conf.d/health.d/nginx_plus.conf b/health/health.d/nginx_plus.conf
index 5a171a76d..5a171a76d 100644
--- a/conf.d/health.d/nginx_plus.conf
+++ b/health/health.d/nginx_plus.conf
diff --git a/conf.d/health.d/portcheck.conf b/health/health.d/portcheck.conf
index f42b63d30..f42b63d30 100644
--- a/conf.d/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
diff --git a/conf.d/health.d/postgres.conf b/health/health.d/postgres.conf
index 4e0583b85..4e0583b85 100644
--- a/conf.d/health.d/postgres.conf
+++ b/health/health.d/postgres.conf
diff --git a/conf.d/health.d/qos.conf b/health/health.d/qos.conf
index 7290d15ff..7290d15ff 100644
--- a/conf.d/health.d/qos.conf
+++ b/health/health.d/qos.conf
diff --git a/conf.d/health.d/ram.conf b/health/health.d/ram.conf
index b6dc5f945..4e437322c 100644
--- a/conf.d/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -3,7 +3,7 @@
alarm: used_ram_to_ignore
on: system.ram
- os: linux
+ os: linux freebsd
hosts: *
calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
every: 10s
@@ -41,7 +41,7 @@ alarm: ram_in_use
on: system.ram
os: freebsd
hosts: *
- calc: (($active + $wired) - $used_ram_to_ignore) * 100 / (($active + $wired) - $used_ram_to_ignore + $cached + $free)
+ calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive)
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (80) : (90))
@@ -54,7 +54,7 @@ delay: down 15m multiplier 1.5 max 1h
on: system.ram
os: freebsd
hosts: *
- calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $buffers)
+ calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers)
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? ( 5) : (10))
diff --git a/conf.d/health.d/redis.conf b/health/health.d/redis.conf
index c08a884a6..c08a884a6 100644
--- a/conf.d/health.d/redis.conf
+++ b/health/health.d/redis.conf
diff --git a/conf.d/health.d/retroshare.conf b/health/health.d/retroshare.conf
index 2344b60ec..2344b60ec 100644
--- a/conf.d/health.d/retroshare.conf
+++ b/health/health.d/retroshare.conf
diff --git a/conf.d/health.d/softnet.conf b/health/health.d/softnet.conf
index 77c804bfd..77c804bfd 100644
--- a/conf.d/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
diff --git a/conf.d/health.d/squid.conf b/health/health.d/squid.conf
index 06cc9678f..06cc9678f 100644
--- a/conf.d/health.d/squid.conf
+++ b/health/health.d/squid.conf
diff --git a/conf.d/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf
index e0361eb20..e0361eb20 100644
--- a/conf.d/health.d/stiebeleltron.conf
+++ b/health/health.d/stiebeleltron.conf
diff --git a/conf.d/health.d/swap.conf b/health/health.d/swap.conf
index f920b0807..f920b0807 100644
--- a/conf.d/health.d/swap.conf
+++ b/health/health.d/swap.conf
diff --git a/conf.d/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
index 7aa9a9800..7aa9a9800 100644
--- a/conf.d/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
new file mode 100644
index 000000000..552930ab7
--- /dev/null
+++ b/health/health.d/tcp_listen.conf
@@ -0,0 +1,82 @@
+#
+# There are two queues involved when incoming TCP connections are handled
+# (both in the kernel):
+#
+# SYN queue
+# The SYN queue tracks TCP handshakes until connections are fully established.
+# It overflows when too many incoming TCP connection requests hang in the
+# half-open state and the server is not configured to fall back to SYN cookies.
+# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends
+# lots of SYN packets and never completes the handshakes).
+#
+# Accept queue
+# The accept queue holds fully established TCP connections waiting to be handled
+# by the listening application. It overflows when the server application fails
+# to accept new connections at the rate they are coming in.
+#
+#
+# -----------------------------------------------------------------------------
+# tcp accept queue (at the kernel)
+
+ alarm: 1m_tcp_accept_queue_overflows
+ on: ip.tcp_accept_queue
+ os: linux
+ hosts: *
+ lookup: sum -60s unaligned absolute of ListenOverflows
+ units: overflows
+ every: 10s
+ crit: $this > 0
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of times the TCP accept queue of the kernel overflown, during the last minute
+ to: sysadmin
+
+# THIS IS TOO GENERIC
+# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
+ alarm: 1m_tcp_accept_queue_drops
+ on: ip.tcp_accept_queue
+ os: linux
+ hosts: *
+ lookup: sum -60s unaligned absolute of ListenDrops
+ units: drops
+ every: 10s
+# warn: $this > 0
+ crit: $this > (($status == $CRITICAL) ? (0) : (150))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+ to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# tcp SYN queue (at the kernel)
+
+# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or
+# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are
+# enabled or not. In both cases this probably indicates a SYN flood attack,
+# so a notification should probably be sent.
+
+ alarm: 1m_tcp_syn_queue_drops
+ on: ip.tcp_syn_queue
+ os: linux
+ hosts: *
+ lookup: sum -60s unaligned absolute of TCPReqQFullDrop
+ units: drops
+ every: 10s
+ warn: $this > 0
+ crit: $this > (($status == $CRITICAL) ? (0) : (60))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute
+ to: sysadmin
+
+ alarm: 1m_tcp_syn_queue_cookies
+ on: ip.tcp_syn_queue
+ os: linux
+ hosts: *
+ lookup: sum -60s unaligned absolute of TCPReqQFullDoCookies
+ units: cookies
+ every: 10s
+ warn: $this > 0
+ crit: $this > (($status == $CRITICAL) ? (0) : (60))
+ delay: up 0 down 5m multiplier 1.5 max 1h
+ info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute
+ to: sysadmin
+
diff --git a/conf.d/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 6927d5765..6927d5765 100644
--- a/conf.d/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
diff --git a/conf.d/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index 280d6590f..280d6590f 100644
--- a/conf.d/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
diff --git a/conf.d/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 91dad3c6a..91dad3c6a 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
diff --git a/conf.d/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 382b39658..5140228f5 100644
--- a/conf.d/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -27,7 +27,7 @@
units: errors
every: 10s
warn: $this > 0
- crit: $this > 100
+ crit: $this > (($status == $CRITICAL) ? (0) : (100))
info: number of UDP receive buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin
@@ -43,7 +43,7 @@
units: errors
every: 10s
warn: $this > 0
- crit: $this > 100
+ crit: $this > (($status == $CRITICAL) ? (0) : (100))
info: number of UDP send buffer errors during the last minute
delay: up 0 down 60m multiplier 1.2 max 2h
to: sysadmin
diff --git a/conf.d/health.d/varnish.conf b/health/health.d/varnish.conf
index cca7446b4..cca7446b4 100644
--- a/conf.d/health.d/varnish.conf
+++ b/health/health.d/varnish.conf
diff --git a/conf.d/health.d/web_log.conf b/health/health.d/web_log.conf
index d8be88b47..d8be88b47 100644
--- a/conf.d/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
diff --git a/conf.d/health.d/zfs.conf b/health/health.d/zfs.conf
index af73824e6..af73824e6 100644
--- a/conf.d/health.d/zfs.conf
+++ b/health/health.d/zfs.conf