diff options
Diffstat (limited to '')
-rw-r--r-- | health/health.d/adaptec_raid.conf | 24 | ||||
-rw-r--r-- | health/health.d/apache.conf (renamed from conf.d/health.d/apache.conf) | 0 | ||||
-rw-r--r-- | health/health.d/apcupsd.conf | 40 | ||||
-rw-r--r-- | health/health.d/backend.conf (renamed from conf.d/health.d/backend.conf) | 0 | ||||
-rw-r--r-- | health/health.d/bcache.conf | 22 | ||||
-rw-r--r-- | health/health.d/beanstalkd.conf (renamed from conf.d/health.d/beanstalkd.conf) | 0 | ||||
-rw-r--r-- | health/health.d/bind_rndc.conf (renamed from conf.d/health.d/bind_rndc.conf) | 0 | ||||
-rw-r--r-- | health/health.d/boinc.conf | 62 | ||||
-rw-r--r-- | health/health.d/btrfs.conf (renamed from conf.d/health.d/btrfs.conf) | 0 | ||||
-rw-r--r-- | health/health.d/ceph.conf (renamed from conf.d/health.d/ceph.conf) | 0 | ||||
-rw-r--r-- | health/health.d/couchdb.conf (renamed from conf.d/health.d/couchdb.conf) | 0 | ||||
-rw-r--r-- | health/health.d/cpu.conf (renamed from conf.d/health.d/cpu.conf) | 0 | ||||
-rw-r--r-- | health/health.d/disks.conf (renamed from conf.d/health.d/disks.conf) | 0 | ||||
-rw-r--r-- | health/health.d/dockerd.conf | 8 | ||||
-rw-r--r-- | health/health.d/elasticsearch.conf (renamed from conf.d/health.d/elasticsearch.conf) | 0 | ||||
-rw-r--r-- | health/health.d/entropy.conf (renamed from conf.d/health.d/entropy.conf) | 0 | ||||
-rw-r--r-- | health/health.d/fping.conf (renamed from conf.d/health.d/fping.conf) | 0 | ||||
-rw-r--r-- | health/health.d/fronius.conf (renamed from conf.d/health.d/fronius.conf) | 0 | ||||
-rw-r--r-- | health/health.d/haproxy.conf (renamed from conf.d/health.d/haproxy.conf) | 0 | ||||
-rw-r--r-- | health/health.d/httpcheck.conf (renamed from conf.d/health.d/httpcheck.conf) | 0 | ||||
-rw-r--r-- | health/health.d/ipc.conf (renamed from conf.d/health.d/ipc.conf) | 4 | ||||
-rw-r--r-- | health/health.d/ipfs.conf (renamed from conf.d/health.d/ipfs.conf) | 0 | ||||
-rw-r--r-- | health/health.d/ipmi.conf (renamed from conf.d/health.d/ipmi.conf) | 0 | ||||
-rw-r--r-- | health/health.d/isc_dhcpd.conf (renamed from conf.d/health.d/isc_dhcpd.conf) | 0 | ||||
-rw-r--r-- | health/health.d/lighttpd.conf (renamed from conf.d/health.d/lighttpd.conf) | 0 | ||||
-rw-r--r-- | health/health.d/linux_power_supply.conf | 12 | ||||
-rw-r--r-- | health/health.d/load.conf | 56 | ||||
-rw-r--r-- | health/health.d/mdstat.conf (renamed from conf.d/health.d/mdstat.conf) | 23 | ||||
-rw-r--r-- | health/health.d/megacli.conf | 48 | ||||
-rw-r--r-- | health/health.d/memcached.conf (renamed from conf.d/health.d/memcached.conf) | 0 | ||||
-rw-r--r-- | health/health.d/memory.conf (renamed from conf.d/health.d/memory.conf) | 0 | ||||
-rw-r--r-- | health/health.d/mongodb.conf (renamed from conf.d/health.d/mongodb.conf) | 0 | ||||
-rw-r--r-- | health/health.d/mysql.conf (renamed from conf.d/health.d/mysql.conf) | 15 | ||||
-rw-r--r-- | health/health.d/named.conf (renamed from conf.d/health.d/named.conf) | 0 | ||||
-rw-r--r-- | health/health.d/net.conf (renamed from conf.d/health.d/net.conf) | 37 | ||||
-rw-r--r-- | health/health.d/netfilter.conf (renamed from conf.d/health.d/netfilter.conf) | 2 | ||||
-rw-r--r-- | health/health.d/nginx.conf (renamed from conf.d/health.d/nginx.conf) | 0 | ||||
-rw-r--r-- | health/health.d/nginx_plus.conf (renamed from conf.d/health.d/nginx_plus.conf) | 0 | ||||
-rw-r--r-- | health/health.d/portcheck.conf (renamed from conf.d/health.d/portcheck.conf) | 0 | ||||
-rw-r--r-- | health/health.d/postgres.conf (renamed from conf.d/health.d/postgres.conf) | 0 | ||||
-rw-r--r-- | health/health.d/qos.conf (renamed from conf.d/health.d/qos.conf) | 0 | ||||
-rw-r--r-- | health/health.d/ram.conf (renamed from conf.d/health.d/ram.conf) | 6 | ||||
-rw-r--r-- | health/health.d/redis.conf (renamed from conf.d/health.d/redis.conf) | 0 | ||||
-rw-r--r-- | health/health.d/retroshare.conf (renamed from conf.d/health.d/retroshare.conf) | 0 | ||||
-rw-r--r-- | health/health.d/softnet.conf (renamed from conf.d/health.d/softnet.conf) | 0 | ||||
-rw-r--r-- | health/health.d/squid.conf (renamed from conf.d/health.d/squid.conf) | 0 | ||||
-rw-r--r-- | health/health.d/stiebeleltron.conf (renamed from conf.d/health.d/stiebeleltron.conf) | 0 | ||||
-rw-r--r-- | health/health.d/swap.conf (renamed from conf.d/health.d/swap.conf) | 0 | ||||
-rw-r--r-- | health/health.d/tcp_conn.conf (renamed from conf.d/health.d/tcp_conn.conf) | 0 | ||||
-rw-r--r-- | health/health.d/tcp_listen.conf | 82 | ||||
-rw-r--r-- | health/health.d/tcp_mem.conf (renamed from conf.d/health.d/tcp_mem.conf) | 0 | ||||
-rw-r--r-- | health/health.d/tcp_orphans.conf (renamed from conf.d/health.d/tcp_orphans.conf) | 0 | ||||
-rw-r--r-- | health/health.d/tcp_resets.conf (renamed from conf.d/health.d/tcp_resets.conf) | 0 | ||||
-rw-r--r-- | health/health.d/udp_errors.conf (renamed from conf.d/health.d/udp_errors.conf) | 4 | ||||
-rw-r--r-- | health/health.d/varnish.conf (renamed from conf.d/health.d/varnish.conf) | 0 | ||||
-rw-r--r-- | health/health.d/web_log.conf (renamed from conf.d/health.d/web_log.conf) | 0 | ||||
-rw-r--r-- | health/health.d/zfs.conf (renamed from conf.d/health.d/zfs.conf) | 0 |
57 files changed, 428 insertions, 17 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf new file mode 100644 index 000000000..a1301ce8a --- /dev/null +++ b/health/health.d/adaptec_raid.conf @@ -0,0 +1,24 @@ + +# logical device status check + +template: adapter_raid_ld_status + on: adapter_raid.ld_status + lookup: max -5s + units: bool + every: 10s + crit: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: at least 1 logical device is failed or degraded + to: sysadmin + +# physical device state check + +template: adapter_raid_pd_state + on: adapter_raid.pd_state + lookup: max -5s + units: bool + every: 10s + crit: $this > 0 + delay: down 5m multiplier 1.5 max 1h + info: at least 1 physical device is not in online state + to: sysadmin diff --git a/conf.d/health.d/apache.conf b/health/health.d/apache.conf index 0c98b8778..0c98b8778 100644 --- a/conf.d/health.d/apache.conf +++ b/health/health.d/apache.conf diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf new file mode 100644 index 000000000..4f86037ba --- /dev/null +++ b/health/health.d/apcupsd.conf @@ -0,0 +1,40 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +template: 10min_ups_load + on: apcupsd.load + os: * + hosts: * + lookup: average -10m unaligned of percentage + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 10m multiplier 1.5 max 1h + info: average UPS load for the last 10 minutes + to: sitemgr + +# Discussion in https://github.com/netdata/netdata/pull/3928: +# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. +template: ups_charge + on: apcupsd.charge + os: * + hosts: * + lookup: average -60s unaligned of charge + units: % + every: 60s + warn: $this < 100 + crit: $this < (($status == $CRITICAL) ? 
(60) : (50)) + delay: down 10m multiplier 1.5 max 1h + info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors + to: sitemgr + +template: apcupsd_last_collected_secs + on: apcupsd.load + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sitemgr diff --git a/conf.d/health.d/backend.conf b/health/health.d/backend.conf index 7af100d8f..7af100d8f 100644 --- a/conf.d/health.d/backend.conf +++ b/health/health.d/backend.conf diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf new file mode 100644 index 000000000..f0da9ac5e --- /dev/null +++ b/health/health.d/bcache.conf @@ -0,0 +1,22 @@ + +template: bcache_cache_errors + on: disk.bcache_cache_read_races + lookup: sum -10m unaligned absolute + units: errors + every: 1m + warn: $this > 0 + crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) ) + delay: down 1h multiplier 1.5 max 2h + info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing) + to: sysadmin + +template: bcache_cache_dirty + on: disk.bcache_cache_alloc + calc: $dirty + $metadata + $undefined + units: % + every: 1m + warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) ) + crit: $this > ( ($status >= $CRITICAL) ? 
( 90 ) : ( 95 ) ) + delay: up 1m down 1h multiplier 1.5 max 2h + info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small) + to: sysadmin diff --git a/conf.d/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf index 30dc27328..30dc27328 100644 --- a/conf.d/health.d/beanstalkd.conf +++ b/health/health.d/beanstalkd.conf diff --git a/conf.d/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf index 4145e77cd..4145e77cd 100644 --- a/conf.d/health.d/bind_rndc.conf +++ b/health/health.d/bind_rndc.conf diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf new file mode 100644 index 000000000..43c588db6 --- /dev/null +++ b/health/health.d/boinc.conf @@ -0,0 +1,62 @@ +# Alarms for various BOINC issues. + +# Warn on any compute errors encountered. +template: boinc_compute_errors + on: boinc.states + os: * + hosts: * +families: * + lookup: average -10m unaligned of comperror + units: tasks + every: 1m + warn: $this > 0 + crit: $this > 1 + delay: up 1m down 5m multiplier 1.5 max 1h + info: the total number of compute errors over the past 10 minutes + to: sysadmin + +# Warn on lots of upload errors +template: boinc_upload_errors + on: boinc.states + os: * + hosts: * +families: * + lookup: average -10m unaligned of upload_failed + units: tasks + every: 1m + warn: $this > 0 + crit: $this > 1 + delay: up 1m down 5m multiplier 1.5 max 1h + info: the average number of failed uploads over the past 10 minutes + to: sysadmin + +# Warn on the task queue being empty +template: boinc_total_tasks + on: boinc.tasks + os: * + hosts: * +families: * + lookup: average -10m unaligned of total + units: tasks + every: 1m + warn: $this < 1 + crit: $this < 0.1 + delay: up 5m down 10m multiplier 1.5 max 1h + info: the total number of locally available tasks + to: sysadmin + +# Warn on no active tasks with a non-empty queue +template: boinc_active_tasks + on: boinc.tasks + os: * + hosts: * +families: * + lookup: average 
-10m unaligned of active + calc: ($boinc_total_tasks >= 1) ? ($this) : (inf) + units: tasks + every: 1m + warn: $this < 1 + crit: $this < 0.1 + delay: up 5m down 10m multiplier 1.5 max 1h + info: the total number of active tasks + to: sysadmin diff --git a/conf.d/health.d/btrfs.conf b/health/health.d/btrfs.conf index b27aa544f..b27aa544f 100644 --- a/conf.d/health.d/btrfs.conf +++ b/health/health.d/btrfs.conf diff --git a/conf.d/health.d/ceph.conf b/health/health.d/ceph.conf index de16f7b6f..de16f7b6f 100644 --- a/conf.d/health.d/ceph.conf +++ b/health/health.d/ceph.conf diff --git a/conf.d/health.d/couchdb.conf b/health/health.d/couchdb.conf index 4a2895280..4a2895280 100644 --- a/conf.d/health.d/couchdb.conf +++ b/health/health.d/couchdb.conf diff --git a/conf.d/health.d/cpu.conf b/health/health.d/cpu.conf index fa8189856..fa8189856 100644 --- a/conf.d/health.d/cpu.conf +++ b/health/health.d/cpu.conf diff --git a/conf.d/health.d/disks.conf b/health/health.d/disks.conf index 26f85848a..26f85848a 100644 --- a/conf.d/health.d/disks.conf +++ b/health/health.d/disks.conf diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf new file mode 100644 index 000000000..729906cdb --- /dev/null +++ b/health/health.d/dockerd.conf @@ -0,0 +1,8 @@ +template: docker_unhealthy_containers + on: docker.unhealthy_containers + units: unhealthy containers + every: 10s + lookup: average -10s + crit: $this > 0 + info: number of unhealthy containers + to: sysadmin diff --git a/conf.d/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf index dffd40965..dffd40965 100644 --- a/conf.d/health.d/elasticsearch.conf +++ b/health/health.d/elasticsearch.conf diff --git a/conf.d/health.d/entropy.conf b/health/health.d/entropy.conf index 66d44ec13..66d44ec13 100644 --- a/conf.d/health.d/entropy.conf +++ b/health/health.d/entropy.conf diff --git a/conf.d/health.d/fping.conf b/health/health.d/fping.conf index 43658fef6..43658fef6 100644 --- a/conf.d/health.d/fping.conf +++ 
b/health/health.d/fping.conf diff --git a/conf.d/health.d/fronius.conf b/health/health.d/fronius.conf index cdf6c8fcb..cdf6c8fcb 100644 --- a/conf.d/health.d/fronius.conf +++ b/health/health.d/fronius.conf diff --git a/conf.d/health.d/haproxy.conf b/health/health.d/haproxy.conf index e49c70d48..e49c70d48 100644 --- a/conf.d/health.d/haproxy.conf +++ b/health/health.d/haproxy.conf diff --git a/conf.d/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index 0ddf35eab..0ddf35eab 100644 --- a/conf.d/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf diff --git a/conf.d/health.d/ipc.conf b/health/health.d/ipc.conf index 03cf264d8..989d6e912 100644 --- a/conf.d/health.d/ipc.conf +++ b/health/health.d/ipc.conf @@ -5,7 +5,7 @@ on: system.ipc_semaphores os: linux hosts: * - calc: $semaphores * 100 / $ipc.semaphores.max + calc: $semaphores * 100 / $ipc_semaphores_max units: % every: 10s warn: $this > (($status >= $WARNING) ? (70) : (80)) @@ -18,7 +18,7 @@ on: system.ipc_semaphore_arrays os: linux hosts: * - calc: $arrays * 100 / $ipc.semaphores.arrays.max + calc: $arrays * 100 / $ipc_semaphores_arrays_max units: % every: 10s warn: $this > (($status >= $WARNING) ? 
(70) : (80)) diff --git a/conf.d/health.d/ipfs.conf b/health/health.d/ipfs.conf index 3f77572d6..3f77572d6 100644 --- a/conf.d/health.d/ipfs.conf +++ b/health/health.d/ipfs.conf diff --git a/conf.d/health.d/ipmi.conf b/health/health.d/ipmi.conf index c25581964..c25581964 100644 --- a/conf.d/health.d/ipmi.conf +++ b/health/health.d/ipmi.conf diff --git a/conf.d/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf index 8054656ff..8054656ff 100644 --- a/conf.d/health.d/isc_dhcpd.conf +++ b/health/health.d/isc_dhcpd.conf diff --git a/conf.d/health.d/lighttpd.conf b/health/health.d/lighttpd.conf index 915907a4a..915907a4a 100644 --- a/conf.d/health.d/lighttpd.conf +++ b/health/health.d/lighttpd.conf diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf new file mode 100644 index 000000000..27a172a14 --- /dev/null +++ b/health/health.d/linux_power_supply.conf @@ -0,0 +1,12 @@ +# Alert on low battery capacity. + +template: linux_power_supply_capacity + on: power_supply.capacity + calc: $capacity + units: % + every: 10s + warn: $this < 10 + crit: $this < 5 + delay: up 0 down 5m multiplier 1.2 max 1h + info: the percentage remaining capacity of the power supply + to: sysadmin diff --git a/health/health.d/load.conf b/health/health.d/load.conf new file mode 100644 index 000000000..ee0c54b8e --- /dev/null +++ b/health/health.d/load.conf @@ -0,0 +1,56 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# Calculate the base trigger point for the load average alarms. +# This is the maximum number of CPUs in the system over the past 1 +# minute, with a special case for a single CPU of setting the trigger at 2. + alarm: load_trigger + on: system.load + os: linux + hosts: * + calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? 
( 2 ) : ( $active_processors ) + units: cpus + every: 1m + info: trigger point for load average alarms + +# Send alarms if the load average is unusually high. +# These intentionally _do not_ calculate the average over the sampled +# time period because the values being checked already are averages. + alarm: load_average_15 + on: system.load + os: linux + hosts: * + lookup: max -1m unaligned of load15 + units: load + every: 1m + warn: $this > (($status >= $WARNING) ? (1.75 * $load_trigger) : (2 * $load_trigger)) + crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger)) + delay: down 15m multiplier 1.5 max 1h + info: fifteen-minute load average + to: sysadmin + + alarm: load_average_5 + on: system.load + os: linux + hosts: * + lookup: max -1m unaligned of load5 + units: load + every: 1m + warn: $this > (($status >= $WARNING) ? (3.5 * $load_trigger) : (4 * $load_trigger)) + crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger)) + delay: down 15m multiplier 1.5 max 1h + info: five-minute load average + to: sysadmin + + alarm: load_average_1 + on: system.load + os: linux + hosts: * + lookup: max -1m unaligned of load1 + units: load + every: 1m + warn: $this > (($status >= $WARNING) ? (7 * $load_trigger) : (8 * $load_trigger)) + crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger)) + delay: down 15m multiplier 1.5 max 1h + info: one-minute load average + to: sysadmin diff --git a/conf.d/health.d/mdstat.conf b/health/health.d/mdstat.conf index c9e7d20db..0f5f2837e 100644 --- a/conf.d/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -1,3 +1,13 @@ +template: mdstat_last_collected + on: md.disks + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + info: number of seconds since the last successful data collection + to: sysadmin + template: mdstat_disks on: md.disks units: failed devices @@ -7,12 +17,11 @@ template: mdstat_disks info: Array is degraded! to: sysadmin -template: mdstat_last_collected - on: md.disks - calc: $now - $last_collected_t - units: seconds ago +template: mdstat_mismatch_cnt + on: md.mismatch_cnt + units: unsynchronized blocks + calc: $count every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection + crit: $this > 0 + info: Mismatch count! to: sysadmin diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf new file mode 100644 index 000000000..1881a7be1 --- /dev/null +++ b/health/health.d/megacli.conf @@ -0,0 +1,48 @@ + alarm: adapter_state + on: megacli.adapter_degraded + units: is degraded + lookup: sum -10s + every: 10s + crit: $this > 0 + info: adapter state + to: sysadmin + + template: bbu_relative_charge + on: megacli.bbu_relative_charge + units: percent + lookup: average -10s + every: 10s + warn: $this <= (($status >= $WARNING) ? (85) : (80)) + crit: $this <= (($status == $CRITICAL) ? 
(50) : (40)) + info: BBU relative state of charge + to: sysadmin + + template: bbu_cycle_count + on: megacli.bbu_cycle_count + units: cycle count + lookup: average -10s + every: 10s + warn: $this >= 100 + crit: $this >= 500 + info: BBU cycle count + to: sysadmin + + alarm: pd_media_errors + on: megacli.pd_media_error + units: media errors + lookup: sum -10s + every: 10s + warn: $this > 0 + delay: down 1m multiplier 2 max 10m + info: physical drive media errors + to: sysadmin + + alarm: pd_predictive_failures + on: megacli.pd_predictive_failure + units: predictive failures + lookup: sum -10s + every: 10s + warn: $this > 0 + delay: down 1m multiplier 2 max 10m + info: physical drive predictive failures + to: sysadmin diff --git a/conf.d/health.d/memcached.conf b/health/health.d/memcached.conf index d248ef57a..d248ef57a 100644 --- a/conf.d/health.d/memcached.conf +++ b/health/health.d/memcached.conf diff --git a/conf.d/health.d/memory.conf b/health/health.d/memory.conf index 4a0e6e522..4a0e6e522 100644 --- a/conf.d/health.d/memory.conf +++ b/health/health.d/memory.conf diff --git a/conf.d/health.d/mongodb.conf b/health/health.d/mongodb.conf index a80cb3112..a80cb3112 100644 --- a/conf.d/health.d/mongodb.conf +++ b/health/health.d/mongodb.conf diff --git a/conf.d/health.d/mysql.conf b/health/health.d/mysql.conf index 1eeb993f0..39c401915 100644 --- a/conf.d/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -60,6 +60,21 @@ template: mysql_10s_waited_locks_ratio # ----------------------------------------------------------------------------- +# connections + +template: mysql_connections + on: mysql.connections_active + calc: $active * 100 / $limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? 
(80) : (90)) + delay: down 15m multiplier 1.5 max 1h + info: the ratio of current active connections vs the maximum possible number of connections + to: dba + + +# ----------------------------------------------------------------------------- # replication template: mysql_replication diff --git a/conf.d/health.d/named.conf b/health/health.d/named.conf index 4fc65c8ee..4fc65c8ee 100644 --- a/conf.d/health.d/named.conf +++ b/health/health.d/named.conf diff --git a/conf.d/health.d/net.conf b/health/health.d/net.conf index 22a88927d..489016dd5 100644 --- a/conf.d/health.d/net.conf +++ b/health/health.d/net.conf @@ -2,6 +2,39 @@ # you can disable an alarm notification by setting the 'to' line to: silent # ----------------------------------------------------------------------------- +# net traffic overflow + + template: 1m_received_traffic_overflow + on: net.net + os: linux + hosts: * + families: * + lookup: average -1m unaligned absolute of received + calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (90)) + delay: down 1m multiplier 1.5 max 1h + info: interface received bandwidth usage over net device speed max + to: sysadmin + + template: 1m_sent_traffic_overflow + on: net.net + os: linux + hosts: * + families: * + lookup: average -1m unaligned absolute of sent + calc: ($nic_speed_max > 0) ? ($this * 100 / ($nic_speed_max * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? 
(85) : (90)) + delay: down 1m multiplier 1.5 max 1h + info: interface sent bandwidth usage over net device speed max + to: sysadmin + +# ----------------------------------------------------------------------------- # dropped packets # check if an interface is dropping packets @@ -101,7 +134,7 @@ template: 1m_received_packets_rate os: linux freebsd hosts: * families: * - lookup: average -1m of received + lookup: average -1m unaligned of received units: packets every: 10s info: the average number of packets received during the last minute @@ -111,7 +144,7 @@ template: 10s_received_packets_storm os: linux freebsd hosts: * families: * - lookup: average -10s of received + lookup: average -10s unaligned of received calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) every: 10s units: % diff --git a/conf.d/health.d/netfilter.conf b/health/health.d/netfilter.conf index fa1732b33..1d07752cc 100644 --- a/conf.d/health.d/netfilter.conf +++ b/health/health.d/netfilter.conf @@ -19,7 +19,7 @@ os: linux hosts: * lookup: max -10s unaligned of connections - calc: $this * 100 / $netfilter.conntrack.max + calc: $this * 100 / $netfilter_conntrack_max units: % every: 10s warn: $this > (($status >= $WARNING) ? 
(70) : (80)) diff --git a/conf.d/health.d/nginx.conf b/health/health.d/nginx.conf index a686c3d99..a686c3d99 100644 --- a/conf.d/health.d/nginx.conf +++ b/health/health.d/nginx.conf diff --git a/conf.d/health.d/nginx_plus.conf b/health/health.d/nginx_plus.conf index 5a171a76d..5a171a76d 100644 --- a/conf.d/health.d/nginx_plus.conf +++ b/health/health.d/nginx_plus.conf diff --git a/conf.d/health.d/portcheck.conf b/health/health.d/portcheck.conf index f42b63d30..f42b63d30 100644 --- a/conf.d/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf diff --git a/conf.d/health.d/postgres.conf b/health/health.d/postgres.conf index 4e0583b85..4e0583b85 100644 --- a/conf.d/health.d/postgres.conf +++ b/health/health.d/postgres.conf diff --git a/conf.d/health.d/qos.conf b/health/health.d/qos.conf index 7290d15ff..7290d15ff 100644 --- a/conf.d/health.d/qos.conf +++ b/health/health.d/qos.conf diff --git a/conf.d/health.d/ram.conf b/health/health.d/ram.conf index b6dc5f945..4e437322c 100644 --- a/conf.d/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -3,7 +3,7 @@ alarm: used_ram_to_ignore on: system.ram - os: linux + os: linux freebsd hosts: * calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz) every: 10s @@ -41,7 +41,7 @@ alarm: ram_in_use on: system.ram os: freebsd hosts: * - calc: (($active + $wired) - $used_ram_to_ignore) * 100 / (($active + $wired) - $used_ram_to_ignore + $cached + $free) + calc: ($active + $wired + $laundry + $buffers - $used_ram_to_ignore) * 100 / ($active + $wired + $laundry + $buffers - $used_ram_to_ignore + $cache + $free + $inactive) units: % every: 10s warn: $this > (($status >= $WARNING) ? 
(80) : (90)) @@ -54,7 +54,7 @@ delay: down 15m multiplier 1.5 max 1h on: system.ram os: freebsd hosts: * - calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $buffers) + calc: ($free + $inactive + $used_ram_to_ignore) * 100 / ($free + $active + $inactive + $wired + $cache + $laundry + $buffers) units: % every: 10s warn: $this < (($status >= $WARNING) ? ( 5) : (10)) diff --git a/conf.d/health.d/redis.conf b/health/health.d/redis.conf index c08a884a6..c08a884a6 100644 --- a/conf.d/health.d/redis.conf +++ b/health/health.d/redis.conf diff --git a/conf.d/health.d/retroshare.conf b/health/health.d/retroshare.conf index 2344b60ec..2344b60ec 100644 --- a/conf.d/health.d/retroshare.conf +++ b/health/health.d/retroshare.conf diff --git a/conf.d/health.d/softnet.conf b/health/health.d/softnet.conf index 77c804bfd..77c804bfd 100644 --- a/conf.d/health.d/softnet.conf +++ b/health/health.d/softnet.conf diff --git a/conf.d/health.d/squid.conf b/health/health.d/squid.conf index 06cc9678f..06cc9678f 100644 --- a/conf.d/health.d/squid.conf +++ b/health/health.d/squid.conf diff --git a/conf.d/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf index e0361eb20..e0361eb20 100644 --- a/conf.d/health.d/stiebeleltron.conf +++ b/health/health.d/stiebeleltron.conf diff --git a/conf.d/health.d/swap.conf b/health/health.d/swap.conf index f920b0807..f920b0807 100644 --- a/conf.d/health.d/swap.conf +++ b/health/health.d/swap.conf diff --git a/conf.d/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf index 7aa9a9800..7aa9a9800 100644 --- a/conf.d/health.d/tcp_conn.conf +++ b/health/health.d/tcp_conn.conf diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf new file mode 100644 index 000000000..552930ab7 --- /dev/null +++ b/health/health.d/tcp_listen.conf @@ -0,0 +1,82 @@ +# +# There are two queues involved when incoming TCP connections are handled +# (both at the kernel): +# +# SYN queue +# The SYN 
queue tracks TCP handshakes until connections are fully established. +# It overflows when too many incoming TCP connection requests hang in the +# half-open state and the server is not configured to fall back to SYN cookies. +# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends +# lots of SYN packets and never completes the handshakes). +# +# Accept queue +# The accept queue holds fully established TCP connections waiting to be handled +# by the listening application. It overflows when the server application fails +# to accept new connections at the rate they are coming in. +# +# +# ----------------------------------------------------------------------------- +# tcp accept queue (at the kernel) + + alarm: 1m_tcp_accept_queue_overflows + on: ip.tcp_accept_queue + os: linux + hosts: * + lookup: sum -60s unaligned absolute of ListenOverflows + units: overflows + every: 10s + crit: $this > 0 + delay: up 0 down 5m multiplier 1.5 max 1h + info: the number of times the TCP accept queue of the kernel overflown, during the last minute + to: sysadmin + +# THIS IS TOO GENERIC +# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 + alarm: 1m_tcp_accept_queue_drops + on: ip.tcp_accept_queue + os: linux + hosts: * + lookup: sum -60s unaligned absolute of ListenDrops + units: drops + every: 10s +# warn: $this > 0 + crit: $this > (($status == $CRITICAL) ? (0) : (150)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received) + to: sysadmin + + +# ----------------------------------------------------------------------------- +# tcp SYN queue (at the kernel) + +# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or +# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are +# enabled or not. 
In both cases this probably indicates a SYN flood attack, +# so I guess a notification should be sent. + + alarm: 1m_tcp_syn_queue_drops + on: ip.tcp_syn_queue + os: linux + hosts: * + lookup: sum -60s unaligned absolute of TCPReqQFullDrop + units: drops + every: 10s + warn: $this > 0 + crit: $this > (($status == $CRITICAL) ? (0) : (60)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute + to: sysadmin + + alarm: 1m_tcp_syn_queue_cookies + on: ip.tcp_syn_queue + os: linux + hosts: * + lookup: sum -60s unaligned absolute of TCPReqQFullDoCookies + units: cookies + every: 10s + warn: $this > 0 + crit: $this > (($status == $CRITICAL) ? (0) : (60)) + delay: up 0 down 5m multiplier 1.5 max 1h + info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute + to: sysadmin + diff --git a/conf.d/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf index 6927d5765..6927d5765 100644 --- a/conf.d/health.d/tcp_mem.conf +++ b/health/health.d/tcp_mem.conf diff --git a/conf.d/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf index 280d6590f..280d6590f 100644 --- a/conf.d/health.d/tcp_orphans.conf +++ b/health/health.d/tcp_orphans.conf diff --git a/conf.d/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index 91dad3c6a..91dad3c6a 100644 --- a/conf.d/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf diff --git a/conf.d/health.d/udp_errors.conf b/health/health.d/udp_errors.conf index 382b39658..5140228f5 100644 --- a/conf.d/health.d/udp_errors.conf +++ b/health/health.d/udp_errors.conf @@ -27,7 +27,7 @@ units: errors every: 10s warn: $this > 0 - crit: $this > 100 + crit: $this > (($status == $CRITICAL) ? 
(0) : (100)) info: number of UDP receive buffer errors during the last minute delay: up 0 down 60m multiplier 1.2 max 2h to: sysadmin @@ -43,7 +43,7 @@ units: errors every: 10s warn: $this > 0 - crit: $this > 100 + crit: $this > (($status == $CRITICAL) ? (0) : (100)) info: number of UDP send buffer errors during the last minute delay: up 0 down 60m multiplier 1.2 max 2h to: sysadmin diff --git a/conf.d/health.d/varnish.conf b/health/health.d/varnish.conf index cca7446b4..cca7446b4 100644 --- a/conf.d/health.d/varnish.conf +++ b/health/health.d/varnish.conf diff --git a/conf.d/health.d/web_log.conf b/health/health.d/web_log.conf index d8be88b47..d8be88b47 100644 --- a/conf.d/health.d/web_log.conf +++ b/health/health.d/web_log.conf diff --git a/conf.d/health.d/zfs.conf b/health/health.d/zfs.conf index af73824e6..af73824e6 100644 --- a/conf.d/health.d/zfs.conf +++ b/health/health.d/zfs.conf |