From f99c4526d94d3e04124c5c48ab4a3da6ca53a458 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Wed, 31 Mar 2021 14:58:11 +0200 Subject: Adding upstream version 1.30.0. Signed-off-by: Daniel Baumann --- health/Makefile.am | 1 - health/health.c | 6 +- health/health.d/adaptec_raid.conf | 16 +- health/health.d/anomalies.conf | 8 +- health/health.d/apcupsd.conf | 8 +- health/health.d/apps_plugin.conf | 15 -- health/health.d/backend.conf | 25 +- health/health.d/bcache.conf | 14 +- health/health.d/beanstalkd.conf | 10 +- health/health.d/bind_rndc.conf | 4 +- health/health.d/boinc.conf | 8 +- health/health.d/btrfs.conf | 9 +- health/health.d/ceph.conf | 13 +- health/health.d/cgroups.conf | 17 +- health/health.d/cockroachdb.conf | 6 +- health/health.d/cpu.conf | 8 +- health/health.d/dbengine.conf | 10 +- health/health.d/disks.conf | 104 ++++----- health/health.d/dns_query.conf | 2 +- health/health.d/dnsmasq_dhcp.conf | 4 +- health/health.d/dockerd.conf | 2 +- health/health.d/entropy.conf | 4 +- health/health.d/exporting.conf | 11 - health/health.d/fping.conf | 13 +- health/health.d/gearman.conf | 4 +- health/health.d/haproxy.conf | 4 +- health/health.d/hdfs.conf | 10 +- health/health.d/httpcheck.conf | 42 ++-- health/health.d/ioping.conf | 4 +- health/health.d/ipc.conf | 4 +- health/health.d/ipfs.conf | 2 +- health/health.d/ipmi.conf | 4 +- health/health.d/isc_dhcpd.conf | 20 +- health/health.d/kubelet.conf | 43 ++-- health/health.d/linux_power_supply.conf | 2 +- health/health.d/load.conf | 20 +- health/health.d/mdstat.conf | 7 +- health/health.d/megacli.conf | 68 +++--- health/health.d/memcached.conf | 13 +- health/health.d/memory.conf | 4 +- health/health.d/mysql.conf | 26 ++- health/health.d/net.conf | 97 ++++---- health/health.d/netfilter.conf | 19 +- health/health.d/pihole.conf | 10 +- health/health.d/portcheck.conf | 12 +- health/health.d/processes.conf | 6 +- health/health.d/ram.conf | 11 +- health/health.d/redis.conf | 4 +- health/health.d/retroshare.conf | 2 +- 
health/health.d/riakkv.conf | 38 +-- health/health.d/scaleio.conf | 4 +- health/health.d/softnet.conf | 13 +- health/health.d/swap.conf | 22 +- health/health.d/synchronization.conf | 12 + health/health.d/tcp_conn.conf | 4 +- health/health.d/tcp_listen.conf | 9 +- health/health.d/tcp_mem.conf | 4 +- health/health.d/tcp_orphans.conf | 4 +- health/health.d/tcp_resets.conf | 28 +-- health/health.d/udp_errors.conf | 29 +-- health/health.d/unbound.conf | 4 +- health/health.d/vcsa.conf | 24 +- health/health.d/vernemq.conf | 319 +++++++++----------------- health/health.d/vsphere.conf | 62 ++--- health/health.d/web_log.conf | 50 ++-- health/health.d/whoisquery.conf | 2 +- health/health.d/wmi.conf | 24 +- health/health.d/x509check.conf | 4 +- health/health.d/zfs.conf | 2 +- health/health.h | 2 +- health/health_config.c | 2 +- health/health_json.c | 19 +- health/health_log.c | 6 +- health/notifications/alarm-notify.sh.in | 35 ++- health/notifications/email/README.md | 2 +- health/notifications/health_alarm_notify.conf | 4 + health/notifications/stackpulse/README.md | 5 +- 77 files changed, 682 insertions(+), 807 deletions(-) delete mode 100644 health/health.d/apps_plugin.conf create mode 100644 health/health.d/synchronization.conf (limited to 'health') diff --git a/health/Makefile.am b/health/Makefile.am index 399d6df5a..0802dc750 100644 --- a/health/Makefile.am +++ b/health/Makefile.am @@ -29,7 +29,6 @@ dist_healthconfig_DATA = \ health.d/anomalies.conf \ health.d/apache.conf \ health.d/apcupsd.conf \ - health.d/apps_plugin.conf \ health.d/backend.conf \ health.d/bcache.conf \ health.d/beanstalkd.conf \ diff --git a/health/health.c b/health/health.c index b81361e8a..0793100a6 100644 --- a/health/health.c +++ b/health/health.c @@ -966,12 +966,14 @@ void *health_main(void *ptr) { } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) { if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) { if(rc->old_status == RRDCALC_STATUS_CRITICAL) { - repeat_every = rc->crit_repeat_every; + 
repeat_every = 1; } else if (rc->old_status == RRDCALC_STATUS_WARNING) { - repeat_every = rc->warn_repeat_every; + repeat_every = 1; } } } + } else { + continue; } if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) { diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf index a1301ce8a..0753c6e5d 100644 --- a/health/health.d/adaptec_raid.conf +++ b/health/health.d/adaptec_raid.conf @@ -1,24 +1,24 @@ # logical device status check -template: adapter_raid_ld_status - on: adapter_raid.ld_status - lookup: max -5s +template: adaptec_raid_ld_status + on: adaptec_raid.ld_status + lookup: max -10s foreach * units: bool every: 10s crit: $this > 0 delay: down 5m multiplier 1.5 max 1h - info: at least 1 logical device is failed or degraded + info: logical device status is failed or degraded to: sysadmin # physical device state check -template: adapter_raid_pd_state - on: adapter_raid.pd_state - lookup: max -5s +template: adaptec_raid_pd_state + on: adaptec_raid.pd_state + lookup: max -10s foreach * units: bool every: 10s crit: $this > 0 delay: down 5m multiplier 1.5 max 1h - info: at least 1 physical device is not in online state + info: physical device state is not online to: sysadmin diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf index a2d248efe..c4c96eaf9 100644 --- a/health/health.d/anomalies.conf +++ b/health/health.d/anomalies.conf @@ -1,17 +1,17 @@ # raise a warning alarm if an anomaly probability is consistently above 50% -template: anomaly_probabilities +template: anomalies_anomaly_probabilities on: anomalies.probability lookup: average -2m foreach * every: 1m warn: $this > 50 - info: average anomaly probability > 50% for last 2 minutes + info: average anomaly probability over the last 2 minutes # raise a warning alarm if an anomaly flag is consistently firing -template: anomaly_flags +template: anomalies_anomaly_flags on: anomalies.anomaly lookup: sum -2m foreach * every: 1m warn: $this 
> 10 - info: count of anomalies > 10 for last 2 minutes + info: number of anomalies in the last 2 minutes diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf index 4f86037ba..12384fac6 100644 --- a/health/health.d/apcupsd.conf +++ b/health/health.d/apcupsd.conf @@ -1,6 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent -template: 10min_ups_load +template: apcupsd_10min_ups_load on: apcupsd.load os: * hosts: * @@ -10,12 +10,12 @@ template: 10min_ups_load warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 10m multiplier 1.5 max 1h - info: average UPS load for the last 10 minutes + info: average UPS load over the last 10 minutes to: sitemgr # Discussion in https://github.com/netdata/netdata/pull/3928: # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. -template: ups_charge +template: apcupsd_ups_charge on: apcupsd.charge os: * hosts: * @@ -25,7 +25,7 @@ template: ups_charge warn: $this < 100 crit: $this < (($status == $CRITICAL) ? (60) : (50)) delay: down 10m multiplier 1.5 max 1h - info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors + info: average UPS charge over the last minute to: sitemgr template: apcupsd_last_collected_secs diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf deleted file mode 100644 index 9a27bc6ba..000000000 --- a/health/health.d/apps_plugin.conf +++ /dev/null @@ -1,15 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -# disabled due to https://github.com/netdata/netdata/issues/10327 -# -# alarm: used_file_descriptors -# on: apps.files -# hosts: * -# calc: $fdperc -# units: % -# every: 5s -# warn: $this > (($status >= $WARNING) ? (75) : (80)) -# crit: $this > (($status == $CRITICAL) ? 
(85) : (90)) -# delay: down 5m multiplier 1.5 max 1h -# info: Peak percentage of file descriptors used -# to: sysadmin diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf index e51b8aa5f..8089dc94e 100644 --- a/health/health.d/backend.conf +++ b/health/health.d/backend.conf @@ -6,7 +6,7 @@ every: 1m warn: $this > 0 delay: down 5m multiplier 1.5 max 1h - info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf. + info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf. to: sysadmin # make sure we are sending data to backend @@ -31,26 +31,3 @@ delay: down 5m multiplier 1.5 max 1h info: percentage of metrics sent to the backend server to: dba - - alarm: backend_metrics_lost - on: netdata.backend_metrics - units: metrics - calc: abs($lost) - every: 10s - crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0) - delay: down 5m multiplier 1.5 max 1h - info: number of metrics lost due to repeating failures to contact the backend server - to: dba - - -# this chart has been removed from netdata -# alarm: backend_slow -# on: netdata.backend_latency -# units: % -# calc: $latency * 100 / ($update_every * 1000) -# every: 10s -# warn: $this > 50 -# crit: $this > 100 -# delay: down 5m multiplier 1.5 max 1h -# info: the percentage of time between iterations needed by the backend time to process the data sent by netdata -# to: dba diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index f0da9ac5e..d5fccf4f7 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -1,13 +1,14 @@ template: bcache_cache_errors on: disk.bcache_cache_read_races - lookup: sum -10m unaligned absolute + lookup: sum -1m unaligned absolute units: errors every: 1m warn: $this > 0 - crit: $this > ( ($status >= $CRITICAL) ? 
(0) : (10) ) - delay: down 1h multiplier 1.5 max 2h - info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing) + delay: up 2m down 1h multiplier 1.5 max 2h + info: number of times data was read from the cache, \ + the bucket was reused and invalidated in the last 10 minutes \ + (when this occurs the data is reread from the backing device) to: sysadmin template: bcache_cache_dirty @@ -16,7 +17,8 @@ template: bcache_cache_dirty units: % every: 1m warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) delay: up 1m down 1h multiplier 1.5 max 2h - info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small) + info: percentage of cache space used for dirty data and metadata \ + (this usually means your SSD cache is too small) to: sysadmin diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf index 30dc27328..0c428ecbc 100644 --- a/health/health.d/beanstalkd.conf +++ b/health/health.d/beanstalkd.conf @@ -1,6 +1,6 @@ # get the number of buried jobs in all queues -template: server_buried_jobs +template: beanstalk_server_buried_jobs on: beanstalk.current_jobs calc: $buried units: jobs @@ -8,12 +8,14 @@ template: server_buried_jobs warn: $this > 0 crit: $this > 10 delay: up 0 down 5m multiplier 1.2 max 1h - info: the number of buried jobs aggregated across all tubes + info: number of buried jobs across all tubes. \ + You need to manually kick them so they can be processed. \ + Presence of buried jobs in a tube does not affect new jobs. 
to: sysadmin # get the number of buried jobs per queue -#template: tube_buried_jobs +#template: beanstalk_tube_buried_jobs # on: beanstalk.jobs # calc: $buried # units: jobs @@ -26,7 +28,7 @@ template: server_buried_jobs # get the current number of tubes -#template: number_of_tubes +#template: beanstalk_number_of_tubes # on: beanstalk.current_tubes # calc: $tubes # every: 10s diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf index 4145e77cd..5cc7a72f7 100644 --- a/health/health.d/bind_rndc.conf +++ b/health/health.d/bind_rndc.conf @@ -1,9 +1,9 @@ - template: bind_rndc_stats_file_size +template: bind_rndc_stats_file_size on: bind_rndc.stats_size units: megabytes every: 60 calc: $stats_size warn: $this > 512 crit: $this > 1024 - info: Bind stats file is very large! Consider to create logrotate conf file for it! + info: BIND statistics-file size to: sysadmin diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf index 43c588db6..25b7f1994 100644 --- a/health/health.d/boinc.conf +++ b/health/health.d/boinc.conf @@ -12,7 +12,7 @@ families: * warn: $this > 0 crit: $this > 1 delay: up 1m down 5m multiplier 1.5 max 1h - info: the total number of compute errors over the past 10 minutes + info: average number of compute errors over the last 10 minutes to: sysadmin # Warn on lots of upload errors @@ -27,7 +27,7 @@ families: * warn: $this > 0 crit: $this > 1 delay: up 1m down 5m multiplier 1.5 max 1h - info: the average number of failed uploads over the past 10 minutes + info: average number of failed uploads over the last 10 minutes to: sysadmin # Warn on the task queue being empty @@ -42,7 +42,7 @@ families: * warn: $this < 1 crit: $this < 0.1 delay: up 5m down 10m multiplier 1.5 max 1h - info: the total number of locally available tasks + info: average number of total tasks over the last 10 minutes to: sysadmin # Warn on no active tasks with a non-empty queue @@ -58,5 +58,5 @@ families: * warn: $this < 1 crit: $this < 0.1 delay: up 
5m down 10m multiplier 1.5 max 1h - info: the total number of active tasks + info: average number of active tasks over the last 10 minutes to: sysadmin diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf index b27aa544f..93ab8748a 100644 --- a/health/health.d/btrfs.conf +++ b/health/health.d/btrfs.conf @@ -10,7 +10,7 @@ families: * warn: $this > (($status >= $WARNING) ? (90) : (95)) crit: $this > (($status == $CRITICAL) ? (95) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of allocated BTRFS physical disk space + info: percentage of allocated BTRFS physical disk space to: sysadmin template: btrfs_data @@ -24,7 +24,7 @@ families: * warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of used BTRFS data space + info: utilization of BTRFS data space to: sysadmin template: btrfs_metadata @@ -38,7 +38,7 @@ families: * warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of used BTRFS metadata space + info: utilization of BTRFS metadata space to: sysadmin template: btrfs_system @@ -52,6 +52,5 @@ families: * warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 crit: $this > (($status == $CRITICAL) ? 
(95) : (98)) && $btrfs_allocated > 98 delay: up 1m down 15m multiplier 1.5 max 1h - info: the percentage of used BTRFS system space + info: utilization of BTRFS system space to: sysadmin - diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf index de16f7b6f..cdbab0f67 100644 --- a/health/health.d/ceph.conf +++ b/health/health.d/ceph.conf @@ -1,13 +1,12 @@ # low ceph disk available -template: cluster_space_usage +template: ceph_cluster_space_usage on: ceph.general_usage - calc: $avail * 100 / ($avail + $used) + calc: $used * 100 / ($used + $avail) units: % - every: 10s - warn: $this < 10 - crit: $this < 1 + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 5m multiplier 1.2 max 1h - info: ceph disk usage is almost full + info: cluster disk space utilization to: sysadmin - diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index 79ece53f9..c0a16f154 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -11,7 +11,7 @@ template: cgroup_10min_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: cpu utilization for the last 10 minutes + info: average cgroup CPU utilization over the last 10 minutes to: sysadmin template: cgroup_ram_in_use @@ -24,18 +24,5 @@ template: cgroup_ram_in_use warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: RAM used by cgroup - to: sysadmin - -template: cgroup_ram_and_swap_in_use - on: cgroup.mem_usage - os: linux - hosts: * - calc: ($ram + $swap) * 100 / $memory_and_swap_limit - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) - delay: down 15m multiplier 1.5 max 1h - info: RAM and Swap used by cgroup + info: cgroup memory utilization to: sysadmin diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf index 8ab2c9d0f..47773d04c 100644 --- a/health/health.d/cockroachdb.conf +++ b/health/health.d/cockroachdb.conf @@ -22,7 +22,7 @@ template: cockroachdb_used_storage_capacity warn: $this > (($status >= $WARNING) ? (80) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: entire disk usage percentage + info: storage capacity utilization to: dba template: cockroachdb_used_usable_storage_capacity @@ -33,7 +33,7 @@ template: cockroachdb_used_usable_storage_capacity warn: $this > (($status >= $WARNING) ? (80) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: usable space usage percentage + info: storage usable space utilization to: dba # Replication @@ -67,7 +67,7 @@ template: cockroachdb_open_file_descriptors_limit every: 10s warn: $this > 80 delay: down 15m multiplier 1.5 max 1h - info: open file descriptors usage percentage + info: open file descriptors utilization (against softlimit) to: dba # SQL diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf index fa8189856..32c69f8f5 100644 --- a/health/health.d/cpu.conf +++ b/health/health.d/cpu.conf @@ -11,7 +11,7 @@ template: 10min_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal) + info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) to: sysadmin template: 10min_cpu_iowait @@ -24,7 +24,7 @@ template: 10min_cpu_iowait warn: $this > (($status >= $WARNING) ? (20) : (40)) crit: $this > (($status == $CRITICAL) ? 
(40) : (50)) delay: down 15m multiplier 1.5 max 1h - info: average CPU wait I/O for the last 10 minutes + info: average CPU iowait time over the last 10 minutes to: sysadmin template: 20min_steal_cpu @@ -37,7 +37,7 @@ template: 20min_steal_cpu warn: $this > (($status >= $WARNING) ? (5) : (10)) crit: $this > (($status == $CRITICAL) ? (20) : (30)) delay: down 1h multiplier 1.5 max 2h - info: average CPU steal time for the last 20 minutes + info: average CPU steal time over the last 20 minutes to: sysadmin ## FreeBSD @@ -51,5 +51,5 @@ template: 10min_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: average cpu utilization for the last 10 minutes (excluding nice) + info: average CPU utilization over the last 10 minutes (excluding nice) to: sysadmin diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf index 274673e3e..3e51d37ec 100644 --- a/health/health.d/dbengine.conf +++ b/health/health.d/dbengine.conf @@ -10,7 +10,7 @@ lookup: sum -10m unaligned of fs_errors every: 10s crit: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc) to: sysadmin alarm: 10min_dbengine_global_io_errors @@ -22,7 +22,7 @@ lookup: sum -10m unaligned of io_errors every: 10s crit: $this > 0 delay: down 1h multiplier 1.5 max 3h - info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc) + info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc) to: sysadmin alarm: 10min_dbengine_global_flushing_warnings @@ -34,7 +34,8 @@ lookup: sum -10m unaligned of pg_cache_over_half_dirty_events every: 10s warn: $this > 0 delay: down 1h multiplier 1.5 max 3h - info: number of 
times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks + info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \ + Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks. to: sysadmin alarm: 10min_dbengine_global_flushing_errors @@ -46,5 +47,6 @@ lookup: sum -10m unaligned of flushing_pressure_deletions every: 10s crit: $this != 0 delay: down 1h multiplier 1.5 max 3h - info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks + info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \ + Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks. to: sysadmin diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 9c194ced2..d0cd60cfc 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -20,7 +20,7 @@ families: !/dev !/dev/* !/run !/run/* * warn: $this > (($status >= $WARNING ) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h - info: current disk space usage + info: disk space utilization to: sysadmin template: disk_inode_usage @@ -34,7 +34,7 @@ families: !/dev !/dev/* !/run !/run/* * warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h - info: current disk inode usage + info: disk inode utilization to: sysadmin @@ -49,35 +49,35 @@ families: !/dev !/dev/* !/run !/run/* * # we will use it in the next template to find # the hours remaining -template: disk_fill_rate - on: disk.space - os: linux freebsd - hosts: * -families: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: GB/hour - info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour +# template: disk_fill_rate +# on: disk.space +# os: linux freebsd +# hosts: * +# families: * +# lookup: min -10m at -50m unaligned of avail +# calc: ($this - $avail) / (($now - $after) / 3600) +# every: 1m +# units: GB/hour +# info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour # calculate the hours remaining # if the disk continues to fill # in this rate -template: out_of_disk_space_time - on: disk.space - os: linux freebsd - hosts: * -families: * - calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour - to: sysadmin +# template: out_of_disk_space_time +# on: disk.space +# os: linux freebsd +# hosts: * +# families: * +# calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) +# units: hours +# every: 10s +# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) +# crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) +# delay: down 15m multiplier 1.2 max 1h +# info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour +# to: sysadmin # ----------------------------------------------------------------------------- @@ -91,34 +91,34 @@ families: * # we will use it in the next template to find # the hours remaining -template: disk_inode_rate - on: disk.inodes - os: linux freebsd - hosts: * -families: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: inodes/hour - info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour +# template: disk_inode_rate +# on: disk.inodes +# os: linux freebsd +# hosts: * +# families: * +# lookup: min -10m at -50m unaligned of avail +# calc: ($this - $avail) / (($now - $after) / 3600) +# every: 1m +# units: inodes/hour +# info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour # calculate the hours remaining # if the disk inodes are allocated # in this rate -template: out_of_disk_inodes_time - on: disk.inodes - os: linux freebsd - hosts: * -families: * - calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.2 max 1h - info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour - to: sysadmin +# template: out_of_disk_inodes_time +# on: disk.inodes +# os: linux freebsd +# hosts: * +# families: * +# calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) +# units: hours +# every: 10s +# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) +# crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) +# delay: down 15m multiplier 1.2 max 1h +# info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour +# to: sysadmin # ----------------------------------------------------------------------------- @@ -141,8 +141,8 @@ families: * warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h - info: the percentage of time the disk was busy, during the last 10 minutes - to: sysadmin + info: average percentage of time the disk was busy over the last 10 minutes + to: silent # raise an alarm if the disk backlog @@ -163,5 +163,5 @@ families: * warn: $this > $green * (($status >= $WARNING) ? (0.7) : (1)) crit: $this > $red * (($status == $CRITICAL) ? (0.7) : (1)) delay: down 15m multiplier 1.2 max 1h - info: average of the kernel estimated disk backlog, for the last 10 minutes - to: sysadmin + info: average disk backlog size over the last 10 minutes + to: silent diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf index 113c950e6..64770b986 100644 --- a/health/health.d/dns_query.conf +++ b/health/health.d/dns_query.conf @@ -8,5 +8,5 @@ template: dns_query_time_query_time every: 10s warn: $this == nan delay: up 20s down 5m multiplier 1.5 max 1h - info: query round trip time + info: average DNS query round trip time over the last 10 seconds to: sysadmin diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf index ecf3b84a8..dff1f07d4 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/health/health.d/dnsmasq_dhcp.conf @@ -6,7 +6,7 @@ template: dnsmasq_dhcp_dhcp_range_utilization units: % calc: $used warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) delay: down 5m - info: dhcp-range utilization above threshold! 
+ info: DHCP range utilization to: sysadmin diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf index 729906cdb..122d82b8a 100644 --- a/health/health.d/dockerd.conf +++ b/health/health.d/dockerd.conf @@ -4,5 +4,5 @@ template: docker_unhealthy_containers every: 10s lookup: average -10s crit: $this > 0 - info: number of unhealthy containers + info: average number of unhealthy docker containers over the last 10 seconds to: sysadmin diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf index 66d44ec13..0be9d45ba 100644 --- a/health/health.d/entropy.conf +++ b/health/health.d/entropy.conf @@ -7,10 +7,10 @@ on: system.entropy os: linux hosts: * - lookup: min -10m unaligned + lookup: min -5m unaligned units: entries every: 5m warn: $this < (($status >= $WARNING) ? (200) : (100)) delay: down 1h multiplier 1.5 max 2h - info: minimum entries in the random numbers pool in the last 10 minutes + info: minimum number of entries in the random numbers pool in the last 5 minutes to: silent diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf index 506cb0cf7..735fb5ae7 100644 --- a/health/health.d/exporting.conf +++ b/health/health.d/exporting.conf @@ -21,14 +21,3 @@ families: * delay: down 5m multiplier 1.5 max 1h info: percentage of metrics sent to the external database server to: dba - -template: exporting_metrics_lost -families: * - on: exporting_data_size - units: metrics - calc: abs($lost) - every: 10s - crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0) - delay: down 5m multiplier 1.5 max 1h - info: number of metrics lost due to repeating failures to contact the external database server - to: dba diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf index 43658fef6..92c1525bd 100644 --- a/health/health.d/fping.conf +++ b/health/health.d/fping.conf @@ -11,18 +11,18 @@ families: * info: number of seconds since the last successful data collection to: sysadmin -template: host_reachable 
+template: fping_host_reachable families: * on: fping.latency calc: $average != nan units: up/down every: 10s crit: $this == 0 - info: states if the remote host is reachable delay: down 30m multiplier 1.5 max 2h + info: reachability status of the network host (0: unreachable, 1: reachable) to: sysadmin -template: host_latency +template: fping_host_latency families: * on: fping.latency lookup: average -10s unaligned of average @@ -32,11 +32,11 @@ families: * red: 1000 warn: $this > $green OR $max > $red crit: $this > $red - info: average round trip delay during the last 10 seconds delay: down 30m multiplier 1.5 max 2h + info: average latency to the network host over the last 10 seconds to: sysadmin -template: packet_loss +template: fping_packet_loss families: * on: fping.quality lookup: average -10m unaligned of returned @@ -47,7 +47,6 @@ families: * every: 10s warn: $this > $green crit: $this > $red - info: packet loss percentage delay: down 30m multiplier 1.5 max 2h + info: packet loss ratio to the network host over the last 10 minutes to: sysadmin - diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf index e3863ae5e..d148f7b7c 100644 --- a/health/health.d/gearman.conf +++ b/health/health.d/gearman.conf @@ -18,5 +18,5 @@ template: gearman_workers_queued warn: $this > 30000 crit: $this > 100000 delay: down 5m multiplier 1.5 max 1h - info: number of queued jobs - to: sysadmin \ No newline at end of file + info: average number of queued jobs over the last 10 minutes + to: sysadmin diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf index e49c70d48..9cd070668 100644 --- a/health/health.d/haproxy.conf +++ b/health/health.d/haproxy.conf @@ -4,7 +4,7 @@ template: haproxy_backend_server_status every: 10s lookup: average -10s crit: $this > 0 - info: number of failed haproxy backend servers + info: average number of failed haproxy backend servers over the last 10 seconds to: sysadmin template: haproxy_backend_status @@ -13,7 +13,7 
@@ template: haproxy_backend_status every: 10s lookup: average -10s crit: $this > 0 - info: number of failed haproxy backends + info: average number of failed haproxy backends over the last 10 seconds to: sysadmin template: haproxy_last_collected diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf index 678faab4c..7345df4d2 100644 --- a/health/health.d/hdfs.conf +++ b/health/health.d/hdfs.conf @@ -23,7 +23,7 @@ template: hdfs_capacity_usage warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? (80) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used capacity + info: summary datanodes space capacity utilization to: sysadmin @@ -36,7 +36,7 @@ template: hdfs_missing_blocks every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: missing blocks + info: number of missing blocks to: sysadmin @@ -47,7 +47,7 @@ template: hdfs_stale_nodes every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: stale data nodes + info: number of datanodes marked stale due to delayed heartbeat to: sysadmin @@ -58,7 +58,7 @@ template: hdfs_dead_nodes every: 10s crit: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: dead data nodes + info: number of datanodes which are currently dead to: sysadmin @@ -71,5 +71,5 @@ template: hdfs_num_failed_volumes every: 10s warn: $this > 0 delay: down 15m multiplier 1.5 max 1h - info: failed volumes + info: number of failed volumes to: sysadmin diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf index 0ddf35eab..0158f63eb 100644 --- a/health/health.d/httpcheck.conf +++ b/health/health.d/httpcheck.conf @@ -11,17 +11,17 @@ families: * to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges -template: web_service_up +template: httpcheck_web_service_up families: * on: httpcheck.status lookup: average -1m unaligned percentage of success calc: ($this < 75) ? 
(0) : ($this) every: 5s units: up/down - info: at least 75% verified responses during last 60 seconds, ideal for badges + info: average ratio of successful HTTP requests over the last minute (at least 75%) to: silent -template: web_service_bad_content +template: httpcheck_web_service_bad_content families: * on: httpcheck.status lookup: average -5m unaligned percentage of bad_content @@ -30,11 +30,11 @@ families: * warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of unexpected http response content during the last 5 minutes + info: average ratio of HTTP responses with unexpected content over the last 5 minutes options: no-clear-notification to: webmaster -template: web_service_bad_status +template: httpcheck_web_service_bad_status families: * on: httpcheck.status lookup: average -5m unaligned percentage of bad_status @@ -43,57 +43,57 @@ families: * warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of unexpected http status during the last 5 minutes + info: average ratio of HTTP responses with unexpected status over the last 5 minutes options: no-clear-notification to: webmaster -template: web_service_timeouts +template: httpcheck_web_service_timeouts families: * on: httpcheck.status lookup: average -5m unaligned percentage of timeout every: 10s units: % - info: average of timeouts during the last 5 minutes + info: average ratio of HTTP request timeouts over the last 5 minutes -template: no_web_service_connections +template: httpcheck_no_web_service_connections families: * on: httpcheck.status lookup: average -5m unaligned percentage of no_connection every: 10s units: % - info: average of failed requests during the last 5 minutes + info: average ratio of failed requests during the last 5 minutes # combined timeout & no connection alarm -template: web_service_unreachable +template: httpcheck_web_service_unreachable families: * on: httpcheck.status - calc: 
($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts) + calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts) units: % every: 10s - warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40) - crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40 + warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40) + crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of failed requests either due to timeouts or no connection during the last 5 minutes + info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes options: no-clear-notification to: webmaster -template: 1h_web_service_response_time +template: httpcheck_1h_web_service_response_time families: * on: httpcheck.responsetime lookup: average -1h unaligned of time every: 30s units: ms - info: average response time over the last hour + info: average HTTP response time over the last hour -template: web_service_slow +template: httpcheck_web_service_slow families: * on: httpcheck.responsetime lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_web_service_response_time * 2) ) - crit: ($this > ($1h_web_service_response_time * 3) ) - info: average response time over the last 3 minutes, compared to the average over the last hour + warn: ($this > ($httpcheck_1h_web_service_response_time * 2) ) + crit: ($this > ($httpcheck_1h_web_service_response_time * 3) ) delay: down 5m multiplier 1.5 max 1h + info: average HTTP response time over the last 3 minutes, compared to the average over the last 
hour options: no-clear-notification to: webmaster diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 59a5c8edc..fa0196ef8 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -1,4 +1,4 @@ -template: disk_latency +template: ioping_disk_latency families: * on: ioping.latency lookup: average -10s unaligned of average @@ -8,6 +8,6 @@ families: * red: 1000 warn: $this > $green OR $max > $red crit: $this > $red - info: average round trip delay during the last 10 seconds delay: down 30m multiplier 1.5 max 2h + info: average I/O latency over the last 10 seconds to: sysadmin diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf index 989d6e912..f4a0f56da 100644 --- a/health/health.d/ipc.conf +++ b/health/health.d/ipc.conf @@ -11,7 +11,7 @@ warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? (70) : (90)) delay: down 5m multiplier 1.5 max 1h - info: the percentage of IPC semaphores used + info: IPC semaphore utilization to: sysadmin alarm: semaphore_arrays_used @@ -24,5 +24,5 @@ warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? (70) : (90)) delay: down 5m multiplier 1.5 max 1h - info: the percentage of IPC semaphore arrays used + info: IPC semaphore arrays utilization to: sysadmin diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf index 3f77572d6..fd53c2c46 100644 --- a/health/health.d/ipfs.conf +++ b/health/health.d/ipfs.conf @@ -7,5 +7,5 @@ template: ipfs_datastore_usage warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: ipfs Datastore close to running out of space + info: IPFS datastore utilization to: sysadmin diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf index c25581964..563d7a7ea 100644 --- a/health/health.d/ipmi.conf +++ b/health/health.d/ipmi.conf @@ -6,7 +6,7 @@ warn: $this > 0 crit: $critical > 0 delay: up 5m down 15m multiplier 1.5 max 1h - info: the number IPMI sensors in non-nominal state + info: number of IPMI sensors in non-nominal state to: sysadmin alarm: ipmi_events @@ -16,5 +16,5 @@ every: 10s warn: $this > 0 delay: up 5m down 15m multiplier 1.5 max 1h - info: the number of events in the IPMI System Event Log (SEL) + info: number of events in the IPMI System Event Log (SEL) to: sysadmin diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf index 8054656ff..d1f93969a 100644 --- a/health/health.d/isc_dhcpd.conf +++ b/health/health.d/isc_dhcpd.conf @@ -1,10 +1,10 @@ - template: isc_dhcpd_leases_size - on: isc_dhcpd.leases_total - units: KB - every: 60 - calc: $leases_size - warn: $this > 3072 - crit: $this > 6144 - delay: up 2m down 5m - info: dhcpd.leases file too big! Module can slow down your server. - to: sysadmin +# template: isc_dhcpd_leases_size +# on: isc_dhcpd.leases_total +# units: KB +# every: 60 +# calc: $leases_size +# warn: $this > 3072 +# crit: $this > 6144 +# delay: up 2m down 5m +# info: dhcpd.leases file too big! Module can slow down your server. +# to: sysadmin diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf index d2ef24b58..5eda59b2c 100644 --- a/health/health.d/kubelet.conf +++ b/health/health.d/kubelet.conf @@ -4,26 +4,26 @@ # True (1) if the node is experiencing a configuration-related error, false (0) otherwise. 
- template: node_config_error + template: kubelet_node_config_error on: k8s_kubelet.kubelet_node_config_error calc: $kubelet_node_config_error units: bool every: 10s warn: $this == 1 delay: down 1m multiplier 1.5 max 2h - info: the node is experiencing a configuration-related error + info: the node is experiencing a configuration-related error (0: false, 1: true) to: sysadmin # Failed Token() requests to the alternate token source - template: token_requests + template: kubelet_token_requests lookup: sum -10s of token_fail_count on: k8s_kubelet.kubelet_token_requests units: failed requests every: 10s warn: $this > 0 delay: down 1m multiplier 1.5 max 2h - info: failed token requests to alternate token source + info: number of failed Token() requests to the alternate token source to: sysadmin # Docker and runtime operation errors @@ -35,7 +35,7 @@ every: 10s warn: $this > (($status >= $WARNING) ? (0) : (20)) delay: up 30s down 1m multiplier 1.5 max 2h - info: operations error + info: number of Docker or runtime operation errors to: sysadmin # ----------------------------------------------------------------------------- @@ -53,63 +53,66 @@ # quantile 0.5 -template: 1m_kubelet_pleg_relist_latency_quantile_05 +template: kubelet_1m_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -1m unaligned of kubelet_pleg_relist_latency_05 units: microseconds every: 10s - info: the average value of pleg relisting latency during the last minute (quantile 0.5) + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5) -template: 10s_kubelet_pleg_relist_latency_quantile_05 +template: kubelet_10s_pleg_relist_latency_quantile_05 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -10s unaligned of kubelet_pleg_relist_latency_05 - calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_05 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_05)) + calc: $this * 
100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) every: 10s units: % warn: $this > (($status >= $WARNING)?(100):(200)) crit: $this > (($status >= $WARNING)?(200):(400)) delay: down 1m multiplier 1.5 max 2h - info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.5) + info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.5) to: sysadmin # quantile 0.9 -template: 1m_kubelet_pleg_relist_latency_quantile_09 +template: kubelet_1m_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -1m unaligned of kubelet_pleg_relist_latency_09 units: microseconds every: 10s - info: the average value of pleg relisting latency during the last minute (quantile 0.9) + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9) -template: 10s_kubelet_pleg_relist_latency_quantile_09 +template: kubelet_10s_pleg_relist_latency_quantile_09 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -10s unaligned of kubelet_pleg_relist_latency_09 - calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_09 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_09)) + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(400)) crit: $this > (($status >= $WARNING)?(400):(800)) delay: down 1m multiplier 1.5 max 2h - info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.9) + info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.9) to: sysadmin # quantile 0.99 -template: 1m_kubelet_pleg_relist_latency_quantile_099 
+template: kubelet_1m_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -1m unaligned of kubelet_pleg_relist_latency_099 units: microseconds every: 10s - info: the average value of pleg relisting latency during the last minute (quantile 0.99) + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99) -template: 10s_kubelet_pleg_relist_latency_quantile_099 +template: kubelet_10s_pleg_relist_latency_quantile_099 on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds lookup: average -10s unaligned of kubelet_pleg_relist_latency_099 - calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_099 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_099)) + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) every: 10s units: % warn: $this > (($status >= $WARNING)?(400):(800)) crit: $this > (($status >= $WARNING)?(800):(1200)) delay: down 1m multiplier 1.5 max 2h - info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.99) + info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.99) to: sysadmin diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf index 38727be2f..a27ea0722 100644 --- a/health/health.d/linux_power_supply.conf +++ b/health/health.d/linux_power_supply.conf @@ -8,5 +8,5 @@ template: linux_power_supply_capacity warn: $this < 10 crit: $this < 5 delay: up 30s down 5m multiplier 1.2 max 1h - info: the percentage remaining capacity of the power supply + info: percentage of remaining power supply capacity to: sysadmin diff --git a/health/health.d/load.conf b/health/health.d/load.conf index ee0c54b8e..ffaea1723 100644 --- a/health/health.d/load.conf +++ b/health/health.d/load.conf @@ -4,18 +4,19 @@ # 
Calculate the base trigger point for the load average alarms. # This is the maximum number of CPU's in the system over the past 1 # minute, with a special case for a single CPU of setting the trigger at 2. - alarm: load_trigger + alarm: load_cpu_number on: system.load os: linux hosts: * calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors ) units: cpus every: 1m - info: trigger point for load average alarms + info: number of active CPU cores in the system # Send alarms if the load average is unusually high. # These intentionally _do not_ calculate the average over the sampled # time period because the values being checked already are averages. + alarm: load_average_15 on: system.load os: linux @@ -23,10 +24,9 @@ lookup: max -1m unaligned of load15 units: load every: 1m - warn: $this > (($status >= $WARNING) ? (1.75 * $load_trigger) : (2 * $load_trigger)) - crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger)) + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) delay: down 15m multiplier 1.5 max 1h - info: fifteen-minute load average + info: system fifteen-minute load average to: sysadmin alarm: load_average_5 @@ -36,10 +36,9 @@ lookup: max -1m unaligned of load5 units: load every: 1m - warn: $this > (($status >= $WARNING) ? (3.5 * $load_trigger) : (4 * $load_trigger)) - crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger)) + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400) delay: down 15m multiplier 1.5 max 1h - info: five-minute load average + info: system five-minute load average to: sysadmin alarm: load_average_1 @@ -49,8 +48,7 @@ lookup: max -1m unaligned of load1 units: load every: 1m - warn: $this > (($status >= $WARNING) ? (7 * $load_trigger) : (8 * $load_trigger)) - crit: $this > (($status == $CRITICAL) ? 
(14 * $load_trigger) : (16 * $load_trigger)) + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800) delay: down 15m multiplier 1.5 max 1h - info: one-minute load average + info: system one-minute load average to: sysadmin diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index 2f906e187..ca2d0d9fb 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -14,7 +14,8 @@ template: mdstat_disks every: 10s calc: $down crit: $this > 0 - info: Array is degraded! + info: number of devices in the down state. \ + Any number > 0 indicates that the array is degraded. to: sysadmin template: mdstat_mismatch_cnt @@ -24,7 +25,7 @@ template: mdstat_mismatch_cnt every: 60s warn: $this > 1024 delay: up 30m - info: Mismatch count! + info: number of unsynchronized blocks to: sysadmin template: mdstat_nonredundant_last_collected @@ -35,4 +36,4 @@ template: mdstat_nonredundant_last_collected warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) info: number of seconds since the last successful data collection - to: sysadmin \ No newline at end of file + to: sysadmin diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf index 6e81a2a0e..f861765d2 100644 --- a/health/health.d/megacli.conf +++ b/health/health.d/megacli.conf @@ -1,48 +1,56 @@ -template: adapter_state + +## Adapters (controllers) + +template: megacli_adapter_state on: megacli.adapter_degraded - units: is degraded - lookup: sum -10s + lookup: max -10s foreach * + units: boolean every: 10s crit: $this > 0 - info: adapter state + delay: down 5m multiplier 2 max 10m + info: adapter is in the degraded state (0: false, 1: true) + to: sysadmin + +## Physical Disks + +template: megacli_pd_predictive_failures + on: megacli.pd_predictive_failure + lookup: sum -10s foreach * + units: predictive failures + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + info: number of physical drive predictive failures + to: sysadmin + +template: megacli_pd_media_errors + on: megacli.pd_media_error + lookup: sum -10s foreach * + units: media errors + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + info: number of physical drive media errors to: sysadmin -template: bbu_relative_charge +## Battery Backup Units (BBU) + +template: megacli_bbu_relative_charge on: megacli.bbu_relative_charge - units: percent lookup: average -10s + units: percent every: 10s warn: $this <= (($status >= $WARNING) ? (85) : (80)) crit: $this <= (($status == $CRITICAL) ? 
(50) : (40)) - info: BBU relative state of charge + info: average battery backup unit (BBU) relative state of charge over the last 10 seconds to: sysadmin -template: bbu_cycle_count +template: megacli_bbu_cycle_count on: megacli.bbu_cycle_count - units: cycle count lookup: average -10s + units: cycles every: 10s warn: $this >= 100 crit: $this >= 500 - info: BBU cycle count - to: sysadmin - -template: pd_media_errors - on: megacli.pd_media_error - units: media errors - lookup: sum -10s - every: 10s - warn: $this > 0 - delay: down 1m multiplier 2 max 10m - info: physical drive media errors - to: sysadmin - -template: pd_predictive_failures - on: megacli.pd_predictive_failure - units: predictive failures - lookup: sum -10s - every: 10s - warn: $this > 0 - delay: down 1m multiplier 2 max 10m - info: physical drive predictive failures + info: average battery backup unit (BBU) charge cycles count over the last 10 seconds to: sysadmin diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf index d248ef57a..e610f181f 100644 --- a/health/health.d/memcached.conf +++ b/health/health.d/memcached.conf @@ -23,30 +23,31 @@ template: memcached_cache_memory_usage warn: $this > (($status >= $WARNING) ? (70) : (80)) crit: $this > (($status == $CRITICAL) ? 
(80) : (90)) delay: up 0 down 15m multiplier 1.5 max 1h - info: current cache memory usage + info: cache memory utilization to: dba # find the rate memcached cache is filling -template: cache_fill_rate +template: memcached_cache_fill_rate on: memcached.cache lookup: min -10m at -50m unaligned of available calc: ($this - $available) / (($now - $after) / 3600) units: KB/hour every: 1m - info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour + info: average rate the cache fills up (positive), or frees up (negative) space over the last hour # find the hours remaining until memcached cache is full -template: out_of_cache_space_time +template: memcached_out_of_cache_space_time on: memcached.cache - calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf) + calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf) units: hours every: 10s warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) delay: down 15m multiplier 1.5 max 1h - info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour + info: estimated time the cache will run out of space \ + if the system continues to add data at the same rate as the past hour to: dba diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf index 4a0e6e522..e95c0aad8 100644 --- a/health/health.d/memory.conf +++ b/health/health.d/memory.conf @@ -10,7 +10,7 @@ every: 1m warn: $this > 0 delay: down 1h multiplier 1.5 max 1h - info: number of ECC correctable errors during the last hour + info: number of ECC correctable errors in the last 10 minutes to: sysadmin alarm: 1hour_ecc_memory_uncorrectable @@ -22,7 +22,7 @@ every: 1m crit: $this > 0 delay: down 1h multiplier 1.5 max 1h - info: number of ECC uncorrectable errors during the last hour + info: number of ECC uncorrectable errors in the last 10 minutes to: sysadmin alarm: 1hour_memory_hw_corrupted diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf index 62cef5a2e..7451b3f4d 100644 --- a/health/health.d/mysql.conf +++ b/health/health.d/mysql.conf @@ -24,7 +24,7 @@ template: mysql_10s_slow_queries warn: $this > (($status >= $WARNING) ? (5) : (10)) crit: $this > (($status == $CRITICAL) ? 
(10) : (20)) delay: down 5m multiplier 1.5 max 1h - info: number of mysql slow queries over the last 10 seconds + info: number of slow queries in the last 10 seconds to: dba @@ -36,7 +36,7 @@ template: mysql_10s_table_locks_immediate lookup: sum -10s absolute of immediate units: immediate locks every: 10s - info: number of table immediate locks over the last 10 seconds + info: number of table immediate locks in the last 10 seconds to: dba template: mysql_10s_table_locks_waited @@ -44,7 +44,7 @@ template: mysql_10s_table_locks_waited lookup: sum -10s absolute of waited units: waited locks every: 10s - info: number of table waited locks over the last 10 seconds + info: number of table waited locks in the last 10 seconds to: dba template: mysql_10s_waited_locks_ratio @@ -55,7 +55,7 @@ template: mysql_10s_waited_locks_ratio warn: $this > (($status >= $WARNING) ? (10) : (25)) crit: $this > (($status == $CRITICAL) ? (25) : (50)) delay: down 30m multiplier 1.5 max 1h - info: the ratio of mysql waited table locks, for the last 10 seconds + info: ratio of waited table locks over the last 10 seconds to: dba @@ -70,7 +70,7 @@ template: mysql_connections warn: $this > (($status >= $WARNING) ? (60) : (70)) crit: $this > (($status == $CRITICAL) ? (80) : (90)) delay: down 15m multiplier 1.5 max 1h - info: the ratio of current active connections vs the maximum possible number of connections + info: client connections utilization to: dba @@ -84,7 +84,7 @@ template: mysql_replication every: 10s crit: $this == 0 delay: down 5m multiplier 1.5 max 1h - info: checks if mysql replication has stopped + info: replication status (0: stopped, 1: working) to: dba template: mysql_replication_lag @@ -95,7 +95,8 @@ template: mysql_replication_lag warn: $this > (($status >= $WARNING) ? (5) : (10)) crit: $this > (($status == $CRITICAL) ? 
(10) : (30)) delay: down 15m multiplier 1.5 max 1h - info: the number of seconds mysql replication is behind this master + info: difference between the timestamp of the latest transaction processed by the SQL thread and \ + the timestamp of the same transaction when it was processed on the master to: dba @@ -107,7 +108,7 @@ template: mysql_galera_cluster_size_max_2m lookup: max -2m absolute units: nodes every: 10s - info: max cluster size 2 minute + info: maximum galera cluster size in the last 2 minutes to: dba template: mysql_galera_cluster_size @@ -118,7 +119,7 @@ template: mysql_galera_cluster_size warn: $this > $mysql_galera_cluster_size_max_2m crit: $this < $mysql_galera_cluster_size_max_2m delay: up 20s down 5m multiplier 1.5 max 1h - info: cluster size has changed + info: current galera cluster size, compared to the maximum size in the last 2 minutes to: dba # galera node state @@ -130,7 +131,8 @@ template: mysql_galera_cluster_state warn: $this < 4 crit: $this < 2 delay: up 30s down 5m multiplier 1.5 max 1h - info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced) + info: galera node state \ + (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced) to: dba @@ -142,5 +144,7 @@ template: mysql_galera_cluster_status every: 10s crit: $mysql_galera_cluster_state != nan AND $this != 0 delay: up 30s down 5m multiplier 1.5 max 1h - info: node and cluster status (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected) + info: galera node cluster component status \ + (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \ + Any other value than primary indicates that the node is part of a nonoperational component. to: dba diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 261290e51..33202421f 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -12,7 +12,7 @@ calc: ( $nic_speed_max > 0 ) ? 
( $nic_speed_max) : ( nan ) units: Mbit every: 10s - info: The current speed of the physical network interface + info: network interface current speed template: 1m_received_traffic_overflow on: net.net @@ -20,13 +20,12 @@ hosts: * families: * lookup: average -1m unaligned absolute of received - calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan ) units: % every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (90)) - delay: down 1m multiplier 1.5 max 1h - info: interface received bandwidth usage over net device speed max + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + info: average inbound utilization for the network interface over the last minute to: sysadmin template: 1m_sent_traffic_overflow @@ -35,13 +34,12 @@ hosts: * families: * lookup: average -1m unaligned absolute of sent - calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan ) units: % every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (90)) - delay: down 1m multiplier 1.5 max 1h - info: interface sent bandwidth usage over net device speed max + warn: $this > (($status >= $WARNING) ? 
(85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + info: average outbound utilization for the network interface over the last minute to: sysadmin # ----------------------------------------------------------------------------- @@ -58,56 +56,76 @@ template: inbound_packets_dropped on: net.drops os: linux hosts: * -families: * +families: !net* * lookup: sum -10m unaligned absolute of inbound units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface inbound dropped packets in the last 10 minutes - to: sysadmin + info: number of inbound dropped packets for the network interface in the last 10 minutes template: outbound_packets_dropped on: net.drops os: linux hosts: * -families: * +families: !net* * lookup: sum -10m unaligned absolute of outbound units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface outbound dropped packets in the last 10 minutes - to: sysadmin + info: number of outbound dropped packets for the network interface in the last 10 minutes template: inbound_packets_dropped_ratio on: net.packets os: linux hosts: * -families: * +families: !net* !wl* * lookup: sum -10m unaligned absolute of received - calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0)) + calc: (($inbound_packets_dropped != nan AND $this > 1000) ? 
($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of inbound dropped packets for the network interface over the last 10 minutes to: sysadmin template: outbound_packets_dropped_ratio on: net.packets os: linux hosts: * -families: * +families: !net* !wl* * lookup: sum -10m unaligned absolute of sent - calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0)) + calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of outbound dropped packets for the network interface over the last 10 minutes + to: sysadmin + +template: wifi_inbound_packets_dropped_ratio + on: net.packets + os: linux + hosts: * +families: wl* + lookup: sum -10m unaligned absolute of received + calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of inbound dropped packets for the network interface over the last 10 minutes + to: sysadmin + +template: wifi_outbound_packets_dropped_ratio + on: net.packets + os: linux + hosts: * +families: wl* + lookup: sum -10m unaligned absolute of sent + calc: (($outbound_packets_dropped != nan AND $this > 1000) ? 
($outbound_packets_dropped * 100 / $this) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of outbound dropped packets for the network interface over the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- @@ -123,7 +141,7 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface inbound errors in the last 10 minutes + info: number of inbound errors for the network interface in the last 10 minutes to: sysadmin template: interface_outbound_errors @@ -136,7 +154,7 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface outbound errors in the last 10 minutes + info: number of outbound errors for the network interface in the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- @@ -157,7 +175,7 @@ families: * every: 1m warn: $this > 0 delay: down 1h multiplier 1.5 max 2h - info: interface fifo errors in the last 10 minutes + info: number of FIFO errors for the network interface in the last 10 minutes to: sysadmin # ----------------------------------------------------------------------------- @@ -177,7 +195,7 @@ families: * lookup: average -1m unaligned of received units: packets every: 10s - info: the average number of packets received during the last minute + info: average number of packets received by the network interface over the last minute template: 10s_received_packets_storm on: net.packets @@ -189,7 +207,8 @@ families: * every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status >= $WARNING)?(5000):(6000)) + crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification - info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent) + info: ratio of average 
number of received packets for the network interface over the last 10 seconds, \ + compared to the rate over the last minute to: sysadmin diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf index 1d07752cc..f827d8e46 100644 --- a/health/health.d/netfilter.conf +++ b/health/health.d/netfilter.conf @@ -1,19 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent - alarm: netfilter_last_collected_secs - on: netfilter.conntrack_sockets - os: linux - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - alarm: netfilter_conntrack_full on: netfilter.conntrack_sockets os: linux @@ -22,8 +9,8 @@ calc: $this * 100 / $netfilter_conntrack_max units: % every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (80) : (90)) + warn: $this > (($status >= $WARNING) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) delay: down 5m multiplier 1.5 max 1h - info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size + info: netfilter connection tracker table size utilization to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index b255d35f9..f450b7122 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -20,9 +20,9 @@ template: pihole_blocked_queries units: % calc: $blocked warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) - crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) ) + crit: $this > ( ($status == $CRITICAL) ? 
( 55 ) : ( 75 ) ) delay: up 2m down 5m - info: percentage of blocked dns queries for the last 24 hour + info: percentage of blocked dns queries over the last 24 hours to: sysadmin @@ -36,7 +36,7 @@ template: pihole_blocklist_last_update calc: $ago warn: $this > 60 * 60 * 24 * 8 crit: $this > 60 * 60 * 24 * 8 * 2 - info: blocklist last update time + info: gravity.list (blocklist) file last update time to: sysadmin # Gravity file check (gravity.list). @@ -48,7 +48,7 @@ template: pihole_blocklist_gravity_file calc: $file_exists crit: $this != 1 delay: up 2m down 5m - info: gravity file existence + info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists) to: sysadmin # Pi-hole's ability to block unwanted domains. @@ -61,5 +61,5 @@ template: pihole_status calc: $enabled warn: $this != 1 delay: up 2m down 5m - info: unwanted domains blocking status + info: unwanted domains blocking status (0: disabled, 1: enabled) to: sysadmin diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf index 696333fd8..29dcebbc7 100644 --- a/health/health.d/portcheck.conf +++ b/health/health.d/portcheck.conf @@ -11,17 +11,17 @@ families: * to: sysadmin # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges -template: service_reachable +template: portcheck_service_reachable families: * on: portcheck.status lookup: average -1m unaligned percentage of success calc: ($this < 75) ? 
(0) : ($this) every: 5s units: up/down - info: at least 75% successful connections during last 60 seconds, ideal for badges + info: average ratio of successful connections over the last minute (at least 75%) to: silent -template: connection_timeouts +template: portcheck_connection_timeouts families: * on: portcheck.status lookup: average -5m unaligned percentage of timeout @@ -30,10 +30,10 @@ families: * warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of timeouts during the last 5 minutes + info: average ratio of timeouts over the last 5 minutes to: sysadmin -template: connection_fails +template: portcheck_connection_fails families: * on: portcheck.status lookup: average -5m unaligned percentage of no_connection,failed @@ -42,5 +42,5 @@ families: * warn: $this >= 10 AND $this < 40 crit: $this >= 40 delay: down 5m multiplier 1.5 max 1h - info: average of failed connections during the last 5 minutes + info: average ratio of failed connections over the last 5 minutes to: sysadmin diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf index 293f1aa0d..b464d8f64 100644 --- a/health/health.d/processes.conf +++ b/health/health.d/processes.conf @@ -6,8 +6,8 @@ calc: $active * 100 / $pidmax units: % every: 5s - warn: $this > (($status >= $WARNING) ? (75) : (80)) - crit: $this > (($status == $CRITICAL) ? (85) : (90)) + warn: $this > (($status >= $WARNING) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (95)) delay: down 5m multiplier 1.5 max 1h - info: the percentage of active processes + info: system process IDs (PID) space utilization to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 0a71dac84..2daecc489 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -7,7 +7,8 @@ hosts: * calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min) every: 10s - info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) + info: amount of memory reported as used, \ + but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) alarm: ram_in_use on: system.ram @@ -20,7 +21,7 @@ warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: system RAM used + info: system memory utilization to: sysadmin alarm: ram_available @@ -33,7 +34,7 @@ warn: $this < (($status >= $WARNING) ? (15) : (10)) crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) delay: down 15m multiplier 1.5 max 1h - info: estimated amount of RAM available for userspace processes, without causing swapping + info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin ## FreeBSD @@ -47,7 +48,7 @@ warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: system RAM usage + info: system memory utilization to: sysadmin alarm: ram_available @@ -60,5 +61,5 @@ warn: $this < (($status >= $WARNING) ? (15) : (10)) crit: $this < (($status == $CRITICAL) ? 
(10) : ( 5)) delay: down 15m multiplier 1.5 max 1h - info: estimated amount of RAM available for userspace processes, without causing swapping + info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf index c08a884a6..43f98a1d4 100644 --- a/health/health.d/redis.conf +++ b/health/health.d/redis.conf @@ -18,7 +18,7 @@ families: * every: 10s crit: $rdb_last_bgsave_status != 0 units: ok/failed - info: states if redis bgsave is working + info: status of the last RDB save operation (0: ok, 1: error) delay: down 5m multiplier 1.5 max 1h to: dba @@ -29,6 +29,6 @@ families: * warn: $rdb_bgsave_in_progress > 600 crit: $rdb_bgsave_in_progress > 1200 units: seconds - info: the time redis needs to save its database + info: duration of the on-going RDB save operation delay: down 5m multiplier 1.5 max 1h to: dba diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf index 2344b60ec..51b1deb4c 100644 --- a/health/health.d/retroshare.conf +++ b/health/health.d/retroshare.conf @@ -21,5 +21,5 @@ template: retroshare_dht_working warn: $this < (($status >= $WARNING) ? (120) : (100)) crit: $this < (($status == $CRITICAL) ? (10) : (1)) delay: up 0 down 15m multiplier 1.5 max 1h - info: Checks if the DHT has enough peers to operate + info: number of DHT peers to: sysadmin diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf index 745302778..d63460264 100644 --- a/health/health.d/riakkv.conf +++ b/health/health.d/riakkv.conf @@ -1,5 +1,5 @@ # Ensure that Riak is running. template: riak_last_collected_secs -template: riak_last_collected_secs +template: riakkv_last_collected_secs on: riak.kv.throughput calc: $now - $last_collected_t units: seconds ago @@ -11,7 +11,7 @@ template: riak_last_collected_secs to: dba # Warn if a list keys operation is running. 
-template: riak_list_keys_active +template: riakkv_list_keys_active on: riak.core.fsm_active calc: $list_fsm_active units: state machines @@ -23,44 +23,50 @@ template: riak_list_keys_active ## Timing healthchecks # KV GET -template: 1h_kv_get_mean_latency +template: riakkv_1h_kv_get_mean_latency on: riak.kv.latency.get calc: $node_get_fsm_time_mean lookup: average -1h unaligned of time every: 30s units: ms - info: mean average KV GET latency over the last hour + info: average time between reception of client GET request and \ + subsequent response to client over the last hour -template: riak_kv_get_slow +template: riakkv_kv_get_slow on: riak.kv.latency.get calc: $mean lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_kv_get_mean_latency * 2) ) - crit: ($this > ($1h_kv_get_mean_latency * 3) ) - info: average KV GET time over the last 3 minutes, compared to the average over the last hour + warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) ) + info: average time between reception of client GET request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour delay: down 5m multiplier 1.5 max 1h to: dba # KV PUT -template: 1h_kv_put_mean_latency +template: riakkv_1h_kv_put_mean_latency on: riak.kv.latency.put calc: $node_put_fsm_time_mean lookup: average -1h unaligned of time every: 30s units: ms - info: mean average KV PUT latency over the last hour + info: average time between reception of client PUT request and \ + subsequent response to the client over the last hour -template: riak_kv_put_slow +template: riakkv_kv_put_slow on: riak.kv.latency.put calc: $mean lookup: average -3m unaligned of time units: ms every: 10s - warn: ($this > ($1h_kv_put_mean_latency * 2) ) - crit: ($this > ($1h_kv_put_mean_latency * 3) ) - info: average KV PUT time over the last 3 minutes, compared to the average over the last hour + warn: ($this > 
($riakkv_1h_kv_put_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) ) + info: average time between reception of client PUT request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour delay: down 5m multiplier 1.5 max 1h to: dba @@ -69,12 +75,12 @@ template: riak_kv_put_slow # Default Erlang VM process limit: 262144 # On systems observed, this is < 2000, but may grow depending on load. -template: riak_vm_high_process_count +template: riakkv_vm_high_process_count on: riak.vm calc: $sys_process_count units: processes every: 10s warn: $this > 10000 crit: $this > 100000 - info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144) + info: number of processes running in the Erlang VM to: dba diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf index 1a3088a2a..ab9771bb4 100644 --- a/health/health.d/scaleio.conf +++ b/health/health.d/scaleio.conf @@ -22,7 +22,7 @@ template: scaleio_storage_pool_capacity_utilization warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: Storage Pool capacity utilization + info: storage pool capacity utilization to: sysadmin @@ -34,5 +34,5 @@ template: scaleio_sdc_mdm_connection_state every: 10s warn: $this != 1 delay: up 30s down 5m multiplier 1.5 max 1h - info: Sdc connection to MDM state + info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected) to: sysadmin diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf index f835f2aee..f761e4a01 100644 --- a/health/health.d/softnet.conf +++ b/health/health.d/softnet.conf @@ -12,7 +12,8 @@ every: 10s warn: $this > (($status >= $WARNING) ? 
(0) : (10)) delay: down 1h multiplier 1.5 max 2h - info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets) + info: average number of dropped packets in the last minute \ + due to exceeded net.core.netdev_max_backlog to: sysadmin alarm: 1min_netdev_budget_ran_outs @@ -24,7 +25,9 @@ every: 10s warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h - info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets) + info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \ + net.core.netdev_budget_usecs with work remaining over the last minute \ + (this can be a cause for dropped packets) to: silent alarm: 10min_netisr_backlog_exceeded @@ -34,7 +37,9 @@ lookup: average -1m unaligned absolute of qdrops units: packets every: 10s - warn: $this > (($status >+ $WARNING) ? (0) : (10)) + warn: $this > (($status >= $WARNING) ? (0) : (10)) delay: down 1h multiplier 1.5 max 2h - info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets) + info: average number of drops in the last minute \ + due to exceeded sysctl net.route.netisr_maxqlen \ + (this can be a cause for dropped packets) to: sysadmin diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf index f920b0807..66c36c13c 100644 --- a/health/health.d/swap.conf +++ b/health/health.d/swap.conf @@ -10,23 +10,9 @@ calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) units: % of RAM every: 1m - warn: $this > (($status >= $WARNING) ? (10) : (20)) - crit: $this > (($status == $CRITICAL) ? 
(20) : (30)) - delay: up 0 down 15m multiplier 1.5 max 1h - info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM - to: sysadmin - - alarm: ram_in_swap - on: system.swap - os: linux - hosts: * - calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - units: % of RAM - every: 10s - warn: $this > (($status >= $WARNING) ? (15) : (20)) - crit: $this > (($status == $CRITICAL) ? (40) : (50)) - delay: up 30s down 15m multiplier 1.5 max 1h - info: the swap memory used, as a percentage of the system RAM + warn: $this > (($status >= $WARNING) ? (20) : (30)) + delay: down 15m multiplier 1.5 max 1h + info: percentage of the system RAM swapped in the last 30 minutes to: sysadmin alarm: used_swap @@ -39,5 +25,5 @@ warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: up 30s down 15m multiplier 1.5 max 1h - info: the percentage of swap memory used + info: swap memory utilization to: sysadmin diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf new file mode 100644 index 000000000..417624adb --- /dev/null +++ b/health/health.d/synchronization.conf @@ -0,0 +1,12 @@ + alarm: sync_freq + on: mem.sync + lookup: sum -1m of sync + units: calls + plugin: ebpf.plugin + every: 1m + warn: $this > 6 + delay: up 1m down 10m multiplier 1.5 max 1h + info: number of sync() system calls. \ + Every call causes all pending modifications to filesystem metadata and \ + cached file data to be written to the underlying filesystems. + to: sysadmin diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf index 7aa9a9800..38b1062dc 100644 --- a/health/health.d/tcp_conn.conf +++ b/health/health.d/tcp_conn.conf @@ -13,7 +13,7 @@ units: % every: 10s warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) - crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 )) + crit: $this > (($status == $CRITICAL) ? 
( 80 ) : ( 90 )) delay: up 0 down 5m multiplier 1.5 max 1h - info: the percentage of IPv4 TCP connections over the max allowed + info: IPv4 TCP connections utilization to: sysadmin diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf index 3b3072577..dad462ebf 100644 --- a/health/health.d/tcp_listen.conf +++ b/health/health.d/tcp_listen.conf @@ -28,7 +28,7 @@ warn: $this > 1 crit: $this > (($status == $CRITICAL) ? (1) : (5)) delay: up 0 down 5m multiplier 1.5 max 1h - info: the average number of times the TCP accept queue of the kernel overflown, during the last minute + info: average number of overflows in the TCP accept queue over the last minute to: sysadmin # THIS IS TOO GENERIC @@ -43,7 +43,7 @@ warn: $this > 1 crit: $this > (($status == $CRITICAL) ? (1) : (5)) delay: up 0 down 5m multiplier 1.5 max 1h - info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received) + info: average number of dropped packets in the TCP accept queue over the last minute to: sysadmin @@ -65,7 +65,8 @@ warn: $this > 1 crit: $this > (($status == $CRITICAL) ? (0) : (5)) delay: up 10 down 5m multiplier 1.5 max 1h - info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute + info: average number of SYN requests dropped due to the full TCP SYN queue over the last minute \ + (SYN cookies were not enabled) to: sysadmin alarm: 1m_tcp_syn_queue_cookies @@ -78,6 +79,6 @@ warn: $this > 1 crit: $this > (($status == $CRITICAL) ? 
(0) : (5)) delay: up 10 down 5m multiplier 1.5 max 1h - info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute + info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute to: sysadmin diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf index 6927d5765..29d4ad68b 100644 --- a/health/health.d/tcp_mem.conf +++ b/health/health.d/tcp_mem.conf @@ -14,7 +14,7 @@ units: % every: 10s warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) - crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) + crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) delay: up 0 down 5m multiplier 1.5 max 1h - info: the amount of TCP memory as a percentage of its max memory limit + info: TCP memory utilization to: sysadmin diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf index 280d6590f..17ff7a956 100644 --- a/health/health.d/tcp_orphans.conf +++ b/health/health.d/tcp_orphans.conf @@ -15,7 +15,7 @@ units: % every: 10s warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) - crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 )) + crit: $this > (($status == $CRITICAL) ? 
( 25 ) : ( 50 )) delay: up 0 down 5m multiplier 1.5 max 1h - info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors) + info: orphan IPv4 TCP sockets utilization to: sysadmin diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index 36a550a5d..af2a75252 100644 --- a/health/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf @@ -1,21 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent -# ----------------------------------------------------------------------------- - - alarm: ipv4_tcphandshake_last_collected_secs - on: ipv4.tcphandshake - os: linux freebsd - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: up 0 down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # ----------------------------------------------------------------------------- # tcp resets this host sends @@ -26,7 +11,7 @@ lookup: average -1m at -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s - info: average TCP RESETS this host is sending, over the last minute + info: average number of sent TCP RESETS over the last minute alarm: 10s_ipv4_tcp_resets_sent on: ipv4.tcphandshake @@ -38,7 +23,10 @@ warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20))) delay: up 20s down 60m multiplier 1.2 max 2h options: no-clear-notification - info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent) + info: average number of sent TCP RESETS over the last 10 seconds. 
\ + This can indicate a port scan, \ + or that a service running on this host has crashed. \ + Netdata will not send a clear notification for this alarm. to: sysadmin # ----------------------------------------------------------------------------- @@ -51,7 +39,7 @@ lookup: average -1m at -10s unaligned absolute of AttemptFails units: tcp resets/s every: 10s - info: average TCP RESETS this host is sending, over the last minute + info: average number of received TCP RESETS over the last minute alarm: 10s_ipv4_tcp_resets_received on: ipv4.tcphandshake @@ -63,5 +51,7 @@ warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) delay: up 20s down 60m multiplier 1.2 max 2h options: no-clear-notification - info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent) + info: average number of received TCP RESETS over the last 10 seconds. \ + This can be an indication that a service this host needs has crashed. \ + Netdata will not send a clear notification for this alarm. to: sysadmin diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf index 1e47b5c8b..4836d6310 100644 --- a/health/health.d/udp_errors.conf +++ b/health/health.d/udp_errors.conf @@ -1,21 +1,6 @@ # you can disable an alarm notification by setting the 'to' line to: silent -# ----------------------------------------------------------------------------- - - alarm: ipv4_udperrors_last_collected_secs - on: ipv4.udperrors - os: linux freebsd - hosts: * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: up 0 down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sysadmin - # ----------------------------------------------------------------------------- # UDP receive buffer errors @@ -26,10 +11,9 @@ lookup: average -1m unaligned absolute of RcvbufErrors units: errors every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (0) : (10)) - info: average number of UDP receive buffer errors during the last minute - delay: up 0 down 60m multiplier 1.2 max 2h + warn: $this > (($status >= $WARNING) ? (0) : (10)) + info: average number of UDP receive buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h to: sysadmin # ----------------------------------------------------------------------------- @@ -42,8 +26,7 @@ lookup: average -1m unaligned absolute of SndbufErrors units: errors every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (0) : (10)) - info: number of UDP send buffer errors during the last minute - delay: up 0 down 60m multiplier 1.2 max 2h + warn: $this > (($status >= $WARNING) ? 
(0) : (10)) + info: average number of UDP send buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h to: sysadmin diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf index bdedc11a0..567baf188 100644 --- a/health/health.d/unbound.conf +++ b/health/health.d/unbound.conf @@ -21,7 +21,7 @@ template: unbound_request_list_overwritten every: 10s warn: $this > 5 delay: up 10 down 5m multiplier 1.5 max 1h - info: the number of overwritten queries in the request-list + info: number of overwritten queries in the request-list to: sysadmin template: unbound_request_list_dropped @@ -31,5 +31,5 @@ template: unbound_request_list_dropped every: 10s warn: $this > 0 delay: up 10 down 5m multiplier 1.5 max 1h - info: the number of dropped queries in the request-list + info: number of dropped queries in the request-list to: sysadmin diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf index 7bb98a9ba..f4b03d4cf 100644 --- a/health/health.d/vcsa.conf +++ b/health/health.d/vcsa.conf @@ -27,7 +27,8 @@ template: vcsa_system_health warn: ($this == 1) || ($this == 2) crit: $this == 3 delay: down 1m multiplier 1.5 max 1h - info: overall system health status + info: overall system health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin # Components health: @@ -45,7 +46,8 @@ template: vcsa_swap_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: swap health status + info: swap health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_storage_health @@ -56,7 +58,8 @@ template: vcsa_storage_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: storage health status + info: storage health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_mem_health @@ -67,7 +70,8 @@ template: vcsa_mem_health warn: 
$this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: mem health status + info: memory health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_load_health @@ -78,7 +82,8 @@ template: vcsa_load_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: load health status + info: load health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_database_storage_health @@ -89,7 +94,8 @@ template: vcsa_database_storage_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: database storage health status + info: database storage health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin template: vcsa_applmgmt_health @@ -100,7 +106,8 @@ template: vcsa_applmgmt_health warn: $this == 1 crit: ($this == 2) || ($this == 3) delay: down 1m multiplier 1.5 max 1h - info: appl mgmt health status + info: applmgmt health status \ + (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey) to: sysadmin @@ -118,5 +125,6 @@ template: vcsa_software_updates_health warn: $this == 4 crit: $this == 3 delay: down 1m multiplier 1.5 max 1h - info: software packages health status + info: software updates availability status \ + (-1: unknown, 0: green, 2: orange, 3: red, 4: grey) to: sysadmin diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf index 36bbaf82b..9598dd39c 100644 --- a/health/health.d/vernemq.conf +++ b/health/health.d/vernemq.conf @@ -18,10 +18,10 @@ template: vernemq_socket_errors on: vernemq.socket_errors lookup: sum -1m unaligned absolute of socket_error units: errors - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 2h - info: socket errors in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + info: number of socket errors in the last minute to: sysadmin # Queues dropped/expired/unhandled PUBLISH messages @@ -30,30 +30,30 @@ template: vernemq_queue_message_drop on: vernemq.queue_undelivered_messages lookup: sum -1m unaligned absolute of queue_message_drop units: dropped messages - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 2h - info: dropped messaged due to full queues in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of dropped messages due to full queues in the last minute to: sysadmin template: vernemq_queue_message_expired on: vernemq.queue_undelivered_messages lookup: sum -1m unaligned absolute of queue_message_expired units: expired messages - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (15)) - delay: down 5m multiplier 1.5 max 2h - info: messages which expired before delivery in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (15)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of messages which expired before delivery in the last minute to: sysadmin template: vernemq_queue_message_unhandled on: vernemq.queue_undelivered_messages lookup: sum -1m unaligned absolute of queue_message_unhandled units: unhandled messages - every: 10s - warn: $this > (($status == $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 2h - info: unhandled messages (connections with clean session=true) in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of unhandled messages (connections with clean session=true) in the last minute to: sysadmin # Erlang VM @@ -66,19 +66,19 @@ template: vernemq_average_scheduler_utilization warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? 
(85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: average scheduler utilization for the last 10 minutes + info: average scheduler utilization over the last 10 minutes to: sysadmin # Cluster communication and netsplits template: vernemq_cluster_dropped on: vernemq.cluster_dropped - lookup: average -1m unaligned - units: KiB/s - every: 10s + lookup: sum -1m unaligned + units: KiB + every: 1m warn: $this > 0 - delay: down 5m multiplier 1.5 max 1h - info: the amount of traffic dropped during communication with the cluster nodes in the last minute + delay: up 5m down 5m multiplier 1.5 max 1h + info: amount of traffic dropped during communication with the cluster nodes in the last minute to: sysadmin template: vernemq_netsplits @@ -88,68 +88,41 @@ template: vernemq_netsplits every: 10s warn: $this > 0 delay: down 5m multiplier 1.5 max 2h - info: detected netsplits in the last minute + info: number of detected netsplits (split brain situation) in the last minute to: sysadmin # Unsuccessful CONNACK -template: vernemq_mqtt_connack_sent_reason_success - on: vernemq.mqtt_connack_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v3/v5 CONNACK sent in the last minute - to: sysadmin - template: vernemq_mqtt_connack_sent_reason_unsuccessful on: vernemq.mqtt_connack_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_connack_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v3/v5 CONNACK sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute to: sysadmin # Not normal DISCONNECT -template: vernemq_mqtt_disconnect_received_reason_normal_disconnect - on: vernemq.mqtt_disconnect_received_reason - lookup: sum -1m unaligned absolute match-names of normal_disconnect - units: packets - every: 10s - info: normal v5 DISCONNECT received in the last minute - to: sysadmin - -template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect - on: vernemq.mqtt_disconnect_sent_reason - lookup: sum -1m unaligned absolute match-names of normal_disconnect - units: packets - every: 10s - info: normal v5 DISCONNECT sent in the last minute - to: sysadmin - template: vernemq_mqtt_disconnect_received_reason_not_normal on: vernemq.mqtt_disconnect_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect + lookup: sum -1m unaligned absolute match-names of !normal_disconnect,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: not normal v5 DISCONNECT received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received not normal v5 DISCONNECT packets in the last minute to: sysadmin template: vernemq_mqtt_disconnect_sent_reason_not_normal on: vernemq.mqtt_disconnect_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect + lookup: sum -1m unaligned absolute match-names of !normal_disconnect,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: not normal v5 DISCONNECT sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent not normal v5 DISCONNECT packets in the last minute to: sysadmin # SUBSCRIBE errors and unauthorized attempts @@ -158,20 +131,20 @@ template: vernemq_mqtt_subscribe_error on: vernemq.mqtt_subscribe_error lookup: sum -1m unaligned absolute units: failed ops - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: failed v3/v5 SUBSCRIBE operations in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of failed v3/v5 SUBSCRIBE operations in the last minute to: sysadmin template: vernemq_mqtt_subscribe_auth_error on: vernemq.mqtt_subscribe_auth_error lookup: sum -1m unaligned absolute units: attempts - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute to: sysadmin # UNSUBSCRIBE errors @@ -180,10 +153,10 @@ template: vernemq_mqtt_unsubscribe_error on: vernemq.mqtt_unsubscribe_error lookup: sum -1m unaligned absolute units: failed ops - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: failed v3/v5 UNSUBSCRIBE operations in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute to: sysadmin # PUBLISH errors and unauthorized attempts @@ -192,208 +165,136 @@ template: vernemq_mqtt_publish_errors on: vernemq.mqtt_publish_errors lookup: sum -1m unaligned absolute units: failed ops - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: failed v3/v5 PUBLISH operations in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of failed v3/v5 PUBLISH operations in the last minute to: sysadmin template: vernemq_mqtt_publish_auth_errors on: vernemq.mqtt_publish_auth_errors lookup: sum -1m unaligned absolute units: attempts - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unauthorized v3/v5 PUBLISH attempts in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of unauthorized v3/v5 PUBLISH attempts in the last minute to: sysadmin # Unsuccessful and unexpected PUBACK -template: vernemq_mqtt_puback_received_reason_success - on: vernemq.mqtt_puback_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBACK received in the last minute - to: sysadmin - -template: vernemq_mqtt_puback_sent_reason_success - on: vernemq.mqtt_puback_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBACK sent in the last minute - to: sysadmin - template: vernemq_mqtt_puback_received_reason_unsuccessful on: vernemq.mqtt_puback_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_puback_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBACK received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBACK packets in the last minute to: sysadmin template: vernemq_mqtt_puback_sent_reason_unsuccessful on: vernemq.mqtt_puback_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_puback_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBACK sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBACK packets in the last minute to: sysadmin template: vernemq_mqtt_puback_unexpected on: vernemq.mqtt_puback_invalid_error lookup: sum -1m unaligned absolute units: messages - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unexpected v3/v5 PUBACK received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unexpected v3/v5 PUBACK packets in the last minute to: sysadmin # Unsuccessful and unexpected PUBREC -template: vernemq_mqtt_pubrec_received_reason_success - on: vernemq.mqtt_pubrec_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREC received in the last minute - to: sysadmin - -template: vernemq_mqtt_pubrec_sent_reason_success - on: vernemq.mqtt_pubrec_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREC sent in the last minute - to: sysadmin - template: vernemq_mqtt_pubrec_received_reason_unsuccessful on: vernemq.mqtt_pubrec_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrec_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBREC received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBREC packets in the last minute to: sysadmin template: vernemq_mqtt_pubrec_sent_reason_unsuccessful on: vernemq.mqtt_pubrec_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrec_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBREC sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBREC packets in the last minute to: sysadmin template: vernemq_mqtt_pubrec_invalid_error on: vernemq.mqtt_pubrec_invalid_error lookup: sum -1m unaligned absolute units: messages - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unexpected v3 PUBREC received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unexpected v3 PUBREC packets in the last minute to: sysadmin # Unsuccessful PUBREL -template: vernemq_mqtt_pubrel_received_reason_success - on: vernemq.mqtt_pubrel_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREL received in the last minute - to: sysadmin - -template: vernemq_mqtt_pubrel_sent_reason_success - on: vernemq.mqtt_pubrel_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBREL sent in the last minute - to: sysadmin - template: vernemq_mqtt_pubrel_received_reason_unsuccessful on: vernemq.mqtt_pubrel_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrel_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBREL received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBREL packets in the last minute to: sysadmin template: vernemq_mqtt_pubrel_sent_reason_unsuccessful on: vernemq.mqtt_pubrel_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubrel_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBREL sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBREL packets in the last minute to: sysadmin # Unsuccessful and unexpected PUBCOMP -template: vernemq_mqtt_pubcomp_received_reason_success - on: vernemq.mqtt_pubcomp_received_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBCOMP received in the last minute - to: sysadmin - -template: vernemq_mqtt_pubcomp_sent_reason_success - on: vernemq.mqtt_pubcomp_sent_reason - lookup: sum -1m unaligned absolute match-names of success - units: packets - every: 10s - info: successful v5 PUBCOMP sent in the last minute - to: sysadmin - template: vernemq_mqtt_pubcomp_received_reason_unsuccessful on: vernemq.mqtt_pubcomp_received_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubcomp_received_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBCOMP received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unsuccessful v5 PUBCOMP packets in the last minute to: sysadmin template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful on: vernemq.mqtt_pubcomp_sent_reason - lookup: sum -1m unaligned absolute - calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success + lookup: sum -1m unaligned absolute match-names of !success,* units: packets - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unsuccessful v5 PUBCOMP sent in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of sent unsuccessful v5 PUBCOMP packets in the last minute to: sysadmin template: vernemq_mqtt_pubcomp_unexpected on: vernemq.mqtt_pubcomp_invalid_error lookup: sum -1m unaligned absolute units: messages - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - info: unexpected v3/v5 PUBCOMP received in the last minute + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 5m down 5m multiplier 1.5 max 2h + info: number of received unexpected v3/v5 PUBCOMP packets in the last minute to: sysadmin diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf index d8b2be190..3e1414c16 100644 --- a/health/health.d/vsphere.conf +++ b/health/health.d/vsphere.conf @@ -13,7 +13,7 @@ template: vsphere_vm_mem_usage warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used RAM + info: virtual machine memory utilization # -----------------------------------------------HOST Specific---------------------------------------------------------- # Memory @@ -27,7 +27,7 @@ template: vsphere_host_mem_usage warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used RAM + info: host memory utilization # Network errors @@ -38,10 +38,7 @@ families: * lookup: sum -10m unaligned absolute match-names of rx units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface inbound dropped packets in the last 10 minutes - to: sysadmin + info: number of inbound errors for the network interface in the last 10 minutes template: vsphere_outbound_packets_errors on: vsphere.net_errors_total @@ -50,10 +47,7 @@ families: * lookup: sum -10m unaligned absolute match-names of tx units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface outbound dropped packets in the last 10 minutes - to: sysadmin + info: number of outbound errors for the network interface in the last 10 minutes # Network errors ratio @@ -62,13 +56,12 @@ template: vsphere_inbound_packets_errors_ratio hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx - calc: (($vsphere_inbound_packets_errors != nan AND $this > 0) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0)) + calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of inbound errors vs the total number of received packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of inbound errors for the network interface over the last 10 minutes to: sysadmin template: vsphere_outbound_packets_errors_ratio @@ -76,13 +69,12 @@ template: vsphere_outbound_packets_errors_ratio hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx - calc: (($vsphere_outbound_packets_errors != nan AND $this > 0) ? 
($vsphere_outbound_packets_errors * 100 / $this) : (0)) + calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of outbound errors vs the total number of sent packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of outbound errors for the network interface over the last 10 minutes to: sysadmin # -----------------------------------------------Common------------------------------------------------------------------- @@ -97,7 +89,7 @@ template: vsphere_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: cpu utilization for the last 10 minutes + info: average CPU utilization to: sysadmin # Network drops @@ -109,10 +101,7 @@ families: * lookup: sum -10m unaligned absolute match-names of rx units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface inbound dropped packets in the last 10 minutes - to: sysadmin + info: number of inbound dropped packets for the network interface in the last 10 minutes template: vsphere_outbound_packets_dropped on: vsphere.net_drops_total @@ -121,10 +110,7 @@ families: * lookup: sum -10m unaligned absolute match-names of tx units: packets every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - info: interface outbound dropped packets in the last 10 minutes - to: sysadmin + info: number of outbound dropped packets for the network interface in the last 10 minutes # Network drops ratio @@ -133,13 +119,12 @@ template: vsphere_inbound_packets_dropped_ratio hosts: * families: * lookup: sum -10m unaligned absolute match-names of rx - calc: (($vsphere_inbound_packets_dropped != nan AND $this > 0) ? 
($vsphere_inbound_packets_dropped * 100 / $this) : (0)) + calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of inbound dropped packets for the network interface over the last 10 minutes to: sysadmin template: vsphere_outbound_packets_dropped_ratio @@ -147,11 +132,10 @@ template: vsphere_outbound_packets_dropped_ratio hosts: * families: * lookup: sum -10m unaligned absolute match-names of tx - calc: (($vsphere_outbound_packets_dropped != nan AND $this > 0) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0)) + calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this >= 0.1 - crit: $this >= 2 - delay: down 1h multiplier 1.5 max 2h - info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + info: ratio of outbound dropped packets for the network interface over the last 10 minutes to: sysadmin diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf index 44de38a48..0b01990cb 100644 --- a/health/health.d/web_log.conf +++ b/health/health.d/web_log.conf @@ -31,7 +31,7 @@ families: * calc: ($this == 0)?(1):($this) units: requests every: 10s - info: the sum of all HTTP requests over the last minute + info: number of HTTP requests in the last minute template: 1m_successful on: web_log.response_statuses @@ -43,7 +43,7 @@ families: * warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? 
( 95 ) : ( 85 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute + info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401) to: webmaster template: 1m_redirects @@ -56,7 +56,7 @@ families: * warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP redirects (3xx except 304) over the last minute + info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster template: 1m_bad_requests @@ -69,7 +69,7 @@ families: * warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP bad requests (4xx except 401) over the last minute + info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster template: 1m_internal_errors @@ -82,7 +82,7 @@ families: * warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? 
( 2 ) : ( 5 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP internal server errors (5xx), over the last minute + info: ratio of server error HTTP requests over the last minute (5xx) to: webmaster # unmatched lines @@ -101,10 +101,10 @@ families: * calc: ($this == 0)?(1):($this) units: requests every: 10s - info: the sum of all HTTP requests over the last minute + info: number of HTTP requests over the last minute template: 1m_unmatched -on: web_log.response_codes + on: web_log.response_codes families: * lookup: sum -1m unaligned of unmatched calc: $this * 100 / $1m_total_requests @@ -112,7 +112,7 @@ families: * every: 10s warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 ) delay: up 1m down 5m multiplier 1.5 max 1h - info: the ratio of unmatched lines, over the last minute + info: percentage of unparsed log lines over the last minute to: webmaster # ----------------------------------------------------------------------------- @@ -131,7 +131,7 @@ families: * lookup: average -10m unaligned of avg units: ms every: 30s - info: the average time to respond to HTTP requests, over the last 10 minutes + info: average HTTP response time over the last 10 minutes template: web_slow on: web_log.response_time @@ -144,7 +144,7 @@ families: * warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 ) crit: ($1m_requests > 120) ? 
($this > $red && $this > ($10m_response_time * 4) ) : ( 0 ) delay: down 15m multiplier 1.5 max 1h - info: the average time to respond to HTTP requests, over the last 1 minute + info: average HTTP response time over the last minute options: no-clear-notification to: webmaster @@ -165,7 +165,7 @@ families: * lookup: average -5m at -5m unaligned of successful_requests units: requests/s every: 30s - info: average rate of successful HTTP requests over the last 5 minutes + info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago template: 5m_successful on: web_log.response_statuses @@ -173,7 +173,7 @@ families: * lookup: average -5m unaligned of successful_requests units: requests/s every: 30s - info: average successful HTTP requests over the last 5 minutes + info: average number of successful HTTP requests over the last 5 minutes template: 5m_requests_ratio on: web_log.response_codes @@ -185,7 +185,7 @@ families: * crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0) delay: down 15m multiplier 1.5 max 1h options: no-clear-notification - info: the percentage of successful web requests over the last 5 minutes, \ + info: ratio of successful HTTP requests over the last 5 minutes, \ compared with the previous 5 minutes \ (clear notification for this alarm will not be sent) to: webmaster @@ -224,7 +224,7 @@ families: * calc: ($this == 0)?(1):($this) units: requests every: 10s - info: the sum of all HTTP requests over the last minute + info: number of HTTP requests in the last minute template: web_log_1m_unmatched on: web_log.excluded_requests @@ -235,7 +235,7 @@ families: * every: 10s warn: ($web_log_1m_total_requests > 120) ? 
($this > 1) : ( 0 ) delay: up 1m down 5m multiplier 1.5 max 1h - info: the ratio of unmatched lines, over the last minute + info: percentage of unparsed log lines over the last minute to: webmaster # ----------------------------------------------------------------------------- @@ -255,7 +255,7 @@ families: * calc: ($this == 0)?(1):($this) units: requests every: 10s - info: the sum of all HTTP requests over the last minute + info: number of HTTP requests in the last minute template: web_log_1m_successful on: web_log.type_requests @@ -267,7 +267,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute + info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401) to: webmaster template: web_log_1m_redirects @@ -280,7 +280,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP redirects (3xx except 304) over the last minute + info: ratio of redirection HTTP requests over the last minute (3xx except 304) to: webmaster template: web_log_1m_bad_requests @@ -293,7 +293,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? 
( 30 ) : ( 50 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP bad requests (4xx except 401) over the last minute + info: ratio of client error HTTP requests over the last minute (4xx except 401) to: webmaster template: web_log_1m_internal_errors @@ -306,7 +306,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 ) delay: up 2m down 15m multiplier 1.5 max 1h - info: the ratio of HTTP internal server errors (5xx), over the last minute + info: ratio of server error HTTP requests over the last minute (5xx) to: webmaster # ----------------------------------------------------------------------------- @@ -325,7 +325,7 @@ families: * lookup: average -10m unaligned of avg units: ms every: 30s - info: the average time to respond to HTTP requests, over the last 10 minutes + info: average HTTP response time over the last 10 minutes template: web_log_web_slow on: web_log.request_processing_time @@ -338,7 +338,7 @@ families: * warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 ) crit: ($web_log_1m_requests > 120) ? 
($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 ) delay: down 15m multiplier 1.5 max 1h - info: the average time to respond to HTTP requests, over the last 1 minute + info: average HTTP response time over the last 1 minute options: no-clear-notification to: webmaster @@ -359,7 +359,7 @@ families: * lookup: average -5m at -5m unaligned of success units: requests/s every: 30s - info: average rate of successful HTTP requests over the last 5 minutes + info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago template: web_log_5m_successful on: web_log.type_requests @@ -367,7 +367,7 @@ families: * lookup: average -5m unaligned of success units: requests/s every: 30s - info: average successful HTTP requests over the last 5 minutes + info: average number of successful HTTP requests over the last 5 minutes template: web_log_5m_requests_ratio on: web_log.type_requests @@ -379,7 +379,7 @@ families: * crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0) delay: down 15m multiplier 1.5 max 1h options: no-clear-notification - info: the percentage of successful web requests over the last 5 minutes, \ + info: ratio of successful HTTP requests over over the last 5 minutes, \ compared with the previous 5 minutes \ (clear notification for this alarm will not be sent) to: webmaster diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf index 275e11dd9..36ae02fa2 100644 --- a/health/health.d/whoisquery.conf +++ b/health/health.d/whoisquery.conf @@ -20,5 +20,5 @@ template: whoisquery_days_until_expiration every: 60s warn: $this < $days_until_expiration_warning*24*60*60 crit: $this < $days_until_expiration_critical*24*60*60 - info: domain time until expiration + info: time until the domain name registration expires to: webmaster diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf index 0441fc1f3..f1f71a606 100644 --- a/health/health.d/wmi.conf +++ b/health/health.d/wmi.conf @@ 
-26,7 +26,7 @@ template: wmi_10min_cpu_usage warn: $this > (($status >= $WARNING) ? (75) : (85)) crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: cpu utilization for the last 10 minutes + info: average CPU utilization over the last 10 minutes to: sysadmin @@ -42,7 +42,7 @@ template: wmi_ram_in_use warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used RAM + info: memory utilization to: sysadmin template: wmi_swap_in_use @@ -55,13 +55,13 @@ template: wmi_swap_in_use warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used Swap + info: swap memory utilization to: sysadmin ## Network -template: inbound_packets_discarded +template: wmi_inbound_packets_discarded on: wmi.net_discarded os: linux hosts: * @@ -71,10 +71,10 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface inbound discarded packets in the last 10 minutes + info: number of inbound discarded packets for the network interface in the last 10 minutes to: sysadmin -template: outbound_packets_discarded +template: wmi_outbound_packets_discarded on: wmi.net_discarded os: linux hosts: * @@ -84,10 +84,10 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface outbound discarded packets in the last 10 minutes + info: number of outbound discarded packets for the network interface in the last 10 minutes to: sysadmin -template: inbound_packets_errors +template: wmi_inbound_packets_errors on: wmi.net_errors os: linux hosts: * @@ -97,10 +97,10 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface inbound errors in the last 10 minutes + info: number of inbound errors for the network interface in the last 10 minutes to: sysadmin -template: 
outbound_packets_errors +template: wmi_outbound_packets_errors on: wmi.net_errors os: linux hosts: * @@ -110,7 +110,7 @@ families: * every: 1m warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h - info: interface outbound errors in the last 10 minutes + info: number of outbound errors for the network interface in the last 10 minutes to: sysadmin @@ -126,5 +126,5 @@ template: wmi_disk_in_use warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: used disk space + info: disk space utilization to: sysadmin diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index dfca37706..f2e4a050d 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -20,7 +20,7 @@ template: x509check_days_until_expiration every: 60s warn: $this < $days_until_expiration_warning*24*60*60 crit: $this < $days_until_expiration_critical*24*60*60 - info: certificate time until expiration + info: time until x509 certificate expires to: webmaster template: x509check_revocation_status @@ -28,5 +28,5 @@ template: x509check_revocation_status calc: $revoked every: 60s crit: $this != nan AND $this != 0 - info: certificate revocation status + info: x509 certificate revocation status (0: valid, 1: revoked) to: webmaster diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf index af73824e6..74f96dd32 100644 --- a/health/health.d/zfs.conf +++ b/health/health.d/zfs.conf @@ -6,5 +6,5 @@ every: 1m warn: $this > 0 delay: down 1h multiplier 1.5 max 2h - info: the number of times ZFS had to limit the ARC growth in the last 10 minutes + info: number of times ZFS had to limit the ARC growth in the last 10 minutes to: sysadmin diff --git a/health/health.h b/health/health.h index 5281e16e3..07ce1311e 100644 --- a/health/health.h +++ b/health/health.h @@ -64,7 +64,7 @@ extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC * extern void
health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status); extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all); extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all); -extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after); +extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart); void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf); void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf); diff --git a/health/health_config.c b/health/health_config.c index 1acf36933..e24acf77c 100644 --- a/health/health_config.c +++ b/health/health_config.c @@ -384,7 +384,7 @@ static inline int health_parse_db_lookup( } // sane defaults - *every = abs(*after); + *every = ABS(*after); // now we may have optional parameters while(*s) { diff --git a/health/health_json.c b/health/health_json.c index 7b5a1e3cb..2a81d1c02 100644 --- a/health/health_json.c +++ b/health/health_json.c @@ -2,7 +2,7 @@ #include "health.h" -static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) { +void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) { if(value && *value) { buffer_sprintf(wb, "%s\"%s\":\"", prefix, label); buffer_strcat_htmlescape(wb, value); @@ -13,7 +13,7 @@ static inline void health_string2json(BUFFER *wb, const char *prefix, const char buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix); } -inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { +void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) { buffer_sprintf(wb, "\n\t{\n" "\t\t\"hostname\": \"%s\",\n" @@ -93,18 +93,22 @@ inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST buffer_strcat(wb, "\t}"); } -void health_alarm_log2json(RRDHOST *host, BUFFER *wb, 
uint32_t after) { +void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) { netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock); buffer_strcat(wb, "["); unsigned int max = host->health_log.max; unsigned int count = 0; + uint32_t hash_chart = 0; + if (chart) hash_chart = simple_hash(chart); ALARM_ENTRY *ae; - for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) { - if(ae->unique_id > after) { - if(likely(count)) buffer_strcat(wb, ","); + for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) { + if ((ae->unique_id > after) && (!chart || (ae->hash_chart == hash_chart && !strcmp(ae->chart, chart)))) { + if (likely(count)) + buffer_strcat(wb, ","); health_alarm_entry2json_nolock(wb, ae, host); + count++; } } @@ -298,6 +302,9 @@ static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, v if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec)) continue; + if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset))) + continue; + if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL))) continue; diff --git a/health/health_log.c b/health/health_log.c index 8c0bc5c34..3205f5920 100644 --- a/health/health_log.c +++ b/health/health_log.c @@ -213,8 +213,8 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) { ALARM_ENTRY *ae = NULL; - if(entries < 26) { - error("HEALTH [%s]: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", host->hostname, line, filename, entries); + if(entries < 27) { + error("HEALTH [%s]: line %zu of file '%s' should have at least 27 entries, but it has %d. 
Ignoring it.", host->hostname, line, filename, entries); errored++; continue; } @@ -243,7 +243,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name)); if (!rc) { for(rc = host->alarms; rc ; rc = rc->next) { - RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc); + RRDCALC *rdcmp = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl_t *)rc); if(rdcmp != rc) { error("Cannot insert the alarm index ID using log %s", rc->name); } diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in index 3bf8db5f6..bf6c02816 100755 --- a/health/notifications/alarm-notify.sh.in +++ b/health/notifications/alarm-notify.sh.in @@ -209,6 +209,9 @@ if [[ ${1} = "unittest" ]]; then cfgfile="${3}" # the location of the config file to use for unit testing status="${4}" # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL old_status="${5}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL +elif [[ ${1} = "dump_methods" ]]; then + dump_methods=1 + status="WARNING" else roles="${1}" # the roles that should be notified for this event args_host="${2}" # the host generated this event @@ -372,6 +375,7 @@ EMAIL_PLAINTEXT_ONLY= IRC_NICKNAME= IRC_REALNAME= IRC_NETWORK= +IRC_PORT=6667 # hangouts configs declare -A HANGOUTS_WEBHOOK_URI @@ -549,6 +553,15 @@ filter_recipient_by_criticality() { # check stackpulse [ -z "${STACKPULSE_WEBHOOK}" ] && SEND_STACKPULSE="NO" +# check msteam +[ -z "${MSTEAM_WEBHOOK_URL}" ] && SEND_MSTEAM="NO" + +# check pd +[ -z "${DEFAULT_RECIPIENT_PD}" ] && SEND_PD="NO" + +# check prowl +[ -z "${DEFAULT_RECIPIENT_PROWL}" ] && SEND_PROWL="NO" + if [ "${SEND_PUSHOVER}" = "YES" ] || [ "${SEND_SLACK}" = "YES" ] || [ "${SEND_ROCKETCHAT}" = "YES" ] || @@ -639,6 +652,15 @@ if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then fi fi +if [ 
${dump_methods} ]; then + for name in "${!SEND_@}"; do + if [ "${!name}" = "YES" ]; then + echo "$name" + fi + done + exit +fi + # ----------------------------------------------------------------------------- # find the recipients' addresses per method @@ -864,14 +886,15 @@ send_email() { echo >&2 "--- END sendmail command ---" fi - "${sendmail}" -t "${opts[@]}" + local cmd_output + cmd_output=$("${sendmail}" -t "${opts[@]}" 2>&1) ret=$? if [ ${ret} -eq 0 ]; then info "sent email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}'" return 0 else - error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret}." + error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret} (${cmd_output})." return 1 fi fi @@ -1722,9 +1745,9 @@ send_prowl() { # irc sender send_irc() { - local NICKNAME="${1}" REALNAME="${2}" CHANNELS="${3}" NETWORK="${4}" SERVERNAME="${5}" MESSAGE="${6}" sent=0 channel color send_alarm reply_codes error + local NICKNAME="${1}" REALNAME="${2}" CHANNELS="${3}" NETWORK="${4}" PORT="${5}" SERVERNAME="${6}" MESSAGE="${7}" sent=0 channel color send_alarm reply_codes error - if [ "${SEND_IRC}" = "YES" ] && [ -n "${NICKNAME}" ] && [ -n "${REALNAME}" ] && [ -n "${CHANNELS}" ] && [ -n "${NETWORK}" ] && [ -n "${SERVERNAME}" ]; then + if [ "${SEND_IRC}" = "YES" ] && [ -n "${NICKNAME}" ] && [ -n "${REALNAME}" ] && [ -n "${CHANNELS}" ] && [ -n "${NETWORK}" ] && [ -n "${SERVERNAME}" ] && [ -n "${PORT}" ]; then case "${status}" in WARNING) color="warning" ;; CRITICAL) color="danger" ;; @@ -1735,7 +1758,7 @@ send_irc() { SNDMESSAGE="${MESSAGE//$'\n'/", "}" for CHANNEL in ${CHANNELS}; do error=0 - send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | nc "${NETWORK}" 6667) + send_alarm=$(echo -e "USER ${NICKNAME} guest 
${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \ | nc "${NETWORK}" "${PORT}") reply_codes=$(echo "${send_alarm}" | cut -d ' ' -f 2 | grep -o '[0-9]*') for code in ${reply_codes}; do if [ "${code}" -ge 400 ] && [ "${code}" -le 599 ]; then @@ -2465,7 +2488,7 @@ SENT_PROWL=$? # ----------------------------------------------------------------------------- # send the irc message -send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm} +send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${IRC_PORT}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm} Severity: ${severity} Chart: ${chart} Family: ${family} diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md index 827a9c0be..ebd7f4b8c 100644 --- a/health/notifications/email/README.md +++ b/health/notifications/email/README.md @@ -43,7 +43,7 @@ You can always find the location of the alarm-notify.sh script in `netdata.conf` If you want an alternative to `sendmail` in order to have a simple MTA configuration for sending emails and auth to an existing SMTP server, you can do the following: - Install `msmtp`. -- Modify the `sendmail` path in `health_alarm_notify.conf` to point to the location of `mstmp`: +- Modify the `sendmail` path in `health_alarm_notify.conf` to point to the location of `msmtp`: ``` # The full path to the sendmail command. # If empty, the system $PATH will be searched for it. diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf index be669e135..2dab1d489 100755 --- a/health/notifications/health_alarm_notify.conf +++ b/health/notifications/health_alarm_notify.conf @@ -676,6 +676,10 @@ DEFAULT_RECIPIENT_IRC="" # e.g. "irc.freenode.net" IRC_NETWORK="" +# The irc port to which a connection will occur. +# e.g. 
6667 (the default one), 6697 (a TLS/SSL one) +IRC_PORT=6667 + # The irc nickname which is required to send the notification. It must not be # an already registered name as the connection's MODE is defined as a 'guest'. IRC_NICKNAME="" diff --git a/health/notifications/stackpulse/README.md b/health/notifications/stackpulse/README.md index 13d2f7235..4c44954ab 100644 --- a/health/notifications/stackpulse/README.md +++ b/health/notifications/stackpulse/README.md @@ -39,8 +39,9 @@ SEND_STACKPULSE="YES" STACKPULSE_WEBHOOK="https://hooks.stackpulse.io/v1/webhooks/YOUR_UNIQUE_ID" ``` -4. Now [restart Netdata](/docs/getting-started.md#start-stop-and-restart-netdata). When your node creates an alarm, you - can see the associated notification on your StackPulse Administration Portal +4. Now restart Netdata using `sudo systemctl restart netdata`, or the [appropriate + method](/docs/configure/start-stop-restart.md) for your system. When your node creates an alarm, you can see the + associated notification on your StackPulse Administration Portal ## React to alarms with playbooks -- cgit v1.2.3