diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-07-24 09:54:23 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-07-24 09:54:44 +0000 |
commit | 836b47cb7e99a977c5a23b059ca1d0b5065d310e (patch) | |
tree | 1604da8f482d02effa033c94a84be42bc0c848c3 /health/health.d | |
parent | Releasing debian version 1.44.3-2. (diff) | |
download | netdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.tar.xz netdata-836b47cb7e99a977c5a23b059ca1d0b5065d310e.zip |
Merging upstream version 1.46.3.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.d')
79 files changed, 0 insertions, 5137 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf deleted file mode 100644 index 1f1840491..000000000 --- a/health/health.d/adaptec_raid.conf +++ /dev/null @@ -1,32 +0,0 @@ - -# logical device status check - - template: adaptec_raid_ld_status - on: adaptec_raid.ld_status - class: Errors - type: System -component: RAID - lookup: max -10s foreach * - units: bool - every: 10s - crit: $this > 0 - delay: down 5m multiplier 1.5 max 1h - summary: Adaptec raid logical device status - info: Logical device status is failed or degraded - to: sysadmin - -# physical device state check - - template: adaptec_raid_pd_state - on: adaptec_raid.pd_state - class: Errors - type: System -component: RAID - lookup: max -10s foreach * - units: bool - every: 10s - crit: $this > 0 - delay: down 5m multiplier 1.5 max 1h - summary: Adaptec raid physical device state - info: Physical device state is not online - to: sysadmin diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf deleted file mode 100644 index 269ae544b..000000000 --- a/health/health.d/anomalies.conf +++ /dev/null @@ -1,23 +0,0 @@ -# raise a warning alarm if an anomaly probability is consistently above 50% - - template: anomalies_anomaly_probabilities - on: anomalies.probability - class: Errors - type: Netdata -component: ML - lookup: average -2m foreach * - every: 1m - warn: $this > 50 - info: average anomaly probability over the last 2 minutes - -# raise a warning alarm if an anomaly flag is consistently firing - - template: anomalies_anomaly_flags - on: anomalies.anomaly - class: Errors - type: Netdata -component: ML - lookup: sum -2m foreach * - every: 1m - warn: $this > 10 - info: number of anomalies in the last 2 minutes diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf deleted file mode 100644 index 90a72af19..000000000 --- a/health/health.d/apcupsd.conf +++ /dev/null @@ -1,125 +0,0 @@ -# you can disable an alarm notification by setting the 'to' 
line to: silent - - template: apcupsd_10min_ups_load - on: apcupsd.load - class: Utilization - type: Power Supply -component: UPS - os: * - hosts: * - lookup: average -10m unaligned of percentage - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (70) : (80)) - delay: down 10m multiplier 1.5 max 1h - summary: APC UPS load - info: APC UPS average load over the last 10 minutes - to: sitemgr - -# Discussion in https://github.com/netdata/netdata/pull/3928: -# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. - template: apcupsd_ups_charge - on: apcupsd.charge - class: Errors - type: Power Supply -component: UPS - os: * - hosts: * - lookup: average -60s unaligned of charge - units: % - every: 60s - warn: $this < 100 - crit: $this < 40 - delay: down 10m multiplier 1.5 max 1h - summary: APC UPS battery charge - info: APC UPS average battery charge over the last minute - to: sitemgr - - template: apcupsd_last_collected_secs - on: apcupsd.load - class: Latency - type: Power Supply -component: UPS device - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - summary: APC UPS last collection - info: APC UPS number of seconds since the last successful data collection - to: sitemgr - -#Send out a warning when SELFTEST code is BT or NG. Code descriptions can be found at: -#http://www.apcupsd.org/manual/#:~:text=or%20N/A.-,SELFTEST,-The%20results%20of - template: apcupsd_selftest_warning - on: apcupsd.selftest - lookup: max -1s unaligned match-names of BT,NG - units: status - every: 10s - warn: $this == 1 - delay: up 0 down 15m multiplier 1.5 max 1h - info: APC UPS self-test failed due to insufficient battery capacity or due to overload. 
- to: sitemgr - -#Send out a warning when STATUS code is ONBATT,OVERLOAD,LOWBATT,REPLACEBATT,NOBATT,COMMLOST -#https://man.archlinux.org/man/apcaccess.8.en#:~:text=apcupsd%20was%20started-,STATUS,-%3A%20UPS%20status.%20One - - template: apcupsd_status_onbatt - on: apcupsd.status - lookup: max -1s unaligned match-names of ONBATT - units: status - every: 10s - warn: $this == 1 - delay: up 1m down 15m multiplier 1.5 max 1h - info: APC UPS has switched to battery power because the input power has failed - to: sitemgr - - template: apcupsd_status_overload - on: apcupsd.status - lookup: max -1s unaligned match-names of OVERLOAD - units: status - every: 10s - warn: $this == 1 - delay: up 0 down 15m multiplier 1.5 max 1h - info: APC UPS is overloaded and cannot supply enough power to the load - to: sitemgr - - template: apcupsd_status_lowbatt - on: apcupsd.status - lookup: max -1s unaligned match-names of LOWBATT - units: status - every: 10s - warn: $this == 1 - delay: up 0 down 15m multiplier 1.5 max 1h - info: APC UPS battery is low and needs to be recharged - to: sitemgr - - template: apcupsd_status_replacebatt - on: apcupsd.status - lookup: max -1s unaligned match-names of REPLACEBATT - units: status - every: 10s - warn: $this == 1 - delay: up 0 down 15m multiplier 1.5 max 1h - info: APC UPS battery has reached the end of its lifespan and needs to be replaced - to: sitemgr - - template: apcupsd_status_nobatt - on: apcupsd.status - lookup: max -1s unaligned match-names of NOBATT - units: status - every: 10s - warn: $this == 1 - delay: up 0 down 15m multiplier 1.5 max 1h - info: APC UPS has no battery - to: sitemgr - - template: apcupsd_status_commlost - on: apcupsd.status - lookup: max -1s unaligned match-names of COMMLOST - units: status - every: 10s - warn: $this == 1 - delay: up 0 down 15m multiplier 1.5 max 1h - info: APC UPS communication link is lost - to: sitemgr diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf deleted file mode 100644 index 
446173428..000000000 --- a/health/health.d/bcache.conf +++ /dev/null @@ -1,31 +0,0 @@ - - template: bcache_cache_errors - on: disk.bcache_cache_read_races - class: Errors - type: System -component: Disk - lookup: sum -1m unaligned absolute - units: errors - every: 1m - warn: $this > 0 - delay: up 2m down 1h multiplier 1.5 max 2h - summary: Bcache cache read race errors - info: Number of times data was read from the cache, \ - the bucket was reused and invalidated in the last 10 minutes \ - (when this occurs the data is reread from the backing device) - to: silent - - template: bcache_cache_dirty - on: disk.bcache_cache_alloc - class: Utilization - type: System -component: Disk - calc: $dirty + $metadata + $undefined - units: % - every: 1m - warn: $this > 75 - delay: up 1m down 1h multiplier 1.5 max 2h - summary: Bcache cache used space - info: Percentage of cache space used for dirty data and metadata \ - (this usually means your SSD cache is too small) - to: silent diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf deleted file mode 100644 index 0d37f28e0..000000000 --- a/health/health.d/beanstalkd.conf +++ /dev/null @@ -1,41 +0,0 @@ -# get the number of buried jobs in all queues - - template: beanstalk_server_buried_jobs - on: beanstalk.current_jobs - class: Workload - type: Messaging -component: Beanstalk - calc: $buried - units: jobs - every: 10s - warn: $this > 3 - delay: up 0 down 5m multiplier 1.2 max 1h - summary: Beanstalk buried jobs - info: Number of buried jobs across all tubes. \ - You need to manually kick them so they can be processed. \ - Presence of buried jobs in a tube does not affect new jobs. 
- to: sysadmin - -# get the number of buried jobs per queue - -#template: beanstalk_tube_buried_jobs -# on: beanstalk.jobs -# calc: $buried -# units: jobs -# every: 10s -# warn: $this > 0 -# crit: $this > 10 -# delay: up 0 down 5m multiplier 1.2 max 1h -# info: the number of jobs buried per tube -# to: sysadmin - -# get the current number of tubes - -#template: beanstalk_number_of_tubes -# on: beanstalk.current_tubes -# calc: $tubes -# every: 10s -# warn: $this < 5 -# delay: up 0 down 5m multiplier 1.2 max 1h -# info: the current number of tubes on the server -# to: sysadmin diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf deleted file mode 100644 index b1c271df9..000000000 --- a/health/health.d/bind_rndc.conf +++ /dev/null @@ -1,12 +0,0 @@ - template: bind_rndc_stats_file_size - on: bind_rndc.stats_size - class: Utilization - type: DNS -component: BIND - units: megabytes - every: 60 - calc: $stats_size - warn: $this > 512 - summary: BIND statistics file size - info: BIND statistics-file size - to: sysadmin diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf deleted file mode 100644 index 092a56845..000000000 --- a/health/health.d/boinc.conf +++ /dev/null @@ -1,70 +0,0 @@ -# Alarms for various BOINC issues. - -# Warn on any compute errors encountered. 
- template: boinc_compute_errors - on: boinc.states - class: Errors - type: Computing -component: BOINC - os: * - hosts: * - lookup: average -10m unaligned of comperror - units: tasks - every: 1m - warn: $this > 0 - delay: up 1m down 5m multiplier 1.5 max 1h - summary: BOINC compute errors - info: Average number of compute errors over the last 10 minutes - to: sysadmin - -# Warn on lots of upload errors - template: boinc_upload_errors - on: boinc.states - class: Errors - type: Computing -component: BOINC - os: * - hosts: * - lookup: average -10m unaligned of upload_failed - units: tasks - every: 1m - warn: $this > 0 - delay: up 1m down 5m multiplier 1.5 max 1h - summary: BOINC failed uploads - info: Average number of failed uploads over the last 10 minutes - to: sysadmin - -# Warn on the task queue being empty - template: boinc_total_tasks - on: boinc.tasks - class: Utilization - type: Computing -component: BOINC - os: * - hosts: * - lookup: average -10m unaligned of total - units: tasks - every: 1m - warn: $this < 1 - delay: up 5m down 10m multiplier 1.5 max 1h - summary: BOINC total tasks - info: Average number of total tasks over the last 10 minutes - to: sysadmin - -# Warn on no active tasks with a non-empty queue - template: boinc_active_tasks - on: boinc.tasks - class: Utilization - type: Computing -component: BOINC - os: * - hosts: * - lookup: average -10m unaligned of active - calc: ($boinc_total_tasks >= 1) ? 
($this) : (inf) - units: tasks - every: 1m - warn: $this < 1 - delay: up 5m down 10m multiplier 1.5 max 1h - summary: BOINC active tasks - info: Average number of active tasks over the last 10 minutes - to: sysadmin diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf deleted file mode 100644 index 1557a5941..000000000 --- a/health/health.d/btrfs.conf +++ /dev/null @@ -1,142 +0,0 @@ - - template: btrfs_allocated - on: btrfs.disk - class: Utilization - type: System -component: File system - os: * - hosts: * - calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) - units: % - every: 10s - warn: $this > (($status == $CRITICAL) ? (95) : (98)) - delay: up 1m down 15m multiplier 1.5 max 1h - summary: BTRFS allocated space utilization - info: Percentage of allocated BTRFS physical disk space - to: silent - - template: btrfs_data - on: btrfs.data - class: Utilization - type: System -component: File system - os: * - hosts: * - calc: $used * 100 / ($used + $free) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 - crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: BTRFS data space utilization - info: Utilization of BTRFS data space - to: sysadmin - - template: btrfs_metadata - on: btrfs.metadata - class: Utilization - type: System -component: File system - os: * - hosts: * - calc: ($used + $reserved) * 100 / ($used + $free + $reserved) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 - crit: $this > (($status == $CRITICAL) ? 
(95) : (98)) && $btrfs_allocated > 98 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: BTRFS metadata space utilization - info: Utilization of BTRFS metadata space - to: sysadmin - - template: btrfs_system - on: btrfs.system - class: Utilization - type: System -component: File system - os: * - hosts: * - calc: $used * 100 / ($used + $free) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 - crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: BTRFS system space utilization - info: Utilization of BTRFS system space - to: sysadmin - - template: btrfs_device_read_errors - on: btrfs.device_errors - class: Errors - type: System -component: File system - os: * - hosts: * - units: errors - lookup: max -10m every 1m of read_errs - warn: $this > 0 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: BTRFS device read errors - info: Number of encountered BTRFS read errors - to: sysadmin - - template: btrfs_device_write_errors - on: btrfs.device_errors - class: Errors - type: System -component: File system - os: * - hosts: * - units: errors - lookup: max -10m every 1m of write_errs - crit: $this > 0 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: BTRFS device write errors - info: Number of encountered BTRFS write errors - to: sysadmin - - template: btrfs_device_flush_errors - on: btrfs.device_errors - class: Errors - type: System -component: File system - os: * - hosts: * - units: errors - lookup: max -10m every 1m of flush_errs - crit: $this > 0 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: BTRFS device flush errors - info: Number of encountered BTRFS flush errors - to: sysadmin - - template: btrfs_device_corruption_errors - on: btrfs.device_errors - class: Errors - type: System -component: File system - os: * - hosts: * - units: errors - lookup: max -10m every 1m of corruption_errs - warn: $this > 0 - delay: 
up 1m down 15m multiplier 1.5 max 1h - summary: BTRFS device corruption errors - info: Number of encountered BTRFS corruption errors - to: sysadmin - - template: btrfs_device_generation_errors - on: btrfs.device_errors - class: Errors - type: System -component: File system - os: * - hosts: * - units: errors - lookup: max -10m every 1m of generation_errs - warn: $this > 0 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: BTRFS device generation errors - info: Number of encountered BTRFS generation errors - to: sysadmin diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf deleted file mode 100644 index 44d351338..000000000 --- a/health/health.d/ceph.conf +++ /dev/null @@ -1,16 +0,0 @@ -# low ceph disk available - - template: ceph_cluster_space_usage - on: ceph.general_usage - class: Utilization - type: Storage -component: Ceph - calc: $used * 100 / ($used + $avail) - units: % - every: 1m - warn: $this > (($status >= $WARNING ) ? (85) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 5m multiplier 1.2 max 1h - summary: Ceph cluster disk space utilization - info: Ceph cluster disk space utilization - to: sysadmin diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf deleted file mode 100644 index 9c55633ef..000000000 --- a/health/health.d/cgroups.conf +++ /dev/null @@ -1,72 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - template: cgroup_10min_cpu_usage - on: cgroup.cpu_limit - class: Utilization - type: Cgroups -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > (($status == $CRITICAL) ? 
(85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: Cgroup ${label:cgroup_name} CPU utilization - info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes - to: silent - - template: cgroup_ram_in_use - on: cgroup.mem_usage - class: Utilization - type: Cgroups -component: Memory - os: linux - hosts: * - calc: ($ram) * 100 / $memory_limit - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Cgroup ${label:cgroup_name} memory utilization - info: Cgroup ${label:cgroup_name} memory utilization - to: silent - -# ---------------------------------K8s containers-------------------------------------------- - - template: k8s_cgroup_10min_cpu_usage - on: k8s.cgroup.cpu_limit - class: Utilization - type: Cgroups -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - delay: down 15m multiplier 1.5 max 1h - summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization - info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ - average CPU utilization over the last 10 minutes - to: silent - - template: k8s_cgroup_ram_in_use - on: k8s.cgroup.mem_usage - class: Utilization - type: Cgroups -component: Memory - os: linux - hosts: * - calc: ($ram) * 100 / $memory_limit - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization - info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ - memory utilization - to: silent diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf deleted file mode 100644 index 60f178354..000000000 --- a/health/health.d/cockroachdb.conf +++ /dev/null @@ -1,78 +0,0 @@ - -# Capacity - - template: cockroachdb_used_storage_capacity - on: cockroachdb.storage_used_capacity_percentage - class: Utilization - type: Database -component: CockroachDB - calc: $total - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: CockroachDB storage space utilization - info: Storage capacity utilization - to: dba - - template: cockroachdb_used_usable_storage_capacity - on: cockroachdb.storage_used_capacity_percentage - class: Utilization - type: Database -component: CockroachDB - calc: $usable - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (85)) - crit: $this > (($status == $CRITICAL) ? 
(85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: CockroachDB usable storage space utilization - info: Storage usable space utilization - to: dba - -# Replication - - template: cockroachdb_unavailable_ranges - on: cockroachdb.ranges_replication_problem - class: Errors - type: Database -component: CockroachDB - calc: $unavailable - units: num - every: 10s - warn: $this > 0 - delay: down 15m multiplier 1.5 max 1h - summary: CockroachDB unavailable replication - info: Number of ranges with fewer live replicas than needed for quorum - to: dba - - template: cockroachdb_underreplicated_ranges - on: cockroachdb.ranges_replication_problem - class: Errors - type: Database -component: CockroachDB - calc: $under_replicated - units: num - every: 10s - warn: $this > 0 - delay: down 15m multiplier 1.5 max 1h - summary: CockroachDB under-replicated - info: Number of ranges with fewer live replicas than the replication target - to: dba - -# FD - - template: cockroachdb_open_file_descriptors_limit - on: cockroachdb.process_file_descriptors - class: Utilization - type: Database -component: CockroachDB - calc: $open/$sys_fd_softlimit * 100 - units: % - every: 10s - warn: $this > 80 - delay: down 15m multiplier 1.5 max 1h - summary: CockroachDB file descriptors utilization - info: Open file descriptors utilization (against softlimit) - to: dba diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf deleted file mode 100644 index 8b414a26d..000000000 --- a/health/health.d/consul.conf +++ /dev/null @@ -1,171 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: consul_license_expiration_time - on: consul.license_expiration_time - class: Errors - type: ServiceMesh -component: Consul - calc: $license_expiration - every: 60m - units: seconds - warn: $this < 14*24*60*60 - crit: $this < 7*24*60*60 - summary: Consul license expiration on ${label:node_name} - info: Consul Enterprise license expiration time on node 
${label:node_name} datacenter ${label:datacenter} - to: sysadmin - - template: consul_autopilot_health_status - on: consul.autopilot_health_status - class: Errors - type: ServiceMesh -component: Consul - calc: $unhealthy - every: 10s - units: status - warn: $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: Consul datacenter ${label:datacenter} health - info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name} - to: sysadmin - - template: consul_autopilot_server_health_status - on: consul.autopilot_server_health_status - class: Errors - type: ServiceMesh -component: Consul - calc: $unhealthy - every: 10s - units: status - warn: $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: Consul server ${label:node_name} health - info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy - to: sysadmin - - template: consul_raft_leader_last_contact_time - on: consul.raft_leader_last_contact_time - class: Errors - type: ServiceMesh -component: Consul - lookup: average -1m unaligned of quantile_0.5 - every: 10s - units: milliseconds - warn: $this > (($status >= $WARNING) ? (150) : (200)) - crit: $this > (($status == $CRITICAL) ? 
(200) : (500)) - delay: down 5m multiplier 1.5 max 1h - summary: Consul leader server ${label:node_name} last contact time - info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes - to: sysadmin - - template: consul_raft_leadership_transitions - on: consul.raft_leadership_transitions_rate - class: Errors - type: ServiceMesh -component: Consul - lookup: sum -1m unaligned - every: 10s - units: transitions - warn: $this > 0 - delay: down 5m multiplier 1.5 max 1h - summary: Consul server ${label:node_name} leadership transitions - info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader - to: sysadmin - - template: consul_raft_thread_main_saturation - on: consul.raft_thread_main_saturation_perc - class: Utilization - type: ServiceMesh -component: Consul - lookup: average -1m unaligned of quantile_0.9 - every: 10s - units: percentage - warn: $this > (($status >= $WARNING) ? (40) : (50)) - delay: down 5m multiplier 1.5 max 1h - summary: Consul server ${label:node_name} main Raft saturation - info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter} - to: sysadmin - - template: consul_raft_thread_fsm_saturation - on: consul.raft_thread_fsm_saturation_perc - class: Utilization - type: ServiceMesh -component: Consul - lookup: average -1m unaligned of quantile_0.9 - every: 10s - units: milliseconds - warn: $this > (($status >= $WARNING) ? 
(40) : (50)) - delay: down 5m multiplier 1.5 max 1h - summary: Consul server ${label:node_name} FSM Raft saturation - info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter} - to: sysadmin - - template: consul_client_rpc_requests_exceeded - on: consul.client_rpc_requests_exceeded_rate - class: Errors - type: ServiceMesh -component: Consul - lookup: sum -1m unaligned - every: 10s - units: requests - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 1h - summary: Consul server ${label:node_name} RPC requests rate - info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter} - to: sysadmin - - template: consul_client_rpc_requests_failed - on: consul.client_rpc_requests_failed_rate - class: Errors - type: ServiceMesh -component: Consul - lookup: sum -1m unaligned - every: 10s - units: requests - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: down 5m multiplier 1.5 max 1h - summary: Consul server ${label:node_name} failed RPC requests - info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter} - to: sysadmin - - template: consul_node_health_check_status - on: consul.node_health_check_status - class: Errors - type: ServiceMesh -component: Consul - calc: $warning + $critical - every: 10s - units: status - warn: $this != nan AND $this != 0 - delay: down 5m multiplier 1.5 max 1h - summary: Consul node health check ${label:check_name} on ${label:node_name} - info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter} - to: sysadmin - - template: consul_service_health_check_status - on: consul.service_health_check_status - class: Errors - type: ServiceMesh -component: Consul - calc: $warning + $critical - every: 10s - units: status - warn: $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: Consul service health check 
${label:check_name} service ${label:service_name} node ${label:node_name} - info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter} - to: sysadmin - - template: consul_gc_pause_time - on: consul.gc_pause_time - class: Errors - type: ServiceMesh -component: Consul - lookup: sum -1m unaligned - every: 10s - units: seconds - warn: $this > (($status >= $WARNING) ? (1) : (2)) - crit: $this > (($status >= $WARNING) ? (2) : (5)) - delay: down 5m multiplier 1.5 max 1h - summary: Consul server ${label:node_name} garbage collection pauses - info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter} - to: sysadmin diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf deleted file mode 100644 index 0b007d6b4..000000000 --- a/health/health.d/cpu.conf +++ /dev/null @@ -1,69 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - template: 10min_cpu_usage - on: system.cpu - class: Utilization - type: System -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned of user,system,softirq,irq,guest - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: System CPU utilization - info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) - to: silent - - template: 10min_cpu_iowait - on: system.cpu - class: Utilization - type: System -component: CPU - os: linux - hosts: * - lookup: average -10m unaligned of iowait - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? 
(20) : (40)) - delay: up 30m down 30m multiplier 1.5 max 2h - summary: System CPU iowait time - info: Average CPU iowait time over the last 10 minutes - to: silent - - template: 20min_steal_cpu - on: system.cpu - class: Latency - type: System -component: CPU - os: linux - hosts: * - lookup: average -20m unaligned of steal - units: % - every: 5m - warn: $this > (($status >= $WARNING) ? (5) : (10)) - delay: down 1h multiplier 1.5 max 2h - summary: System CPU steal time - info: Average CPU steal time over the last 20 minutes - to: silent - -## FreeBSD - template: 10min_cpu_usage - on: system.cpu - class: Utilization - type: System -component: CPU - os: freebsd - hosts: * - lookup: average -10m unaligned of user,system,interrupt - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: System CPU utilization - info: Average CPU utilization over the last 10 minutes (excluding nice) - to: silent diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf deleted file mode 100644 index 0a70d2e8f..000000000 --- a/health/health.d/dbengine.conf +++ /dev/null @@ -1,68 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: 10min_dbengine_global_fs_errors - on: netdata.dbengine_global_errors - class: Errors - type: Netdata -component: DB engine - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of fs_errors - units: errors - every: 10s - crit: $this > 0 - delay: down 15m multiplier 1.5 max 1h - summary: Netdata DBengine filesystem errors - info: Number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc) - to: sysadmin - - alarm: 10min_dbengine_global_io_errors - on: netdata.dbengine_global_errors - class: Errors - type: Netdata -component: DB engine - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of io_errors - units: errors - 
every: 10s - crit: $this > 0 - delay: down 1h multiplier 1.5 max 3h - summary: Netdata DBengine IO errors - info: Number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc) - to: sysadmin - - alarm: 10min_dbengine_global_flushing_warnings - on: netdata.dbengine_global_errors - class: Errors - type: Netdata -component: DB engine - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of pg_cache_over_half_dirty_events - units: errors - every: 10s - warn: $this > 0 - delay: down 1h multiplier 1.5 max 3h - summary: Netdata DBengine global flushing warnings - info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \ - Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks. - to: sysadmin - - alarm: 10min_dbengine_global_flushing_errors - on: netdata.dbengine_long_term_page_stats - class: Errors - type: Netdata -component: DB engine - os: linux freebsd macos - hosts: * - lookup: sum -10m unaligned of flushing_pressure_deletions - units: pages - every: 10s - crit: $this != 0 - delay: down 1h multiplier 1.5 max 3h - summary: Netdata DBengine global flushing errors - info: Number of pages deleted due to failure to flush data to disk in the last 10 minutes. \ - Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks. 
- to: sysadmin diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf deleted file mode 100644 index 2e417fd4a..000000000 --- a/health/health.d/disks.conf +++ /dev/null @@ -1,172 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - -# ----------------------------------------------------------------------------- -# low disk space - -# checking the latest collected values -# raise an alarm if the disk is low on -# available disk space - - template: disk_space_usage - on: disk.space - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * -chart labels: mount_point=!/dev !/dev/* !/run !/run/* * - calc: $used * 100 / ($avail + $used) - units: % - every: 1m - warn: $this > (($status >= $WARNING ) ? (80) : (90)) - crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 - delay: up 1m down 15m multiplier 1.5 max 1h - summary: Disk ${label:mount_point} space usage - info: Total space utilization of disk ${label:mount_point} - to: sysadmin - - template: disk_inode_usage - on: disk.inodes - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * -chart labels: mount_point=!/dev !/dev/* !/run !/run/* * - calc: $used * 100 / ($avail + $used) - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) - delay: up 1m down 15m multiplier 1.5 max 1h - summary: Disk ${label:mount_point} inode usage - info: Total inode utilization of disk ${label:mount_point} - to: sysadmin - - -# ----------------------------------------------------------------------------- -# disk fill rate - -# calculate the rate the disk fills -# use as base, the available space change -# during the last hour - -# this is just a calculation - it has no alarm -# we will use it in the next template to find -# the hours remaining - -template: disk_fill_rate - on: disk.space - os: linux freebsd - hosts: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: GB/hour - info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour - -# calculate the hours remaining -# if the disk continues to fill -# in this rate - -template: out_of_disk_space_time - on: disk.space - os: linux freebsd - hosts: * - calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:mount_point} estimation of lack of space - info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour - to: silent - - -# ----------------------------------------------------------------------------- -# disk inode fill rate - -# calculate the rate the disk inodes are allocated -# use as base, the available inodes change -# during the last hour - -# this is just a calculation - it has no alarm -# we will use it in the next template to find -# the hours remaining - -template: disk_inode_rate - on: disk.inodes - os: linux freebsd - hosts: * - lookup: min -10m at -50m unaligned of avail - calc: ($this - $avail) / (($now - $after) / 3600) - every: 1m - units: inodes/hour - info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour - -# calculate the hours remaining -# if the disk inodes are allocated -# in this rate - -template: out_of_disk_inodes_time - on: disk.inodes - os: linux freebsd - hosts: * - calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:mount_point} estimation of lack of inodes - info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour - to: silent - - -# ----------------------------------------------------------------------------- -# disk congestion - -# raise an alarm if the disk is congested -# by calculating the average disk utilization -# for the last 10 minutes - - template: 10min_disk_utilization - on: disk.util - class: Utilization - type: System -component: Disk - os: linux freebsd - hosts: * - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:device} utilization - info: Average percentage of time ${label:device} disk was busy over the last 10 minutes - to: silent - - -# raise an alarm if the disk backlog -# is above 1000ms (1s) per second -# for 10 minutes -# (i.e. the disk cannot catch up) - - template: 10min_disk_backlog - on: disk.backlog - class: Latency - type: System -component: Disk - os: linux - hosts: * - lookup: average -10m unaligned - units: ms - every: 1m - warn: $this > 5000 * (($status >= $WARNING) ? 
(0.7) : (1)) - delay: down 15m multiplier 1.2 max 1h - summary: Disk ${label:device} backlog - info: Average backlog size of the ${label:device} disk over the last 10 minutes - to: silent diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf deleted file mode 100644 index 756c6a1b6..000000000 --- a/health/health.d/dns_query.conf +++ /dev/null @@ -1,15 +0,0 @@ -# detect dns query failure - - template: dns_query_query_status - on: dns_query.query_status - class: Errors - type: DNS -component: DNS - calc: $success - units: status - every: 10s - warn: $this != nan && $this != 1 - delay: up 30s down 5m multiplier 1.5 max 1h - summary: DNS query unsuccessful requests to ${label:server} - info: DNS request type ${label:record_type} to server ${label:server} is unsuccessful - to: sysadmin diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf deleted file mode 100644 index f6ef01940..000000000 --- a/health/health.d/dnsmasq_dhcp.conf +++ /dev/null @@ -1,15 +0,0 @@ -# dhcp-range utilization - - template: dnsmasq_dhcp_dhcp_range_utilization - on: dnsmasq_dhcp.dhcp_range_utilization - class: Utilization - type: DHCP -component: Dnsmasq - every: 10s - units: % - calc: $used - warn: $this > ( ($status >= $WARNING ) ? 
( 80 ) : ( 90 ) ) - delay: down 5m - summary: Dnsmasq DHCP range ${label:dhcp_range} utilization - info: DHCP range ${label:dhcp_range} utilization - to: sysadmin diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf deleted file mode 100644 index 668614d4d..000000000 --- a/health/health.d/docker.conf +++ /dev/null @@ -1,12 +0,0 @@ - template: docker_container_unhealthy - on: docker.container_health_status - class: Errors - type: Containers -component: Docker - units: status - every: 10s - lookup: average -10s of unhealthy - warn: $this > 0 - summary: Docker container ${label:container_name} health - info: ${label:container_name} docker container health status is unhealthy - to: sysadmin diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf deleted file mode 100644 index 600840c58..000000000 --- a/health/health.d/elasticsearch.conf +++ /dev/null @@ -1,78 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -# 'red' is a threshold, can't lookup the 'red' dimension - using simple pattern is a workaround. - - template: elasticsearch_cluster_health_status_red - on: elasticsearch.cluster_health_status - class: Errors - type: SearchEngine -component: Elasticsearch - lookup: average -5s unaligned of *ed - every: 10s - units: status - crit: $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: Elasticsearch cluster ${label:cluster_name} status - info: Elasticsearch cluster ${label:cluster_name} health status is red. - to: sysadmin - -# the idea of '-10m' is to handle yellow status after node restart, -# (usually) no action is required because Elasticsearch will automatically restore the green status. 
- template: elasticsearch_cluster_health_status_yellow - on: elasticsearch.cluster_health_status - class: Errors - type: SearchEngine -component: Elasticsearch - lookup: average -10m unaligned of yellow - every: 1m - units: status - warn: $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: Elasticsearch cluster ${label:cluster_name} status - info: Elasticsearch cluster ${label:cluster_name} health status is yellow. - to: sysadmin - - template: elasticsearch_node_index_health_red - on: elasticsearch.node_index_health - class: Errors - type: SearchEngine -component: Elasticsearch - lookup: average -5s unaligned of *ed - every: 10s - units: status - warn: $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: Elasticsearch cluster ${label:cluster_name} index ${label:index} status - info: Elasticsearch cluster ${label:cluster_name} index ${label:index} health status is red. - to: sysadmin - -# don't convert 'lookup' value to seconds in 'calc' due to UI showing seconds as hh:mm:ss (0 as now). - - template: elasticsearch_node_indices_search_time_query - on: elasticsearch.node_indices_search_time - class: Workload - type: SearchEngine -component: Elasticsearch - lookup: average -10m unaligned of query - every: 10s - units: milliseconds - warn: $this > (($status >= $WARNING) ? (20 * 1000) : (30 * 1000)) - delay: down 5m multiplier 1.5 max 1h - summary: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} query performance - info: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} search performance is degraded, queries run slowly. - to: sysadmin - - template: elasticsearch_node_indices_search_time_fetch - on: elasticsearch.node_indices_search_time - class: Workload - type: SearchEngine -component: Elasticsearch - lookup: average -10m unaligned of fetch - every: 10s - units: milliseconds - warn: $this > (($status >= $WARNING) ? (3 * 1000) : (5 * 1000)) - crit: $this > (($status == $CRITICAL) ? 
(5 * 1000) : (30 * 1000)) - delay: down 5m multiplier 1.5 max 1h - summary: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} fetch performance - info: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} search performance is degraded, fetches run slowly. - to: sysadmin diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf deleted file mode 100644 index be8b1fe4f..000000000 --- a/health/health.d/entropy.conf +++ /dev/null @@ -1,20 +0,0 @@ - -# check if entropy is too low -# the alarm is checked every 5 minutes -# and examines the last 5 minutes of data - - alarm: lowest_entropy - on: system.entropy - class: Utilization - type: System -component: Cryptography - os: linux - hosts: * - lookup: min -5m unaligned - units: entries - every: 5m - warn: $this < (($status >= $WARNING) ? (200) : (100)) - delay: down 1h multiplier 1.5 max 2h - summary: System entropy pool number of entries - info: Minimum number of entries in the random numbers pool in the last 5 minutes - to: silent diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf deleted file mode 100644 index c0320193c..000000000 --- a/health/health.d/exporting.conf +++ /dev/null @@ -1,29 +0,0 @@ - - template: exporting_last_buffering - on: netdata.exporting_data_size - class: Latency - type: Netdata -component: Exporting engine - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - summary: Netdata exporting data last successful buffering - info: Number of seconds since the last successful buffering of exporting data - to: dba - - template: exporting_metrics_sent - on: netdata.exporting_data_size - class: Workload - type: Netdata -component: Exporting engine - units: % - calc: abs($sent) * 100 / abs($buffered) - every: 10s - warn: $this != 100 - delay: down 5m multiplier 1.5 max 1h - summary: Netdata exporting metrics sent - info: Percentage of metrics sent to the external database server - to: dba diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf deleted file mode 100644 index 20a592d6b..000000000 --- a/health/health.d/file_descriptors.conf +++ /dev/null @@ -1,33 +0,0 @@ - # you can disable an alarm notification by setting the 'to' line to: silent - - template: system_file_descriptors_utilization - on: system.file_nr_utilization - class: Utilization - type: System - component: Processes - hosts: * - lookup: max -1m unaligned - units: % - every: 1m - crit: $this > 90 - delay: down 15m multiplier 1.5 max 1h - summary: System open file descriptors utilization - info: System-wide utilization of open files - to: sysadmin - - template: apps_group_file_descriptors_utilization - on: app.fds_open_limit - class: Utilization - type: System -component: Process - os: linux - module: * - hosts: * - lookup: max -10s unaligned foreach * - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? 
(85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: App group ${label:app_group} file descriptors utilization - info: Open files percentage against the processes limits, among all PIDs in application group - to: sysadmin diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf deleted file mode 100644 index 78e1165d1..000000000 --- a/health/health.d/gearman.conf +++ /dev/null @@ -1,14 +0,0 @@ - - template: gearman_workers_queued - on: gearman.single_job - class: Latency - type: Computing -component: Gearman - lookup: average -10m unaligned match-names of Pending - units: workers - every: 10s - warn: $this > 30000 - delay: down 5m multiplier 1.5 max 1h - summary: Gearman queued jobs - info: Average number of queued jobs over the last 10 minutes - to: sysadmin diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf deleted file mode 100644 index 361b6b41f..000000000 --- a/health/health.d/geth.conf +++ /dev/null @@ -1,11 +0,0 @@ -#chainhead_header is expected momentarily to be ahead. If it is considerably ahead (e.g. more than 5 blocks), then the node is definitely out of sync. - template: geth_chainhead_diff_between_header_block - on: geth.chainhead - class: Workload - type: ethereum_node -component: geth - every: 10s - calc: $chain_head_block - $chain_head_header - units: blocks - warn: $this != 0 - delay: down 1m multiplier 1.5 max 1h diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf deleted file mode 100644 index 7796a1bc8..000000000 --- a/health/health.d/go.d.plugin.conf +++ /dev/null @@ -1,18 +0,0 @@ - -# make sure go.d.plugin data collection job is running - - template: go.d_job_last_collected_secs - on: netdata.go_plugin_execution_time - class: Errors - type: Netdata -component: go.d.plugin - module: !* * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - summary: Go.d plugin last collection - info: Number of seconds since the last successful data collection - to: webmaster diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf deleted file mode 100644 index 66a488fa4..000000000 --- a/health/health.d/haproxy.conf +++ /dev/null @@ -1,25 +0,0 @@ - template: haproxy_backend_server_status - on: haproxy_hs.down - class: Errors - type: Web Proxy -component: HAProxy - units: failed servers - every: 10s - lookup: average -10s - crit: $this > 0 - summary: HAProxy server status - info: Average number of failed haproxy backend servers over the last 10 seconds - to: sysadmin - - template: haproxy_backend_status - on: haproxy_hb.down - class: Errors - type: Web Proxy -component: HAProxy - units: failed backend - every: 10s - lookup: average -10s - crit: $this > 0 - summary: HAProxy backend status - info: Average number of failed haproxy backends over the last 10 seconds - to: sysadmin diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf deleted file mode 100644 index 566e815aa..000000000 --- a/health/health.d/hdfs.conf +++ /dev/null @@ -1,81 +0,0 @@ - -# Common - - template: hdfs_capacity_usage - on: hdfs.capacity - class: Utilization - type: Storage -component: HDFS - calc: ($used) * 100 / ($used + $remaining) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? 
(80) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: HDFS datanodes space utilization - info: summary datanodes space capacity utilization - to: sysadmin - - -# NameNode - - template: hdfs_missing_blocks - on: hdfs.blocks - class: Errors - type: Storage -component: HDFS - calc: $missing - units: missing blocks - every: 10s - warn: $this > 0 - delay: down 15m multiplier 1.5 max 1h - summary: HDFS missing blocks - info: number of missing blocks - to: sysadmin - - - template: hdfs_stale_nodes - on: hdfs.data_nodes - class: Errors - type: Storage -component: HDFS - calc: $stale - units: dead nodes - every: 10s - warn: $this > 0 - delay: down 15m multiplier 1.5 max 1h - summary: HDFS stale datanodes - info: number of datanodes marked stale due to delayed heartbeat - to: sysadmin - - - template: hdfs_dead_nodes - on: hdfs.data_nodes - class: Errors - type: Storage -component: HDFS - calc: $dead - units: dead nodes - every: 10s - crit: $this > 0 - delay: down 15m multiplier 1.5 max 1h - summary: HDFS dead datanodes - info: number of datanodes which are currently dead - to: sysadmin - - -# DataNode - - template: hdfs_num_failed_volumes - on: hdfs.num_failed_volumes - class: Errors - type: Storage -component: HDFS - calc: $fsds_num_failed_volumes - units: failed volumes - every: 10s - warn: $this > 0 - delay: down 15m multiplier 1.5 max 1h - summary: HDFS failed volumes - info: number of failed volumes - to: sysadmin diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf deleted file mode 100644 index da5dec797..000000000 --- a/health/health.d/httpcheck.conf +++ /dev/null @@ -1,73 +0,0 @@ - -# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges - template: httpcheck_web_service_up - on: httpcheck.status - class: Utilization - type: Web Server -component: HTTP endpoint - lookup: average -1m unaligned percentage of success - calc: ($this < 75) ? 
(0) : ($this) - every: 5s - units: up/down - info: HTTP check endpoint ${label:url} liveness status - to: silent - - template: httpcheck_web_service_bad_content - on: httpcheck.status - class: Workload - type: Web Server -component: HTTP endpoint - lookup: average -5m unaligned percentage of bad_content - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: HTTP check for ${label:url} unexpected content - info: Percentage of HTTP responses from ${label:url} with unexpected content in the last 5 minutes - to: webmaster - - template: httpcheck_web_service_bad_status - on: httpcheck.status - class: Workload - type: Web Server -component: HTTP endpoint - lookup: average -5m unaligned percentage of bad_status - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: HTTP check for ${label:url} unexpected status - info: Percentage of HTTP responses from ${label:url} with unexpected status in the last 5 minutes - to: webmaster - - template: httpcheck_web_service_timeouts - on: httpcheck.status - class: Latency - type: Web Server -component: HTTP endpoint - lookup: average -5m unaligned percentage of timeout - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: HTTP check for ${label:url} timeouts - info: Percentage of timed-out HTTP requests to ${label:url} in the last 5 minutes - to: webmaster - - template: httpcheck_web_service_no_connection - on: httpcheck.status - class: Errors - type: Other -component: HTTP endpoint - lookup: average -5m unaligned percentage of no_connection - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: HTTP check for ${label:url} failed requests - info: Percentage of failed HTTP requests to ${label:url} in the last 5 minutes - to: webmaster diff --git 
a/health/health.d/ioping.conf b/health/health.d/ioping.conf deleted file mode 100644 index 6d832bf00..000000000 --- a/health/health.d/ioping.conf +++ /dev/null @@ -1,14 +0,0 @@ - template: ioping_disk_latency - on: ioping.latency - class: Latency - type: System -component: Disk - lookup: average -10s unaligned of latency - units: microseconds - every: 10s - green: 10000 - warn: $this > $green - delay: down 30m multiplier 1.5 max 2h - summary: IO ping latency - info: Average I/O latency over the last 10 seconds - to: silent diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf deleted file mode 100644 index f77f56065..000000000 --- a/health/health.d/ipc.conf +++ /dev/null @@ -1,34 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: semaphores_used - on: system.ipc_semaphores - class: Utilization - type: System -component: IPC - os: linux - hosts: * - calc: $semaphores * 100 / $ipc_semaphores_max - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - delay: down 5m multiplier 1.5 max 1h - summary: IPC semaphores used - info: IPC semaphore utilization - to: sysadmin - - alarm: semaphore_arrays_used - on: system.ipc_semaphore_arrays - class: Utilization - type: System -component: IPC - os: linux - hosts: * - calc: $arrays * 100 / $ipc_semaphores_arrays_max - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - delay: down 5m multiplier 1.5 max 1h - summary: IPC semaphore arrays used - info: IPC semaphore arrays utilization - to: sysadmin diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf deleted file mode 100644 index 4dfee3c7f..000000000 --- a/health/health.d/ipfs.conf +++ /dev/null @@ -1,15 +0,0 @@ - - template: ipfs_datastore_usage - on: ipfs.repo_size - class: Utilization - type: Data Sharing -component: IPFS - calc: $size * 100 / $avail - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? 
(80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: IPFS datastore utilization - info: IPFS datastore utilization - to: sysadmin diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf deleted file mode 100644 index cec2320a9..000000000 --- a/health/health.d/ipmi.conf +++ /dev/null @@ -1,28 +0,0 @@ - template: ipmi_sensor_state - on: ipmi.sensor_state - class: Errors - type: System -component: IPMI - calc: $warning + $critical - units: state - every: 10s - warn: $warning > 0 - crit: $critical > 0 - delay: up 5m down 15m multiplier 1.5 max 1h - summary: IPMI sensor ${label:sensor} state - info: IPMI sensor ${label:sensor} (${label:component}) state - to: sysadmin - - alarm: ipmi_events - on: ipmi.events - class: Utilization - type: System -component: IPMI - calc: $events - units: events - every: 30s - warn: $this > 0 - delay: up 5m down 15m multiplier 1.5 max 1h - summary: IPMI entries in System Event Log - info: number of events in the IPMI System Event Log (SEL) - to: silent diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf deleted file mode 100644 index d1f93969a..000000000 --- a/health/health.d/isc_dhcpd.conf +++ /dev/null @@ -1,10 +0,0 @@ -# template: isc_dhcpd_leases_size -# on: isc_dhcpd.leases_total -# units: KB -# every: 60 -# calc: $leases_size -# warn: $this > 3072 -# crit: $this > 6144 -# delay: up 2m down 5m -# info: dhcpd.leases file too big! Module can slow down your server. -# to: sysadmin diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf deleted file mode 100644 index 8adf5f7d4..000000000 --- a/health/health.d/kubelet.conf +++ /dev/null @@ -1,151 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- - -# True (1) if the node is experiencing a configuration-related error, false (0) otherwise. 
- - template: kubelet_node_config_error - on: k8s_kubelet.kubelet_node_config_error - class: Errors - type: Kubernetes -component: Kubelet - calc: $experiencing_error - units: bool - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet node config error - info: The node is experiencing a configuration-related error (0: false, 1: true) - to: sysadmin - -# Failed Token() requests to the alternate token source - - template: kubelet_token_requests - on: k8s_kubelet.kubelet_token_requests - class: Errors - type: Kubernetes -component: Kubelet - lookup: sum -10s of failed - units: requests - every: 10s - warn: $this > 0 - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet failed token requests - info: Number of failed Token() requests to the alternate token source - to: sysadmin - -# Docker and runtime operation errors - - template: kubelet_operations_error - on: k8s_kubelet.kubelet_operations_errors - class: Errors - type: Kubernetes -component: Kubelet - lookup: sum -1m - units: errors - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (20)) - delay: up 30s down 1m multiplier 1.5 max 2h - summary: Kubelet runtime errors - info: Number of Docker or runtime operation errors - to: sysadmin - -# ----------------------------------------------------------------------------- - -# Pod Lifecycle Event Generator Relisting Latency - -# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99) -# 2. do the same for the last 10s -# 3. 
raise an alarm if the latter is: -# - 2x the first for quantile 0.5 -# - 4x the first for quantile 0.9 -# - 8x the first for quantile 0.99 -# -# we assume the minimum latency is 1000 microseconds - -# quantile 0.5 - - template: kubelet_1m_pleg_relist_latency_quantile_05 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -1m unaligned of 0.5 - units: microseconds - every: 10s - info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5) - - template: kubelet_10s_pleg_relist_latency_quantile_05 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -10s unaligned of 0.5 - calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(100):(200)) - crit: $this > (($status >= $WARNING)?(200):(400)) - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet relisting latency (quantile 0.5) - info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ - compared to the last minute (quantile 0.5) - to: sysadmin - -# quantile 0.9 - - template: kubelet_1m_pleg_relist_latency_quantile_09 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -1m unaligned of 0.9 - units: microseconds - every: 10s - info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9) - - template: kubelet_10s_pleg_relist_latency_quantile_09 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -10s unaligned of 0.9 - calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) - every: 
10s - units: % - warn: $this > (($status >= $WARNING)?(200):(400)) - crit: $this > (($status >= $WARNING)?(400):(800)) - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet relisting latency (quantile 0.9) - info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ - compared to the last minute (quantile 0.9) - to: sysadmin - -# quantile 0.99 - - template: kubelet_1m_pleg_relist_latency_quantile_099 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -1m unaligned of 0.99 - units: microseconds - every: 10s - info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99) - - template: kubelet_10s_pleg_relist_latency_quantile_099 - on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds - class: Latency - type: Kubernetes -component: Kubelet - lookup: average -10s unaligned of 0.99 - calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(400):(800)) - crit: $this > (($status >= $WARNING)?(800):(1200)) - delay: down 1m multiplier 1.5 max 2h - summary: Kubelet relisting latency (quantile 0.99) - info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ - compared to the last minute (quantile 0.99) - to: sysadmin diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf deleted file mode 100644 index b0d35e752..000000000 --- a/health/health.d/linux_power_supply.conf +++ /dev/null @@ -1,15 +0,0 @@ -# Alert on low battery capacity. 
- - template: linux_power_supply_capacity - on: powersupply.capacity - class: Utilization - type: Power Supply -component: Battery - calc: $capacity - units: % - every: 10s - warn: $this < 10 - delay: up 30s down 5m multiplier 1.2 max 1h - summary: Power supply capacity - info: Percentage of remaining power supply capacity - to: silent diff --git a/health/health.d/load.conf b/health/health.d/load.conf deleted file mode 100644 index fd8bf9396..000000000 --- a/health/health.d/load.conf +++ /dev/null @@ -1,72 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# Calculate the base trigger point for the load average alarms. -# This is the maximum number of CPU's in the system over the past 1 -# minute, with a special case for a single CPU of setting the trigger at 2. - alarm: load_cpu_number - on: system.load - class: Utilization - type: System -component: Load - os: linux - hosts: * - calc: ($active_processors == nan or $active_processors == 0) ? (nan) : ( ($active_processors < 2) ? ( 2 ) : ( $active_processors ) ) - units: cpus - every: 1m - info: Number of active CPU cores in the system - -# Send alarms if the load average is unusually high. -# These intentionally _do not_ calculate the average over the sampled -# time period because the values being checked already are averages. - - alarm: load_average_15 - on: system.load - class: Utilization - type: System -component: Load - os: linux - hosts: * - lookup: max -1m unaligned of load15 - calc: ($load_cpu_number == nan) ? (nan) : ($this) - units: load - every: 1m - warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) - delay: down 15m multiplier 1.5 max 1h - summary: Host load average (15 minutes) - info: System load average for the past 15 minutes - to: silent - - alarm: load_average_5 - on: system.load - class: Utilization - type: System -component: Load - os: linux - hosts: * - lookup: max -1m unaligned of load5 - calc: ($load_cpu_number == nan) ? 
(nan) : ($this) - units: load - every: 1m - warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400) - delay: down 15m multiplier 1.5 max 1h - summary: System load average (5 minutes) - info: System load average for the past 5 minutes - to: silent - - alarm: load_average_1 - on: system.load - class: Utilization - type: System -component: Load - os: linux - hosts: * - lookup: max -1m unaligned of load1 - calc: ($load_cpu_number == nan) ? (nan) : ($this) - units: load - every: 1m - warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800) - delay: down 15m multiplier 1.5 max 1h - summary: System load average (1 minute) - info: System load average for the past 1 minute - to: silent diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf deleted file mode 100644 index 90f97d851..000000000 --- a/health/health.d/mdstat.conf +++ /dev/null @@ -1,43 +0,0 @@ - - template: mdstat_disks - on: md.disks - class: Errors - type: System -component: RAID - units: failed devices - every: 10s - calc: $down - warn: $this > 0 - summary: MD array device ${label:device} down - info: Number of devices in the down state for the ${label:device} ${label:raid_level} array. \ - Any number > 0 indicates that the array is degraded. - to: sysadmin - - template: mdstat_mismatch_cnt - on: md.mismatch_cnt - class: Errors - type: System -component: RAID -chart labels: raid_level=!raid1 !raid10 * - units: unsynchronized blocks - calc: $count - every: 60s - warn: $this > 1024 - delay: up 30m - summary: MD array device ${label:device} unsynchronized blocks - info: Number of unsynchronized blocks for the ${label:device} ${label:raid_level} array - to: silent - - template: mdstat_nonredundant_last_collected - on: md.nonredundant - class: Latency - type: System -component: RAID - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? 
($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - summary: MD array last collected - info: Number of seconds since the last successful data collection - to: sysadmin diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf deleted file mode 100644 index 118997a59..000000000 --- a/health/health.d/megacli.conf +++ /dev/null @@ -1,76 +0,0 @@ - -## Adapters (controllers) - - template: megacli_adapter_state - on: megacli.adapter_degraded - class: Errors - type: System -component: RAID - lookup: max -10s foreach * - units: boolean - every: 10s - crit: $this > 0 - delay: down 5m multiplier 2 max 10m - summary: MegaCLI adapter state - info: Adapter is in the degraded state (0: false, 1: true) - to: sysadmin - -## Physical Disks - - template: megacli_pd_predictive_failures - on: megacli.pd_predictive_failure - class: Errors - type: System -component: RAID - lookup: sum -10s foreach * - units: predictive failures - every: 10s - warn: $this > 0 - delay: up 1m down 5m multiplier 2 max 10m - summary: MegaCLI physical drive predictive failures - info: Number of physical drive predictive failures - to: sysadmin - - template: megacli_pd_media_errors - on: megacli.pd_media_error - class: Errors - type: System -component: RAID - lookup: sum -10s foreach * - units: media errors - every: 10s - warn: $this > 0 - delay: up 1m down 5m multiplier 2 max 10m - summary: MegaCLI physical drive errors - info: Number of physical drive media errors - to: sysadmin - -## Battery Backup Units (BBU) - - template: megacli_bbu_relative_charge - on: megacli.bbu_relative_charge - class: Workload - type: System -component: RAID - lookup: average -10s - units: percent - every: 10s - warn: $this <= (($status >= $WARNING) ? (85) : (80)) - crit: $this <= (($status == $CRITICAL) ? 
(50) : (40)) - summary: MegaCLI BBU charge state - info: Average battery backup unit (BBU) relative state of charge over the last 10 seconds - to: sysadmin - - template: megacli_bbu_cycle_count - on: megacli.bbu_cycle_count - class: Workload - type: System -component: RAID - lookup: average -10s - units: cycles - every: 10s - warn: $this >= 100 - crit: $this >= 500 - summary: MegaCLI BBU cycles count - info: Average battery backup unit (BBU) charge cycles count over the last 10 seconds - to: sysadmin diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf deleted file mode 100644 index 77ca0afa9..000000000 --- a/health/health.d/memcached.conf +++ /dev/null @@ -1,50 +0,0 @@ - -# detect if memcached cache is full - - template: memcached_cache_memory_usage - on: memcached.cache - class: Utilization - type: KV Storage -component: Memcached - calc: $used * 100 / ($used + $available) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (80) : (90)) - delay: up 0 down 15m multiplier 1.5 max 1h - summary: Memcached memory utilization - info: Cache memory utilization - to: dba - - -# find the rate memcached cache is filling - - template: memcached_cache_fill_rate - on: memcached.cache - class: Utilization - type: KV Storage -component: Memcached - lookup: min -10m at -50m unaligned of available - calc: ($this - $available) / (($now - $after) / 3600) - units: KB/hour - every: 1m - info: Average rate the cache fills up (positive), or frees up (negative) space over the last hour - - -# find the hours remaining until memcached cache is full - - template: memcached_out_of_cache_space_time - on: memcached.cache - class: Utilization - type: KV Storage -component: Memcached - calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf) - units: hours - every: 10s - warn: $this > 0 and $this < (($status >= $WARNING) ? 
(48) : (8)) - crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) - delay: down 15m multiplier 1.5 max 1h - summary: Memcached estimation of lack of cache space - info: Estimated time the cache will run out of space \ - if the system continues to add data at the same rate as the past hour - to: dba diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf deleted file mode 100644 index 5ab3d2d92..000000000 --- a/health/health.d/memory.conf +++ /dev/null @@ -1,85 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: 1hour_memory_hw_corrupted - on: mem.hwcorrupt - class: Errors - type: System -component: Memory - os: linux - hosts: * - calc: $HardwareCorrupted - units: MB - every: 10s - warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System corrupted memory - info: Amount of memory corrupted due to a hardware failure - to: sysadmin - -## ECC Controller - - template: ecc_memory_mc_correctable - on: mem.edac_mc - class: Errors - type: System -component: Memory - os: linux - hosts: * - lookup: sum -10m unaligned of correctable, correctable_noinfo - units: errors - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System ECC memory ${label:controller} correctable errors - info: Memory controller ${label:controller} ECC correctable errors in the last 10 minutes - to: sysadmin - - template: ecc_memory_mc_uncorrectable - on: mem.edac_mc - class: Errors - type: System -component: Memory - os: linux - hosts: * - lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo - units: errors - every: 1m - crit: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System ECC memory ${label:controller} uncorrectable errors - info: Memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes - to: sysadmin - -## ECC DIMM - - template: ecc_memory_dimm_correctable - on: mem.edac_mc_dimm - class: Errors - type: System -component: Memory 
- os: linux - hosts: * - lookup: sum -10m unaligned of correctable - units: errors - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System ECC memory DIMM ${label:dimm} correctable errors - info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes - to: sysadmin - - template: ecc_memory_dimm_uncorrectable - on: mem.edac_mc_dimm - class: Errors - type: System -component: Memory - os: linux - hosts: * - lookup: sum -10m unaligned of uncorrectable - units: errors - every: 1m - crit: $this > 0 - delay: down 1h multiplier 1.5 max 1h - summary: System ECC memory DIMM ${label:dimm} uncorrectable errors - info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes - to: sysadmin diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf deleted file mode 100644 index aef9b0368..000000000 --- a/health/health.d/ml.conf +++ /dev/null @@ -1,56 +0,0 @@ -# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly -# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's -# native anomaly detection here: -# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal - -# some examples below are commented, you would need to uncomment and adjust as desired to enable them. - -# node level anomaly rate -# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate -# if node level anomaly rate is above 1% then warning (pick your own threshold that works best via trial and error). 
- template: ml_1min_node_ar - on: anomaly_detection.anomaly_rate - class: Workload - type: System -component: ML - os: * - hosts: * - lookup: average -1m of anomaly_rate - calc: $this - units: % - every: 30s - warn: $this > 1 - summary: ML node anomaly rate - info: Rolling 1min node level anomaly rate - to: silent - -# alert per dimension example -# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error). -# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error). -# template: ml_5min_cpu_dims -# on: system.cpu -# os: linux -# hosts: * -# lookup: average -5m anomaly-bit foreach * -# calc: $this -# units: % -# every: 30s -# warn: $this > (($status >= $WARNING) ? (5) : (20)) -# crit: $this > (($status == $CRITICAL) ? (20) : (100)) -# info: rolling 5min anomaly rate for each system.cpu dimension - -# alert per chart example -# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error). -# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error). -# template: ml_5min_cpu_chart -# on: system.cpu -# os: linux -# hosts: * -# lookup: average -5m anomaly-bit of * -# calc: $this -# units: % -# every: 30s -# warn: $this > (($status >= $WARNING) ? (5) : (20)) -# crit: $this > (($status == $CRITICAL) ? (20) : (100)) -# info: rolling 5min anomaly rate for system.cpu chart - diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf deleted file mode 100644 index 572560b4e..000000000 --- a/health/health.d/mysql.conf +++ /dev/null @@ -1,187 +0,0 @@ - -# slow queries - - template: mysql_10s_slow_queries - on: mysql.queries - class: Latency - type: Database -component: MySQL - lookup: sum -10s of slow_queries - units: slow queries - every: 10s - warn: $this > (($status >= $WARNING) ? (5) : (10)) - crit: $this > (($status == $CRITICAL) ? 
(10) : (20)) - delay: down 5m multiplier 1.5 max 1h - summary: MySQL slow queries - info: Number of slow queries in the last 10 seconds - to: dba - - -# ----------------------------------------------------------------------------- -# lock waits - - template: mysql_10s_table_locks_immediate - on: mysql.table_locks - class: Utilization - type: Database -component: MySQL - lookup: sum -10s absolute of immediate - units: immediate locks - every: 10s - summary: MySQL table immediate locks - info: Number of table immediate locks in the last 10 seconds - to: dba - - template: mysql_10s_table_locks_waited - on: mysql.table_locks - class: Latency - type: Database -component: MySQL - lookup: sum -10s absolute of waited - units: waited locks - every: 10s - summary: MySQL table waited locks - info: Number of table waited locks in the last 10 seconds - to: dba - - template: mysql_10s_waited_locks_ratio - on: mysql.table_locks - class: Latency - type: Database -component: MySQL - calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0 - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (10) : (25)) - crit: $this > (($status == $CRITICAL) ? (25) : (50)) - delay: down 30m multiplier 1.5 max 1h - summary: MySQL waited table locks ratio - info: Ratio of waited table locks over the last 10 seconds - to: dba - - -# ----------------------------------------------------------------------------- -# connections - - template: mysql_connections - on: mysql.connections_active - class: Utilization - type: Database -component: MySQL - calc: $active * 100 / $limit - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (60) : (70)) - crit: $this > (($status == $CRITICAL) ? 
(80) : (90)) - delay: down 15m multiplier 1.5 max 1h - summary: MySQL connections utilization - info: Client connections utilization - to: dba - - -# ----------------------------------------------------------------------------- -# replication - - template: mysql_replication - on: mysql.slave_status - class: Errors - type: Database -component: MySQL - calc: ($sql_running <= 0 OR $io_running <= 0)?0:1 - units: ok/failed - every: 10s - crit: $this == 0 - delay: down 5m multiplier 1.5 max 1h - summary: MySQL replication status - info: Replication status (0: stopped, 1: working) - to: dba - - template: mysql_replication_lag - on: mysql.slave_behind - class: Latency - type: Database -component: MySQL - calc: $seconds - units: seconds - every: 10s - warn: $this > (($status >= $WARNING) ? (5) : (10)) - crit: $this > (($status == $CRITICAL) ? (10) : (30)) - delay: down 15m multiplier 1.5 max 1h - summary: MySQL replication lag - info: Difference between the timestamp of the latest transaction processed by the SQL thread and \ - the timestamp of the same transaction when it was processed on the master - to: dba - - -# ----------------------------------------------------------------------------- -# galera cluster size - - template: mysql_galera_cluster_size_max_2m - on: mysql.galera_cluster_size - class: Utilization - type: Database -component: MySQL - lookup: max -2m at -1m unaligned - units: nodes - every: 10s - info: maximum galera cluster size in the last 2 minutes starting one minute ago - to: dba - - template: mysql_galera_cluster_size - on: mysql.galera_cluster_size - class: Utilization - type: Database -component: MySQL - calc: $nodes - units: nodes - every: 10s - warn: $this > $mysql_galera_cluster_size_max_2m - crit: $this < $mysql_galera_cluster_size_max_2m - delay: up 20s down 5m multiplier 1.5 max 1h - summary: MySQL galera cluster size - info: Current galera cluster size, compared to the maximum size in the last 2 minutes - to: dba - -# galera node state - - 
template: mysql_galera_cluster_state_warn - on: mysql.galera_cluster_state - class: Errors - type: Database -component: MySQL - calc: $donor + $joined - every: 10s - warn: $this != nan AND $this != 0 - delay: up 30s down 5m multiplier 1.5 max 1h - summary: MySQL galera node state - info: Galera node state is either Donor/Desynced or Joined. - to: dba - - template: mysql_galera_cluster_state_crit - on: mysql.galera_cluster_state - class: Errors - type: Database -component: MySQL - calc: $undefined + $joining + $error - every: 10s - crit: $this != nan AND $this != 0 - delay: up 30s down 5m multiplier 1.5 max 1h - summary: MySQL galera node state - info: Galera node state is either Undefined or Joining or Error. - to: dba - -# galera node status - - template: mysql_galera_cluster_status - on: mysql.galera_cluster_status - class: Errors - type: Database -component: MySQL - calc: $primary - every: 10s - crit: $this != nan AND $this != 1 - delay: up 30s down 5m multiplier 1.5 max 1h - summary: MySQL galera cluster status - info: Galera node is part of a nonoperational component. \ - This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. - to: dba diff --git a/health/health.d/net.conf b/health/health.d/net.conf deleted file mode 100644 index 2dfe6bbaf..000000000 --- a/health/health.d/net.conf +++ /dev/null @@ -1,258 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- -# net traffic overflow - - template: interface_speed - on: net.net - class: Latency - type: System -component: Network - os: * - hosts: * - calc: ( $nic_speed_max > 0 ) ? 
( $nic_speed_max / 1000) : ( nan ) - units: Mbit - every: 10s - info: Network interface ${label:device} current speed - - template: 1m_received_traffic_overflow - on: net.net - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of received - calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (85) : (90)) - delay: up 1m down 1m multiplier 1.5 max 1h - summary: System network interface ${label:device} inbound utilization - info: Average inbound utilization for the network interface ${label:device} over the last minute - to: silent - - template: 1m_sent_traffic_overflow - on: net.net - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of sent - calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? 
(85) : (90)) - delay: up 1m down 1m multiplier 1.5 max 1h - summary: System network interface ${label:device} outbound utilization - info: Average outbound utilization for the network interface ${label:device} over the last minute - to: silent - -# ----------------------------------------------------------------------------- -# dropped packets - -# check if an interface is dropping packets -# the alarm is checked every 1 minute -# and examines the last 10 minutes of data -# -# it is possible to have expected packet drops on an interface for some network configurations -# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information - - template: net_interface_inbound_packets - on: net.packets - class: Workload - type: System -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute of received - units: packets - every: 1m - summary: Network interface ${label:device} received packets - info: Received packets for the network interface ${label:device} in the last 10 minutes - - template: net_interface_outbound_packets - on: net.packets - class: Workload - type: System -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute of sent - units: packets - every: 1m - summary: Network interface ${label:device} sent packets - info: Sent packets for the network interface ${label:device} in the last 10 minutes - - template: inbound_packets_dropped_ratio - on: net.drops - class: Errors - type: System -component: Network - os: * - hosts: * -chart labels: device=!wl* * - lookup: sum -10m unaligned absolute of inbound - calc: (($net_interface_inbound_packets > 10000) ? 
($this * 100 / $net_interface_inbound_packets) : (0)) - units: % - every: 1m - warn: $this >= 2 - delay: up 1m down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} inbound drops - info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: silent - - template: outbound_packets_dropped_ratio - on: net.drops - class: Errors - type: System -component: Network - os: * - hosts: * -chart labels: device=!wl* * - lookup: sum -10m unaligned absolute of outbound - calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0)) - units: % - every: 1m - warn: $this >= 2 - delay: up 1m down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} outbound drops - info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: silent - - template: wifi_inbound_packets_dropped_ratio - on: net.drops - class: Errors - type: System -component: Network - os: linux - hosts: * -chart labels: device=wl* - lookup: sum -10m unaligned absolute of received - calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0)) - units: % - every: 1m - warn: $this >= 10 - delay: up 1m down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} inbound drops ratio - info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: silent - - template: wifi_outbound_packets_dropped_ratio - on: net.drops - class: Errors - type: System -component: Network - os: linux - hosts: * -chart labels: device=wl* - lookup: sum -10m unaligned absolute of sent - calc: (($net_interface_outbound_packets > 1000) ? 
($this * 100 / $net_interface_outbound_packets) : (0)) - units: % - every: 1m - warn: $this >= 10 - delay: up 1m down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} outbound drops ratio - info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: silent - -# ----------------------------------------------------------------------------- -# interface errors - - template: interface_inbound_errors - on: net.errors - class: Errors - type: System -component: Network - os: freebsd - hosts: * - lookup: sum -10m unaligned absolute of inbound - units: errors - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} inbound errors - info: Number of inbound errors for the network interface ${label:device} in the last 10 minutes - to: silent - - template: interface_outbound_errors - on: net.errors - class: Errors - type: System -component: Network - os: freebsd - hosts: * - lookup: sum -10m unaligned absolute of outbound - units: errors - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} outbound errors - info: Number of outbound errors for the network interface ${label:device} in the last 10 minutes - to: silent - -# ----------------------------------------------------------------------------- -# FIFO errors - -# check if an interface is having FIFO -# buffer errors -# the alarm is checked every 1 minute -# and examines the last 10 minutes of data - - template: 10min_fifo_errors - on: net.fifo - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: sum -10m unaligned absolute - units: errors - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 2h - summary: System network interface ${label:device} FIFO errors - info: Number of FIFO errors for the network interface ${label:device} in the last 10 minutes - to: silent - -# 
----------------------------------------------------------------------------- -# check for packet storms - -# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate -# 2. do the same for the last 10s -# 3. raise an alarm if the latter is 10x or 20x the first -# we assume the minimum packet storm should at least have -# 10000 packets/s, average of the last 10 seconds - - template: 1m_received_packets_rate - on: net.packets - class: Workload - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -1m unaligned of received - units: packets - every: 10s - info: Average number of packets received by the network interface ${label:device} over the last minute - - template: 10s_received_packets_storm - on: net.packets - class: Workload - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -10s unaligned of received - calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status == $CRITICAL)?(5000):(6000)) - options: no-clear-notification - summary: System network interface ${label:device} inbound packet storm - info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ - compared to the rate over the last minute - to: silent diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf deleted file mode 100644 index 417105d43..000000000 --- a/health/health.d/netfilter.conf +++ /dev/null @@ -1,20 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: netfilter_conntrack_full - on: netfilter.conntrack_sockets - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: max -10s unaligned of connections - calc: $this * 100 / $netfilter_conntrack_max - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? 
(85) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (95)) - delay: down 5m multiplier 1.5 max 1h - summary: System Netfilter connection tracker utilization - info: Netfilter connection tracker table size utilization - to: sysadmin diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf deleted file mode 100644 index aea402e88..000000000 --- a/health/health.d/nvme.conf +++ /dev/null @@ -1,15 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: nvme_device_critical_warnings_state - on: nvme.device_critical_warnings_state - class: Errors - type: System -component: Disk - lookup: max -30s unaligned - units: state - every: 10s - crit: $this != nan AND $this != 0 - delay: down 5m multiplier 1.5 max 2h - summary: NVMe device ${label:device} state - info: NVMe device ${label:device} has critical warnings - to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf deleted file mode 100644 index c4db835ce..000000000 --- a/health/health.d/pihole.conf +++ /dev/null @@ -1,33 +0,0 @@ - -# Blocklist last update time. -# Default update interval is a week. - - template: pihole_blocklist_last_update - on: pihole.blocklist_last_update - class: Errors - type: Ad Filtering -component: Pi-hole - every: 10s - units: seconds - calc: $ago - warn: $this > 60 * 60 * 24 * 30 - summary: Pi-hole blocklist last update - info: gravity.list (blocklist) file last update time - to: sysadmin - -# Pi-hole's ability to block unwanted domains. -# Should be enabled. The whole point of Pi-hole! 
- - template: pihole_status - on: pihole.unwanted_domains_blocking_status - class: Errors - type: Ad Filtering -component: Pi-hole - every: 10s - units: status - calc: $disabled - warn: $this != nan AND $this == 1 - delay: up 2m down 5m - summary: Pi-hole domains blocking status - info: Unwanted domains blocking is disabled - to: sysadmin diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf deleted file mode 100644 index 0e434420d..000000000 --- a/health/health.d/ping.conf +++ /dev/null @@ -1,50 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: ping_host_reachable - on: ping.host_packet_loss - class: Errors - type: Other -component: Network - lookup: average -30s unaligned of loss - calc: $this != nan AND $this < 100 - units: up/down - every: 10s - crit: $this == 0 - delay: down 30m multiplier 1.5 max 2h - summary: Host ${label:host} ping status - info: Network host ${label:host} reachability status - to: sysadmin - - template: ping_packet_loss - on: ping.host_packet_loss - class: Errors - type: Other -component: Network - lookup: average -10m unaligned of loss - green: 5 - red: 10 - units: % - every: 10s - warn: $this > $green - crit: $this > $red - delay: down 30m multiplier 1.5 max 2h - summary: Host ${label:host} ping packet loss - info: Packet loss percentage to the network host ${label:host} over the last 10 minutes - to: sysadmin - - template: ping_host_latency - on: ping.host_rtt - class: Latency - type: Other -component: Network - lookup: average -10s unaligned of avg - units: ms - every: 10s - green: 500 - red: 1000 - warn: $this > $green OR $max > $red - crit: $this > $red - delay: down 30m multiplier 1.5 max 2h - summary: Host ${label:host} ping latency - info: Average latency to the network host ${label:host} over the last 10 seconds - to: sysadmin diff --git a/health/health.d/plugin.conf b/health/health.d/plugin.conf deleted file mode 100644 index 8615a0213..000000000 --- 
a/health/health.d/plugin.conf +++ /dev/null @@ -1,12 +0,0 @@ - template: plugin_availability_status - on: netdata.plugin_availability_status - class: Errors - type: Netdata - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : (20 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - summary: Plugin ${label:_collect_plugin} availability status - info: the amount of time that ${label:_collect_plugin} did not report its availability status - to: sysadmin diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf deleted file mode 100644 index 281731c86..000000000 --- a/health/health.d/portcheck.conf +++ /dev/null @@ -1,44 +0,0 @@ - -# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges - template: portcheck_service_reachable - on: portcheck.status - class: Workload - type: Other -component: TCP endpoint - lookup: average -1m unaligned percentage of success - calc: ($this < 75) ? 
(0) : ($this) - every: 5s - units: up/down - summary: Portcheck status for ${label:host}:${label:port} - info: TCP host ${label:host} port ${label:port} liveness status - to: silent - - template: portcheck_connection_timeouts - on: portcheck.status - class: Errors - type: Other -component: TCP endpoint - lookup: average -5m unaligned percentage of timeout - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: Portcheck timeouts for ${label:host}:${label:port} - info: Percentage of timed-out TCP connections to host ${label:host} port ${label:port} in the last 5 minutes - to: sysadmin - - template: portcheck_connection_fails - on: portcheck.status - class: Errors - type: Other -component: TCP endpoint - lookup: average -5m unaligned percentage of no_connection,failed - every: 10s - units: % - warn: $this >= 10 AND $this < 40 - crit: $this >= 40 - delay: down 5m multiplier 1.5 max 1h - summary: Portcheck fails for ${label:host}:${label:port} - info: Percentage of failed TCP connections to host ${label:host} port ${label:port} in the last 5 minutes - to: sysadmin diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf deleted file mode 100644 index de4c0078e..000000000 --- a/health/health.d/postgres.conf +++ /dev/null @@ -1,228 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: postgres_total_connection_utilization - on: postgres.connections_utilization - class: Utilization - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of used - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? 
(80) : (90)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL connection utilization - info: Average total connection utilization over the last minute - to: dba - - template: postgres_acquired_locks_utilization - on: postgres.locks_utilization - class: Utilization - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of used - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (15) : (20)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL acquired locks utilization - info: Average acquired locks utilization over the last minute - to: dba - - template: postgres_txid_exhaustion_perc - on: postgres.txid_exhaustion_perc - class: Utilization - type: Database -component: PostgreSQL - hosts: * - calc: $txid_exhaustion - units: % - every: 1m - warn: $this > 90 - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL TXID exhaustion - info: Percent towards TXID wraparound - to: dba - -# Database alarms - - template: postgres_db_cache_io_ratio - on: postgres.db_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL DB ${label:database} cache hit ratio - info: Average cache hit ratio in db ${label:database} over the last minute - to: dba - - template: postgres_db_transactions_rollback_ratio - on: postgres.db_transactions_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -5m unaligned of rollback - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? 
(0) : (2)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL DB ${label:database} aborted transactions - info: Average aborted transactions percentage in db ${label:database} over the last five minutes - to: dba - - template: postgres_db_deadlocks_rate - on: postgres.db_deadlocks_rate - class: Errors - type: Database -component: PostgreSQL - hosts: * - lookup: sum -1m unaligned of deadlocks - units: deadlocks - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (10)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL DB ${label:database} deadlocks rate - info: Number of deadlocks detected in db ${label:database} in the last minute - to: dba - -# Table alarms - - template: postgres_table_cache_io_ratio - on: postgres.table_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} cache hit ratio - info: Average cache hit ratio in db ${label:database} table ${label:table} over the last minute - to: dba - - template: postgres_table_index_cache_io_ratio - on: postgres.table_index_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? 
(60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} index cache hit ratio - info: Average index cache hit ratio in db ${label:database} table ${label:table} over the last minute - to: dba - - template: postgres_table_toast_cache_io_ratio - on: postgres.table_toast_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} toast cache hit ratio - info: Average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute - to: dba - - template: postgres_table_toast_index_cache_io_ratio - on: postgres.table_toast_index_cache_io_ratio - class: Workload - type: Database -component: PostgreSQL - hosts: * - lookup: average -1m unaligned of miss - calc: 100 - $this - units: % - every: 1m - warn: $this < (($status >= $WARNING) ? (70) : (60)) - crit: $this < (($status == $CRITICAL) ? (60) : (50)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} index toast hit ratio - info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute - to: dba - - template: postgres_table_bloat_size_perc - on: postgres.table_bloat_size_perc - class: Errors - type: Database -component: PostgreSQL - hosts: * - calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0) - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (60) : (70)) - crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} bloat size - info: Bloat size percentage in db ${label:database} table ${label:table} - to: dba - - template: postgres_table_last_autovacuum_time - on: postgres.table_autovacuum_since_time - class: Errors - type: Database -component: PostgreSQL - hosts: !* - calc: $time - units: seconds - every: 1m - warn: $this != nan AND $this > (60 * 60 * 24 * 7) - summary: PostgreSQL table ${label:table} db ${label:database} last autovacuum - info: Time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon - to: dba - - template: postgres_table_last_autoanalyze_time - on: postgres.table_autoanalyze_since_time - class: Errors - type: Database -component: PostgreSQL - hosts: !* - calc: $time - units: seconds - every: 1m - warn: $this != nan AND $this > (60 * 60 * 24 * 7) - summary: PostgreSQL table ${label:table} db ${label:database} last autoanalyze - info: Time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon - to: dba - -# Index alarms - - template: postgres_index_bloat_size_perc - on: postgres.index_bloat_size_perc - class: Errors - type: Database -component: PostgreSQL - hosts: * - calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0) - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (60) : (70)) - crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) - delay: down 15m multiplier 1.5 max 1h - summary: PostgreSQL table ${label:table} db ${label:database} index bloat size - info: Bloat size percentage in db ${label:database} table ${label:table} index ${label:index} - to: dba diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf deleted file mode 100644 index 8f2e0fda5..000000000 --- a/health/health.d/processes.conf +++ /dev/null @@ -1,17 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: active_processes - on: system.active_processes - class: Workload - type: System -component: Processes - hosts: * - calc: $active * 100 / $pidmax - units: % - every: 5s - warn: $this > (($status >= $WARNING) ? (85) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (95)) - delay: down 5m multiplier 1.5 max 1h - summary: System PIDs utilization - info: System process IDs (PID) space utilization - to: sysadmin diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf deleted file mode 100644 index da27ad5b7..000000000 --- a/health/health.d/python.d.plugin.conf +++ /dev/null @@ -1,18 +0,0 @@ - -# make sure python.d.plugin data collection job is running - - template: python.d_job_last_collected_secs - on: netdata.pythond_runtime - class: Errors - type: Netdata -component: python.d.plugin - module: !* * - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - summary: Python.d plugin last collection - info: Number of seconds since the last successful data collection - to: webmaster diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf deleted file mode 100644 index 970ea6363..000000000 --- a/health/health.d/qos.conf +++ /dev/null @@ -1,18 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# check if a QoS class is dropping packets -# the alarm is checked every 10 seconds -# and examines the last minute of data - -template: 10min_qos_packet_drops - on: tc.qos_dropped - os: linux - hosts: * - lookup: sum -5m unaligned absolute - every: 30s - warn: $this > 0 - units: packets - summary: QOS packet drops - info: Dropped packets in the last 5 minutes - to: silent diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf deleted file mode 100644 index 51f307ca6..000000000 --- a/health/health.d/ram.conf +++ /dev/null @@ -1,82 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: ram_in_use - on: system.ram - class: Utilization - type: System -component: Memory - os: linux - hosts: * - calc: $used * 100 / ($used + $cached + $free + $buffers) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: System memory utilization - info: System memory utilization - to: sysadmin - - alarm: ram_available - on: mem.available - class: Utilization - type: System -component: Memory - os: linux - hosts: * - calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) - units: % - every: 10s - warn: $this < (($status >= $WARNING) ? 
(15) : (10)) - delay: down 15m multiplier 1.5 max 1h - summary: System available memory - info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping - to: silent - - alarm: oom_kill - on: mem.oom_kill - os: linux - hosts: * - lookup: sum -30m unaligned - units: kills - every: 5m - warn: $this > 0 - delay: down 10m - summary: System OOM kills - info: Number of out of memory kills in the last 30 minutes - to: silent - -## FreeBSD - alarm: ram_in_use - on: system.ram - class: Utilization - type: System -component: Memory - os: freebsd - hosts: * - calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: System memory utilization - info: System memory utilization - to: sysadmin - - alarm: ram_available - on: mem.available - class: Utilization - type: System -component: Memory - os: freebsd - hosts: * - calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers) - units: % - every: 10s - warn: $this < (($status >= $WARNING) ? 
(15) : (10)) - delay: down 15m multiplier 1.5 max 1h - summary: System available memory - info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping - to: silent diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf deleted file mode 100644 index 7c2945e68..000000000 --- a/health/health.d/redis.conf +++ /dev/null @@ -1,57 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: redis_connections_rejected - on: redis.connections - class: Errors - type: KV Storage -component: Redis - lookup: sum -1m unaligned of rejected - every: 10s - units: connections - warn: $this > 0 - summary: Redis rejected connections - info: Connections rejected because of maxclients limit in the last minute - delay: down 5m multiplier 1.5 max 1h - to: dba - - template: redis_bgsave_broken - on: redis.bgsave_health - class: Errors - type: KV Storage -component: Redis - every: 10s - crit: $last_bgsave != nan AND $last_bgsave != 0 - units: ok/failed - summary: Redis background save - info: Status of the last RDB save operation (0: ok, 1: error) - delay: down 5m multiplier 1.5 max 1h - to: dba - - template: redis_bgsave_slow - on: redis.bgsave_now - class: Latency - type: KV Storage -component: Redis - every: 10s - calc: $current_bgsave_time - warn: $this > 600 - crit: $this > 1200 - units: seconds - summary: Redis slow background save - info: Duration of the on-going RDB save operation - delay: down 5m multiplier 1.5 max 1h - to: dba - - template: redis_master_link_down - on: redis.master_link_down_since_time - class: Errors - type: KV Storage -component: Redis - every: 10s - calc: $time - units: seconds - crit: $this != nan AND $this > 0 - summary: Redis master link down - info: Time elapsed since the link between master and slave is down - delay: down 5m multiplier 1.5 max 1h - to: dba diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf deleted file mode 100644 
index c665430fa..000000000 --- a/health/health.d/retroshare.conf +++ /dev/null @@ -1,17 +0,0 @@ - -# make sure the DHT is fine when active - - template: retroshare_dht_working - on: retroshare.dht - class: Utilization - type: Data Sharing -component: Retroshare - calc: $dht_size_all - units: peers - every: 1m - warn: $this < (($status >= $WARNING) ? (120) : (100)) - crit: $this < (($status == $CRITICAL) ? (10) : (1)) - delay: up 0 down 15m multiplier 1.5 max 1h - summary: Retroshare DHT peers - info: Number of DHT peers - to: sysadmin diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf deleted file mode 100644 index 677e3cb4f..000000000 --- a/health/health.d/riakkv.conf +++ /dev/null @@ -1,98 +0,0 @@ - -# Warn if a list keys operation is running. - template: riakkv_list_keys_active - on: riak.core.fsm_active - class: Utilization - type: Database -component: Riak KV - calc: $list_fsm_active - units: state machines - every: 10s - warn: $list_fsm_active > 0 - summary: Riak KV active list keys - info: Number of currently running list keys finite state machines - to: dba - - -## Timing healthchecks -# KV GET - template: riakkv_1h_kv_get_mean_latency - on: riak.kv.latency.get - class: Latency - type: Database -component: Riak KV - calc: $node_get_fsm_time_mean - lookup: average -1h unaligned of time - every: 30s - units: ms - info: average time between reception of client GET request and \ - subsequent response to client over the last hour - - template: riakkv_kv_get_slow - on: riak.kv.latency.get - class: Latency - type: Database -component: Riak KV - calc: $mean - lookup: average -3m unaligned of time - units: ms - every: 10s - warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) ) - crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) ) - summary: Riak KV GET latency - info: Average time between reception of client GET request and \ - subsequent response to the client over the last 3 minutes, \ - compared to the average over the last hour - delay: 
down 5m multiplier 1.5 max 1h - to: dba - -# KV PUT - template: riakkv_1h_kv_put_mean_latency - on: riak.kv.latency.put - class: Latency - type: Database -component: Riak KV - calc: $node_put_fsm_time_mean - lookup: average -1h unaligned of time - every: 30s - units: ms - summary: Riak KV PUT mean latency - info: Average time between reception of client PUT request and \ - subsequent response to the client over the last hour - - template: riakkv_kv_put_slow - on: riak.kv.latency.put - class: Latency - type: Database -component: Riak KV - calc: $mean - lookup: average -3m unaligned of time - units: ms - every: 10s - warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) ) - crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) ) - summary: Riak KV PUT latency - info: Average time between reception of client PUT request and \ - subsequent response to the client over the last 3 minutes, \ - compared to the average over the last hour - delay: down 5m multiplier 1.5 max 1h - to: dba - - -## VM healthchecks - -# Default Erlang VM process limit: 262144 -# On systems observed, this is < 2000, but may grow depending on load. - template: riakkv_vm_high_process_count - on: riak.vm - class: Utilization - type: Database -component: Riak KV - calc: $sys_process_count - units: processes - every: 10s - warn: $this > 10000 - crit: $this > 100000 - summary: Riak KV number of processes - info: Number of processes running in the Erlang VM - to: dba diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf deleted file mode 100644 index b089cb85e..000000000 --- a/health/health.d/scaleio.conf +++ /dev/null @@ -1,33 +0,0 @@ - -# make sure Storage Pool capacity utilization is under limit - - template: scaleio_storage_pool_capacity_utilization - on: scaleio.storage_pool_capacity_utilization - class: Utilization - type: Storage -component: ScaleIO - calc: $used - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (85)) - crit: $this > (($status == $CRITICAL) ? 
(85) : (90)) - delay: down 15m multiplier 1.5 max 1h - summary: ScaleIO storage pool capacity utilization - info: Storage pool capacity utilization - to: sysadmin - - -# make sure Sdc is connected to MDM - - template: scaleio_sdc_mdm_connection_state - on: scaleio.sdc_mdm_connection_state - class: Utilization - type: Storage -component: ScaleIO - calc: $connected - every: 10s - warn: $this != 1 - delay: up 30s down 5m multiplier 1.5 max 1h - summary: ScaleIO SDC-MDM connection state - info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected) - to: sysadmin diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf deleted file mode 100644 index 8d7ba5661..000000000 --- a/health/health.d/softnet.conf +++ /dev/null @@ -1,57 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# check for common /proc/net/softnet_stat errors - - alarm: 1min_netdev_backlog_exceeded - on: system.softnet_stat - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of dropped - units: packets - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10)) - delay: down 1h multiplier 1.5 max 2h - summary: System netdev dropped packets - info: Average number of dropped packets in the last minute \ - due to exceeded net.core.netdev_max_backlog - to: silent - - alarm: 1min_netdev_budget_ran_outs - on: system.softnet_stat - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of squeezed - units: events - every: 10s - warn: $this > (($status >= $WARNING) ? 
(0) : (10)) - delay: down 1h multiplier 1.5 max 2h - summary: System netdev budget run outs - info: Average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \ - net.core.netdev_budget_usecs with work remaining over the last minute \ - (this can be a cause for dropped packets) - to: silent - - alarm: 10min_netisr_backlog_exceeded - on: system.softnet_stat - class: Errors - type: System -component: Network - os: freebsd - hosts: * - lookup: average -1m unaligned absolute of qdrops - units: packets - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10)) - delay: down 1h multiplier 1.5 max 2h - summary: System netisr drops - info: Average number of drops in the last minute \ - due to exceeded sysctl net.route.netisr_maxqlen \ - (this can be a cause for dropped packets) - to: silent diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf deleted file mode 100644 index e39733996..000000000 --- a/health/health.d/swap.conf +++ /dev/null @@ -1,37 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - - alarm: 30min_ram_swapped_out - on: mem.swapio - class: Workload - type: System -component: Memory - os: linux freebsd - hosts: * - lookup: sum -30m unaligned absolute of out - # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 - calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) - units: % of RAM - every: 1m - warn: $this > (($status >= $WARNING) ? (20) : (30)) - delay: down 15m multiplier 1.5 max 1h - summary: System memory swapped out - info: Percentage of the system RAM swapped in the last 30 minutes - to: silent - - alarm: used_swap - on: mem.swap - class: Utilization - type: System -component: Memory - os: linux freebsd - hosts: * - calc: (($used + $free) > 0) ? ($used * 100 / ($used + $free)) : 0 - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) - delay: up 30s down 15m multiplier 1.5 max 1h - summary: System swap memory utilization - info: Swap memory utilization - to: sysadmin diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf deleted file mode 100644 index 6c947d90b..000000000 --- a/health/health.d/synchronization.conf +++ /dev/null @@ -1,13 +0,0 @@ - alarm: sync_freq - on: mem.sync - lookup: sum -1m of sync - units: calls - plugin: ebpf.plugin - every: 1m - warn: $this > 6 - delay: up 1m down 10m multiplier 1.5 max 1h - summary: Sync system call frequency - info: Number of sync() system calls. \ - Every call causes all pending modifications to filesystem metadata and \ - cached file data to be written to the underlying filesystems. - to: silent diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf deleted file mode 100644 index ad53a0e1c..000000000 --- a/health/health.d/systemdunits.conf +++ /dev/null @@ -1,161 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - -## Service units - template: systemd_service_unit_failed_state - on: systemd.service_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd service unit in the failed state - to: sysadmin - -## Socket units - template: systemd_socket_unit_failed_state - on: systemd.socket_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd socket unit in the failed state - to: sysadmin - -## Target units - template: systemd_target_unit_failed_state - on: systemd.target_unit_state - class: Errors - type: Linux -component: Systemd 
units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd target unit in the failed state - to: sysadmin - -## Path units - template: systemd_path_unit_failed_state - on: systemd.path_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd path unit in the failed state - to: sysadmin - -## Device units - template: systemd_device_unit_failed_state - on: systemd.device_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd device unit in the failed state - to: sysadmin - -## Mount units - template: systemd_mount_unit_failed_state - on: systemd.mount_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd mount units in the failed state - to: sysadmin - -## Automount units - template: systemd_automount_unit_failed_state - on: systemd.automount_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd automount unit in the failed state - to: sysadmin - -## Swap units - template: systemd_swap_unit_failed_state - on: systemd.swap_unit_state - class: Errors - type: Linux -component: Systemd units - 
module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd swap units in the failed state - to: sysadmin - -## Scope units - template: systemd_scope_unit_failed_state - on: systemd.scope_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd scope units in the failed state - to: sysadmin - -## Slice units - template: systemd_slice_unit_failed_state - on: systemd.slice_unit_state - class: Errors - type: Linux -component: Systemd units - module: !* * - calc: $failed - units: state - every: 10s - warn: $this != nan AND $this == 1 - delay: down 5m multiplier 1.5 max 1h - summary: systemd unit ${label:unit_name} state - info: systemd slice units in the failed state - to: sysadmin diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf deleted file mode 100644 index 2b2f97406..000000000 --- a/health/health.d/tcp_conn.conf +++ /dev/null @@ -1,23 +0,0 @@ - -# -# ${tcp_max_connections} may be nan or -1 if the system -# supports dynamic threshold for TCP connections. -# In this case, the alarm will always be zero. -# - - alarm: tcp_connections - on: ip.tcpsock - class: Workload - type: System -component: Network - os: linux - hosts: * - calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0 - units: % - every: 10s - warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) - crit: $this > (($status == $CRITICAL) ? 
( 80 ) : ( 90 )) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP connections utilization - info: IPv4 TCP connections utilization - to: sysadmin diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf deleted file mode 100644 index 9d1104a51..000000000 --- a/health/health.d/tcp_listen.conf +++ /dev/null @@ -1,100 +0,0 @@ -# -# There are two queues involved when incoming TCP connections are handled -# (both at the kernel): -# -# SYN queue -# The SYN queue tracks TCP handshakes until connections are fully established. -# It overflows when too many incoming TCP connection requests hang in the -# half-open state and the server is not configured to fall back to SYN cookies. -# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends -# lots of SYN packets and never completes the handshakes). -# -# Accept queue -# The accept queue holds fully established TCP connections waiting to be handled -# by the listening application. It overflows when the server application fails -# to accept new connections at the rate they are coming in. -# -# -# ----------------------------------------------------------------------------- -# tcp accept queue (at the kernel) - - alarm: 1m_tcp_accept_queue_overflows - on: ip.tcp_accept_queue - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -60s unaligned absolute of ListenOverflows - units: overflows - every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? 
(1) : (5)) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP accept queue overflows - info: Average number of overflows in the TCP accept queue over the last minute - to: silent - -# THIS IS TOO GENERIC -# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 - alarm: 1m_tcp_accept_queue_drops - on: ip.tcp_accept_queue - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -60s unaligned absolute of ListenDrops - units: drops - every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (1) : (5)) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP accept queue dropped packets - info: Average number of dropped packets in the TCP accept queue over the last minute - to: silent - - -# ----------------------------------------------------------------------------- -# tcp SYN queue (at the kernel) - -# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or -# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are -# enabled or not. In both cases this probably indicates a SYN flood attack, -# so i guess a notification should be sent. - - alarm: 1m_tcp_syn_queue_drops - on: ip.tcp_syn_queue - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -60s unaligned absolute of TCPReqQFullDrop - units: drops - every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? (0) : (5)) - delay: up 10 down 5m multiplier 1.5 max 1h - summary: System TCP SYN queue drops - info: Average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \ - (SYN cookies were not enabled) - to: silent - - alarm: 1m_tcp_syn_queue_cookies - on: ip.tcp_syn_queue - class: Workload - type: System -component: Network - os: linux - hosts: * - lookup: average -60s unaligned absolute of TCPReqQFullDoCookies - units: cookies - every: 10s - warn: $this > 1 - crit: $this > (($status == $CRITICAL) ? 
(0) : (5)) - delay: up 10 down 5m multiplier 1.5 max 1h - summary: System TCP SYN queue cookies - info: Average number of sent SYN cookies due to the full TCP SYN queue over the last minute - to: silent - diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf deleted file mode 100644 index 4e422ec1c..000000000 --- a/health/health.d/tcp_mem.conf +++ /dev/null @@ -1,24 +0,0 @@ -# -# check -# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html -# -# We give a warning when TCP is under memory pressure -# and a critical when TCP is 90% of its upper memory limit -# - - alarm: tcp_memory - on: ipv4.sockstat_tcp_mem - class: Utilization - type: System -component: Network - os: linux - hosts: * - calc: ${mem} * 100 / ${tcp_mem_high} - units: % - every: 10s - warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) - crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP memory utilization - info: TCP memory utilization - to: silent diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf deleted file mode 100644 index 8f665d50e..000000000 --- a/health/health.d/tcp_orphans.conf +++ /dev/null @@ -1,25 +0,0 @@ - -# -# check -# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html -# -# The kernel may penalize orphans by 2x or even 4x -# so we alarm warning at 25% and critical at 50% -# - - alarm: tcp_orphans - on: ipv4.sockstat_tcp_sockets - class: Errors - type: System -component: Network - os: linux - hosts: * - calc: ${orphan} * 100 / ${tcp_max_orphans} - units: % - every: 10s - warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) - crit: $this > (($status == $CRITICAL) ? 
( 25 ) : ( 50 )) - delay: up 0 down 5m multiplier 1.5 max 1h - summary: System TCP orphan sockets utilization - info: Orphan IPv4 TCP sockets utilization - to: silent diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf deleted file mode 100644 index 7c39db2db..000000000 --- a/health/health.d/tcp_resets.conf +++ /dev/null @@ -1,71 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- -# tcp resets this host sends - - alarm: 1m_ip_tcp_resets_sent - on: ip.tcphandshake - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -1m at -10s unaligned absolute of OutRsts - units: tcp resets/s - every: 10s - info: average number of sent TCP RESETS over the last minute - - alarm: 10s_ip_tcp_resets_sent - on: ip.tcphandshake - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -10s unaligned absolute of OutRsts - units: tcp resets/s - every: 10s - warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_sent < 5)?(5):($1m_ip_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) - delay: up 20s down 60m multiplier 1.2 max 2h - options: no-clear-notification - summary: System TCP outbound resets - info: Average number of sent TCP RESETS over the last 10 seconds. \ - This can indicate a port scan, \ - or that a service running on this host has crashed. \ - Netdata will not send a clear notification for this alarm. 
- to: silent - -# ----------------------------------------------------------------------------- -# tcp resets this host receives - - alarm: 1m_ip_tcp_resets_received - on: ip.tcphandshake - class: Errors - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -1m at -10s unaligned absolute of AttemptFails - units: tcp resets/s - every: 10s - info: average number of received TCP RESETS over the last minute - - alarm: 10s_ip_tcp_resets_received - on: ip.tcphandshake - class: Errors - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -10s unaligned absolute of AttemptFails - units: tcp resets/s - every: 10s - warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_received < 5)?(5):($1m_ip_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) - delay: up 20s down 60m multiplier 1.2 max 2h - options: no-clear-notification - summary: System TCP inbound resets - info: average number of received TCP RESETS over the last 10 seconds. \ - This can be an indication that a service this host needs has crashed. \ - Netdata will not send a clear notification for this alarm. - to: silent diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf deleted file mode 100644 index 65c9628b5..000000000 --- a/health/health.d/timex.conf +++ /dev/null @@ -1,18 +0,0 @@ - -# It can take several minutes before ntpd selects a server to synchronize with; -# try checking after 17 minutes (1024 seconds). 
- - alarm: system_clock_sync_state - on: system.clock_sync_state - os: linux - class: Errors - type: System -component: Clock - calc: $state - units: synchronization state - every: 10s - warn: $system.uptime.uptime > 17 * 60 AND $this == 0 - delay: down 5m - summary: System clock sync state - info: When set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server - to: silent diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf deleted file mode 100644 index dc0948403..000000000 --- a/health/health.d/udp_errors.conf +++ /dev/null @@ -1,40 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# ----------------------------------------------------------------------------- -# UDP receive buffer errors - - alarm: 1m_ipv4_udp_receive_buffer_errors - on: ipv4.udperrors - class: Errors - type: System -component: Network - os: linux freebsd - hosts: * - lookup: average -1m unaligned absolute of RcvbufErrors - units: errors - every: 10s - warn: $this > (($status >= $WARNING) ? (0) : (10)) - summary: System UDP receive buffer errors - info: Average number of UDP receive buffer errors over the last minute - delay: up 1m down 60m multiplier 1.2 max 2h - to: silent - -# ----------------------------------------------------------------------------- -# UDP send buffer errors - - alarm: 1m_ipv4_udp_send_buffer_errors - on: ipv4.udperrors - class: Errors - type: System -component: Network - os: linux - hosts: * - lookup: average -1m unaligned absolute of SndbufErrors - units: errors - every: 10s - warn: $this > (($status >= $WARNING) ? 
(0) : (10)) - summary: System UDP send buffer errors - info: Average number of UDP send buffer errors over the last minute - delay: up 1m down 60m multiplier 1.2 max 2h - to: silent diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf deleted file mode 100644 index 3c898f1d5..000000000 --- a/health/health.d/unbound.conf +++ /dev/null @@ -1,30 +0,0 @@ - -# make sure there is no overwritten/dropped queries in the request-list - - template: unbound_request_list_overwritten - on: unbound.request_list_jostle_list - class: Errors - type: DNS -component: Unbound - lookup: average -60s unaligned absolute match-names of overwritten - units: queries - every: 10s - warn: $this > 5 - delay: up 10 down 5m multiplier 1.5 max 1h - summary: Unbound overwritten queries - info: Number of overwritten queries in the request-list - to: sysadmin - - template: unbound_request_list_dropped - on: unbound.request_list_jostle_list - class: Errors - type: DNS -component: Unbound - lookup: average -60s unaligned absolute match-names of dropped - units: queries - every: 10s - warn: $this > 0 - delay: up 10 down 5m multiplier 1.5 max 1h - summary: Unbound dropped queries - info: Number of dropped queries in the request-list - to: sysadmin diff --git a/health/health.d/upsd.conf b/health/health.d/upsd.conf deleted file mode 100644 index 703a64881..000000000 --- a/health/health.d/upsd.conf +++ /dev/null @@ -1,50 +0,0 @@ -# you can disable an alarm notification by setting the 'to' line to: silent - - template: upsd_10min_ups_load - on: upsd.ups_load - class: Utilization - type: Power Supply -component: UPS - os: * - hosts: * - lookup: average -10m unaligned of load - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? 
(85) : (95)) - delay: down 10m multiplier 1.5 max 1h - summary: UPS ${label:ups_name} load - info: UPS ${label:ups_name} average load over the last 10 minutes - to: sitemgr - - template: upsd_ups_battery_charge - on: upsd.ups_battery_charge - class: Errors - type: Power Supply -component: UPS - os: * - hosts: * - lookup: average -60s unaligned of charge - units: % - every: 60s - warn: $this < 75 - crit: $this < 40 - delay: down 10m multiplier 1.5 max 1h - summary: UPS ${label:ups_name} battery charge - info: UPS ${label:ups_name} average battery charge over the last minute - to: sitemgr - - template: upsd_ups_last_collected_secs - on: upsd.ups_load - class: Latency - type: Power Supply -component: UPS device - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - summary: UPS ${label:ups_name} last collected - info: UPS ${label:ups_name} number of seconds since the last successful data collection - to: sitemgr diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf deleted file mode 100644 index 3e20bfd1e..000000000 --- a/health/health.d/vcsa.conf +++ /dev/null @@ -1,230 +0,0 @@ - -# Overall system health: -# - 0: all components are healthy. -# - 1: one or more components might become overloaded soon. -# - 2: one or more components in the appliance might be degraded. -# - 3: one or more components might be in an unusable status and the appliance might become unresponsive soon. -# - 4: no health data is available. - - template: vcsa_system_health_warn - on: vcsa.system_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $orange - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA system status - info: VCSA overall system status is orange. 
One or more components are degraded. - to: sysadmin - - template: vcsa_system_health_crit - on: vcsa.system_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $red - units: status - every: 10s - crit: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA system status - info: VCSA overall system status is red. One or more components are unavailable or will stop functioning soon. - to: sysadmin - -# Components health: -# - 0: healthy. -# - 1: healthy, but may have some problems. -# - 2: degraded, and may have serious problems. -# - 3: unavailable, or will stop functioning soon. -# - 4: no health data is available. - - template: vcsa_applmgmt_health_warn - on: vcsa.applmgmt_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $orange - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA ApplMgmt service status - info: VCSA ApplMgmt component status is orange. It is degraded, and may have serious problems. - to: silent - - template: vcsa_applmgmt_health_crit - on: vcsa.applmgmt_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $red - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA ApplMgmt service status - info: VCSA ApplMgmt component status is red. It is unavailable, or will stop functioning soon. - to: sysadmin - - template: vcsa_load_health_warn - on: vcsa.load_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $orange - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Load status - info: VCSA Load component status is orange. It is degraded, and may have serious problems. 
- to: silent - - template: vcsa_load_health_crit - on: vcsa.load_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $red - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Load status - info: VCSA Load component status is red. It is unavailable, or will stop functioning soon. - to: sysadmin - - template: vcsa_mem_health_warn - on: vcsa.mem_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $orange - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Memory status - info: VCSA Memory component status is orange. It is degraded, and may have serious problems. - to: silent - - template: vcsa_mem_health_crit - on: vcsa.mem_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $red - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Memory status - info: VCSA Memory component status is red. It is unavailable, or will stop functioning soon. - to: sysadmin - - template: vcsa_swap_health_warn - on: vcsa.swap_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $orange - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Swap status - info: VCSA Swap component status is orange. It is degraded, and may have serious problems. - to: silent - - template: vcsa_swap_health_crit - on: vcsa.swap_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $red - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Swap status - info: VCSA Swap component status is red. It is unavailable, or will stop functioning soon. 
- to: sysadmin - - template: vcsa_database_storage_health_warn - on: vcsa.database_storage_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $orange - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Database status - info: VCSA Database Storage component status is orange. It is degraded, and may have serious problems. - to: silent - - template: vcsa_database_storage_health_crit - on: vcsa.database_storage_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $red - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Database status - info: VCSA Database Storage component status is red. It is unavailable, or will stop functioning soon. - to: sysadmin - - template: vcsa_storage_health_warn - on: vcsa.storage_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $orange - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Storage status - info: VCSA Storage component status is orange. It is degraded, and may have serious problems. - to: silent - - template: vcsa_storage_health_crit - on: vcsa.storage_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $red - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA Storage status - info: VCSA Storage component status is red. It is unavailable, or will stop functioning soon. - to: sysadmin - -# Software updates health: -# - 0: no updates available. -# - 2: non-security updates are available. -# - 3: security updates are available. -# - 4: an error retrieving information on software updates. 
- - template: vcsa_software_packages_health_warn - on: vcsa.software_packages_health_status - class: Errors - type: Virtual Machine -component: VMware vCenter - calc: $orange - units: status - every: 10s - warn: $this == 1 - delay: down 1m multiplier 1.5 max 1h - summary: VCSA software status - info: VCSA software packages security updates are available. - to: silent diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf deleted file mode 100644 index 6ea9f99dc..000000000 --- a/health/health.d/vernemq.conf +++ /dev/null @@ -1,391 +0,0 @@ - -# Socket errors - - template: vernemq_socket_errors - on: vernemq.socket_errors - class: Errors - type: Messaging -component: VerneMQ - lookup: sum -1m unaligned absolute of socket_error - units: errors - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ socket errors - info: Number of socket errors in the last minute - to: sysadmin - -# Queues dropped/expired/unhandled PUBLISH messages - - template: vernemq_queue_message_drop - on: vernemq.queue_undelivered_messages - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute of queue_message_drop - units: dropped messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ dropped messages - info: Number of dropped messages due to full queues in the last minute - to: sysadmin - - template: vernemq_queue_message_expired - on: vernemq.queue_undelivered_messages - class: Latency - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute of queue_message_expired - units: expired messages - every: 1m - warn: $this > (($status >= $WARNING) ? 
(0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ expired messages - info: number of messages which expired before delivery in the last minute - to: sysadmin - - template: vernemq_queue_message_unhandled - on: vernemq.queue_undelivered_messages - class: Latency - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute of queue_message_unhandled - units: unhandled messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unhandled messages - info: Number of unhandled messages (connections with clean session=true) in the last minute - to: sysadmin - -# Erlang VM - - template: vernemq_average_scheduler_utilization - on: vernemq.average_scheduler_utilization - class: Utilization - type: Messaging -component: VerneMQ - lookup: average -10m unaligned - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: VerneMQ scheduler utilization - info: Average scheduler utilization over the last 10 minutes - to: sysadmin - -# Cluster communication and netsplits - - template: vernemq_cluster_dropped - on: vernemq.cluster_dropped - class: Errors - type: Messaging -component: VerneMQ - lookup: sum -1m unaligned - units: KiB - every: 1m - warn: $this > 0 - delay: up 5m down 5m multiplier 1.5 max 1h - summary: VerneMQ dropped traffic - info: Amount of traffic dropped during communication with the cluster nodes in the last minute - to: sysadmin - - template: vernemq_netsplits - on: vernemq.netsplits - class: Workload - type: Messaging -component: VerneMQ - lookup: sum -1m unaligned absolute of netsplit_detected - units: netsplits - every: 10s - warn: $this > 0 - delay: down 5m multiplier 1.5 max 2h - summary: VerneMQ netsplits - info: Number of detected netsplits (split brain situation) in the last minute - to: sysadmin - -# Unsuccessful 
CONNACK - - template: vernemq_mqtt_connack_sent_reason_unsuccessful - on: vernemq.mqtt_connack_sent_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful CONNACK - info: Number of sent unsuccessful v3/v5 CONNACK packets in the last minute - to: sysadmin - -# Not normal DISCONNECT - - template: vernemq_mqtt_disconnect_received_reason_not_normal - on: vernemq.mqtt_disconnect_received_reason - class: Workload - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !normal_disconnect,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ received not normal DISCONNECT - info: Number of received not normal v5 DISCONNECT packets in the last minute - to: sysadmin - - template: vernemq_mqtt_disconnect_sent_reason_not_normal - on: vernemq.mqtt_disconnect_sent_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !normal_disconnect,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ sent not normal DISCONNECT - info: Number of sent not normal v5 DISCONNECT packets in the last minute - to: sysadmin - -# SUBSCRIBE errors and unauthorized attempts - - template: vernemq_mqtt_subscribe_error - on: vernemq.mqtt_subscribe_error - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute - units: failed ops - every: 1m - warn: $this > (($status >= $WARNING) ? 
(0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ failed SUBSCRIBE - info: Number of failed v3/v5 SUBSCRIBE operations in the last minute - to: sysadmin - - template: vernemq_mqtt_subscribe_auth_error - on: vernemq.mqtt_subscribe_auth_error - class: Workload - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute - units: attempts - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unauthorized SUBSCRIBE - info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute - to: sysadmin - -# UNSUBSCRIBE errors - - template: vernemq_mqtt_unsubscribe_error - on: vernemq.mqtt_unsubscribe_error - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute - units: failed ops - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ failed UNSUBSCRIBE - info: Number of failed v3/v5 UNSUBSCRIBE operations in the last minute - to: sysadmin - -# PUBLISH errors and unauthorized attempts - - template: vernemq_mqtt_publish_errors - on: vernemq.mqtt_publish_errors - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute - units: failed ops - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ failed PUBLISH - info: Number of failed v3/v5 PUBLISH operations in the last minute - to: sysadmin - - template: vernemq_mqtt_publish_auth_errors - on: vernemq.mqtt_publish_auth_errors - class: Workload - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute - units: attempts - every: 1m - warn: $this > (($status >= $WARNING) ? 
(0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unauthorized PUBLISH - info: Number of unauthorized v3/v5 PUBLISH attempts in the last minute - to: sysadmin - -# Unsuccessful and unexpected PUBACK - - template: vernemq_mqtt_puback_received_reason_unsuccessful - on: vernemq.mqtt_puback_received_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful received PUBACK - info: Number of received unsuccessful v5 PUBACK packets in the last minute - to: sysadmin - - template: vernemq_mqtt_puback_sent_reason_unsuccessful - on: vernemq.mqtt_puback_sent_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful sent PUBACK - info: Number of sent unsuccessful v5 PUBACK packets in the last minute - to: sysadmin - - template: vernemq_mqtt_puback_unexpected - on: vernemq.mqtt_puback_invalid_error - class: Workload - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute - units: messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unnexpected recieved PUBACK - info: Number of received unexpected v3/v5 PUBACK packets in the last minute - to: sysadmin - -# Unsuccessful and unexpected PUBREC - - template: vernemq_mqtt_pubrec_received_reason_unsuccessful - on: vernemq.mqtt_pubrec_received_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? 
(0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful received PUBREC - info: Number of received unsuccessful v5 PUBREC packets in the last minute - to: sysadmin - - template: vernemq_mqtt_pubrec_sent_reason_unsuccessful - on: vernemq.mqtt_pubrec_sent_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful sent PUBREC - info: Number of sent unsuccessful v5 PUBREC packets in the last minute - to: sysadmin - - template: vernemq_mqtt_pubrec_invalid_error - on: vernemq.mqtt_pubrec_invalid_error - class: Workload - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute - units: messages - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ invalid received PUBREC - info: Number of received invalid v3 PUBREC packets in the last minute - to: sysadmin - -# Unsuccessful PUBREL - - template: vernemq_mqtt_pubrel_received_reason_unsuccessful - on: vernemq.mqtt_pubrel_received_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful received PUBREL - info: Number of received unsuccessful v5 PUBREL packets in the last minute - to: sysadmin - - template: vernemq_mqtt_pubrel_sent_reason_unsuccessful - on: vernemq.mqtt_pubrel_sent_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? 
(0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful sent PUBREL - info: number of sent unsuccessful v5 PUBREL packets in the last minute - to: sysadmin - -# Unsuccessful and unexpected PUBCOMP - - template: vernemq_mqtt_pubcomp_received_reason_unsuccessful - on: vernemq.mqtt_pubcomp_received_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful received PUBCOMP - info: Number of received unsuccessful v5 PUBCOMP packets in the last minute - to: sysadmin - - template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful - on: vernemq.mqtt_pubcomp_sent_reason - class: Errors - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute match-names of !success,* - units: packets - every: 1m - warn: $this > (($status >= $WARNING) ? (0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unsuccessful sent PUBCOMP - info: number of sent unsuccessful v5 PUBCOMP packets in the last minute - to: sysadmin - - template: vernemq_mqtt_pubcomp_unexpected - on: vernemq.mqtt_pubcomp_invalid_error - class: Workload - type: Messaging -component: VerneMQ - lookup: average -1m unaligned absolute - units: messages - every: 1m - warn: $this > (($status >= $WARNING) ? 
(0) : (5)) - delay: up 2m down 5m multiplier 1.5 max 2h - summary: VerneMQ unexpected received PUBCOMP - info: number of received unexpected v3/v5 PUBCOMP packets in the last minute - to: sysadmin diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf deleted file mode 100644 index b8ad9aee4..000000000 --- a/health/health.d/vsphere.conf +++ /dev/null @@ -1,70 +0,0 @@ - -# you can disable an alarm notification by setting the 'to' line to: silent - -# -----------------------------------------------Virtual Machine-------------------------------------------------------- - - template: vsphere_vm_cpu_utilization - on: vsphere.vm_cpu_utilization - class: Utilization - type: Virtual Machine -component: CPU - hosts: * - lookup: average -10m unaligned match-names of used - units: % - every: 20s - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: vSphere CPU utilization for VM ${label:vm} - info: CPU utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} - to: silent - - template: vsphere_vm_mem_utilization - on: vsphere.vm_mem_utilization - class: Utilization - type: Virtual Machine -component: Memory - hosts: * - calc: $used - units: % - every: 20s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: vSphere memory utilization for VM ${label:vm} - info: Memory utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} - to: silent - -# -----------------------------------------------ESXI host-------------------------------------------------------------- - - template: vsphere_host_cpu_utilization - on: vsphere.host_cpu_utilization - class: Utilization - type: Virtual Machine -component: CPU - hosts: * - lookup: average -10m unaligned match-names of used - units: % - every: 20s - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: vSphere ESXi CPU utilization for host ${label:host} - info: CPU utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} - to: sysadmin - - template: vsphere_host_mem_utilization - on: vsphere.host_mem_utilization - class: Utilization - type: Virtual Machine -component: Memory - hosts: * - calc: $used - units: % - every: 20s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: vSphere ESXi Ram utilization for host ${label:host} - info: Memory utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} - to: sysadmin diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf deleted file mode 100644 index 78f1cc7f5..000000000 --- a/health/health.d/web_log.conf +++ /dev/null @@ -1,205 +0,0 @@ - -# unmatched lines - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $1m_total_requests > 120 -# -# i.e. 
when there are at least 120 requests during the last minute - - template: web_log_1m_total_requests - on: web_log.requests - class: Workload - type: Web Server -component: Web log - lookup: sum -1m unaligned - calc: ($this == 0)?(1):($this) - units: requests - every: 10s - info: number of HTTP requests in the last minute - - template: web_log_1m_unmatched - on: web_log.excluded_requests - class: Errors - type: Web Server -component: Web log - lookup: sum -1m unaligned of unmatched - calc: $this * 100 / $web_log_1m_total_requests - units: % - every: 10s - warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 ) - delay: up 1m down 5m multiplier 1.5 max 1h - summary: Web log unparsed - info: Percentage of unparsed log lines over the last minute - to: webmaster - -# ----------------------------------------------------------------------------- -# high level response code alarms - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $1m_requests > 120 -# -# i.e. when there are at least 120 requests during the last minute - - template: web_log_1m_requests - on: web_log.type_requests - class: Workload - type: Web Server -component: Web log - lookup: sum -1m unaligned - calc: ($this == 0)?(1):($this) - units: requests - every: 10s - info: number of HTTP requests in the last minute - - template: web_log_1m_successful - on: web_log.type_requests - class: Workload - type: Web Server -component: Web log - lookup: sum -1m unaligned of success - calc: $this * 100 / $web_log_1m_requests - units: % - every: 10s - warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) - crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? 
( 85 ) : ( 75 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - summary: Web log successful - info: Ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401) - to: webmaster - - template: web_log_1m_redirects - on: web_log.type_requests - class: Workload - type: Web Server -component: Web log - lookup: sum -1m unaligned of redirect - calc: $this * 100 / $web_log_1m_requests - units: % - every: 10s - warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - summary: Web log redirects - info: Ratio of redirection HTTP requests over the last minute (3xx except 304) - to: webmaster - - template: web_log_1m_bad_requests - on: web_log.type_requests - class: Errors - type: Web Server -component: Web log - lookup: sum -1m unaligned of bad - calc: $this * 100 / $web_log_1m_requests - units: % - every: 10s - warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - summary: Web log bad requests - info: Ratio of client error HTTP requests over the last minute (4xx except 401) - to: webmaster - - template: web_log_1m_internal_errors - on: web_log.type_requests - class: Errors - type: Web Server -component: Web log - lookup: sum -1m unaligned of error - calc: $this * 100 / $web_log_1m_requests - units: % - every: 10s - warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) - crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 ) - delay: up 2m down 15m multiplier 1.5 max 1h - summary: Web log server errors - info: Ratio of server error HTTP requests over the last minute (5xx) - to: webmaster - -# ----------------------------------------------------------------------------- -# web slow - -# the following alarms trigger only when there are enough data. 
-# we assume there are enough data when: -# -# $1m_requests > 120 -# -# i.e. when there are at least 120 requests during the last minute - - template: web_log_10m_response_time - on: web_log.request_processing_time - class: Latency - type: System -component: Web log - lookup: average -10m unaligned of avg - units: ms - every: 30s - info: average HTTP response time over the last 10 minutes - - template: web_log_web_slow - on: web_log.request_processing_time - class: Latency - type: Web Server -component: Web log - lookup: average -1m unaligned of avg - units: ms - every: 10s - green: 500 - red: 1000 - warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 ) - crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 ) - delay: down 15m multiplier 1.5 max 1h - summary: Web log processing time - info: Average HTTP response time over the last 1 minute - options: no-clear-notification - to: webmaster - -# ----------------------------------------------------------------------------- -# web too many or too few requests - -# the following alarms trigger only when there are enough data. -# we assume there are enough data when: -# -# $5m_successful_old > 120 -# -# i.e. 
when there were at least 120 requests during the 5 minutes starting -# at -10m and ending at -5m - - template: web_log_5m_successful_old - on: web_log.type_requests - class: Workload - type: Web Server -component: Web log - lookup: average -5m at -5m unaligned of success - units: requests/s - every: 30s - info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago - - template: web_log_5m_successful - on: web_log.type_requests - class: Workload - type: Web Server -component: Web log - lookup: average -5m unaligned of success - units: requests/s - every: 30s - info: average number of successful HTTP requests over the last 5 minutes - - template: web_log_5m_requests_ratio - on: web_log.type_requests - class: Workload - type: Web Server -component: Web log - calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100) - units: % - every: 30s - warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0) - crit: ($web_log_5m_successful_old > 120) ? 
($this > 400 OR $this < 25) : (0) - delay: down 15m multiplier 1.5 max 1h - options: no-clear-notification - summary: Web log 5 minutes requests ratio - info: Ratio of successful HTTP requests over over the last 5 minutes, \ - compared with the previous 5 minutes \ - (clear notification for this alarm will not be sent) - to: webmaster diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf deleted file mode 100644 index 0a328b592..000000000 --- a/health/health.d/whoisquery.conf +++ /dev/null @@ -1,14 +0,0 @@ - - template: whoisquery_days_until_expiration - on: whoisquery.time_until_expiration - class: Utilization - type: Other -component: WHOIS - calc: $expiry - units: seconds - every: 60s - warn: $this < $days_until_expiration_warning*24*60*60 - crit: $this < $days_until_expiration_critical*24*60*60 - summary: Whois expiration time for domain ${label:domain} - info: Time until the domain name registration for ${label:domain} expires - to: webmaster diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf deleted file mode 100644 index 706fcbf22..000000000 --- a/health/health.d/windows.conf +++ /dev/null @@ -1,126 +0,0 @@ - -## CPU - - template: windows_10min_cpu_usage - on: windows.cpu_utilization_total - class: Utilization - type: Windows -component: CPU - os: * - hosts: * - lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt - units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) - delay: down 15m multiplier 1.5 max 1h - summary: CPU utilization - info: Average CPU utilization over the last 10 minutes - to: silent - - -## Memory - - template: windows_ram_in_use - on: windows.memory_utilization - class: Utilization - type: Windows -component: Memory - os: * - hosts: * - calc: ($used) * 100 / ($used + $available) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? 
(80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Ram utilization - info: Memory utilization - to: sysadmin - - -## Network - - template: windows_inbound_packets_discarded - on: windows.net_nic_discarded - class: Errors - type: Windows -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute match-names of inbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: Inbound network packets discarded - info: Number of inbound discarded packets for the network interface in the last 10 minutes - to: silent - - template: windows_outbound_packets_discarded - on: windows.net_nic_discarded - class: Errors - type: Windows -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute match-names of outbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: Outbound network packets discarded - info: Number of outbound discarded packets for the network interface in the last 10 minutes - to: silent - - template: windows_inbound_packets_errors - on: windows.net_nic_errors - class: Errors - type: Windows -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute match-names of inbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: Inbound network errors - info: Number of inbound errors for the network interface in the last 10 minutes - to: silent - - template: windows_outbound_packets_errors - on: windows.net_nic_errors - class: Errors - type: Windows -component: Network - os: * - hosts: * - lookup: sum -10m unaligned absolute match-names of outbound - units: packets - every: 1m - warn: $this >= 5 - delay: down 1h multiplier 1.5 max 2h - summary: Outbound network errors - info: Number of outbound errors for the network interface in the last 10 minutes - to: silent - - -## Disk - - template: windows_disk_in_use - on: 
windows.logical_disk_space_usage - class: Utilization - type: Windows -component: Disk - os: * - hosts: * - calc: ($used) * 100 / ($used + $free) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) - delay: down 15m multiplier 1.5 max 1h - summary: Disk space usage - info: Disk space utilization - to: sysadmin diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf deleted file mode 100644 index d05f3ef0f..000000000 --- a/health/health.d/x509check.conf +++ /dev/null @@ -1,26 +0,0 @@ - - template: x509check_days_until_expiration - on: x509check.time_until_expiration - class: Latency - type: Certificates -component: x509 certificates - calc: $expiry - units: seconds - every: 60s - warn: $this < $days_until_expiration_warning*24*60*60 - crit: $this < $days_until_expiration_critical*24*60*60 - summary: x509 certificate expiration for ${label:source} - info: Time until x509 certificate expires for ${label:source} - to: webmaster - - template: x509check_revocation_status - on: x509check.revocation_status - class: Errors - type: Certificates -component: x509 certificates - calc: $revoked - every: 60s - crit: $this != nan AND $this != 0 - summary: x509 certificate revocation status for ${label:source} - info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source} - to: webmaster diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf deleted file mode 100644 index d2a561000..000000000 --- a/health/health.d/zfs.conf +++ /dev/null @@ -1,44 +0,0 @@ - - alarm: zfs_memory_throttle - on: zfs.memory_ops - class: Utilization - type: System -component: File system - lookup: sum -10m unaligned absolute of throttled - units: events - every: 1m - warn: $this > 0 - delay: down 1h multiplier 1.5 max 2h - summary: ZFS ARC growth throttling - info: number of times ZFS had to limit the ARC growth in the last 10 minutes - to: silent - -# ZFS pool state - - 
template: zfs_pool_state_warn - on: zfspool.state - class: Errors - type: System -component: File system - calc: $degraded - units: boolean - every: 10s - warn: $this > 0 - delay: down 1m multiplier 1.5 max 1h - summary: ZFS pool ${label:pool} state - info: ZFS pool ${label:pool} state is degraded - to: sysadmin - - template: zfs_pool_state_crit - on: zfspool.state - class: Errors - type: System -component: File system - calc: $faulted + $unavail - units: boolean - every: 10s - crit: $this > 0 - delay: down 1m multiplier 1.5 max 1h - summary: Critical ZFS pool ${label:pool} state - info: ZFS pool ${label:pool} state is faulted or unavail - to: sysadmin |