diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 02:57:58 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-19 02:57:58 +0000 |
commit | be1c7e50e1e8809ea56f2c9d472eccd8ffd73a97 (patch) | |
tree | 9754ff1ca740f6346cf8483ec915d4054bc5da2d /health/health.d | |
parent | Initial commit. (diff) | |
download | netdata-be1c7e50e1e8809ea56f2c9d472eccd8ffd73a97.tar.xz netdata-be1c7e50e1e8809ea56f2c9d472eccd8ffd73a97.zip |
Adding upstream version 1.44.3.upstream/1.44.3upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
79 files changed, 5137 insertions, 0 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf new file mode 100644 index 00000000..1f184049 --- /dev/null +++ b/health/health.d/adaptec_raid.conf @@ -0,0 +1,32 @@ + +# logical device status check + + template: adaptec_raid_ld_status + on: adaptec_raid.ld_status + class: Errors + type: System +component: RAID + lookup: max -10s foreach * + units: bool + every: 10s + crit: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: Adaptec raid logical device status + info: Logical device status is failed or degraded + to: sysadmin + +# physical device state check + + template: adaptec_raid_pd_state + on: adaptec_raid.pd_state + class: Errors + type: System +component: RAID + lookup: max -10s foreach * + units: bool + every: 10s + crit: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: Adaptec raid physical device state + info: Physical device state is not online + to: sysadmin diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf new file mode 100644 index 00000000..269ae544 --- /dev/null +++ b/health/health.d/anomalies.conf @@ -0,0 +1,23 @@ +# raise a warning alarm if an anomaly probability is consistently above 50% + + template: anomalies_anomaly_probabilities + on: anomalies.probability + class: Errors + type: Netdata +component: ML + lookup: average -2m foreach * + every: 1m + warn: $this > 50 + info: average anomaly probability over the last 2 minutes + +# raise a warning alarm if an anomaly flag is consistently firing + + template: anomalies_anomaly_flags + on: anomalies.anomaly + class: Errors + type: Netdata +component: ML + lookup: sum -2m foreach * + every: 1m + warn: $this > 10 + info: number of anomalies in the last 2 minutes diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf new file mode 100644 index 00000000..90a72af1 --- /dev/null +++ b/health/health.d/apcupsd.conf @@ -0,0 +1,125 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: apcupsd_10min_ups_load + on: apcupsd.load + class: Utilization + type: Power Supply +component: UPS + os: * + hosts: * + lookup: average -10m unaligned of percentage + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + delay: down 10m multiplier 1.5 max 1h + summary: APC UPS load + info: APC UPS average load over the last 10 minutes + to: sitemgr + +# Discussion in https://github.com/netdata/netdata/pull/3928: +# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full. + template: apcupsd_ups_charge + on: apcupsd.charge + class: Errors + type: Power Supply +component: UPS + os: * + hosts: * + lookup: average -60s unaligned of charge + units: % + every: 60s + warn: $this < 100 + crit: $this < 40 + delay: down 10m multiplier 1.5 max 1h + summary: APC UPS battery charge + info: APC UPS average battery charge over the last minute + to: sitemgr + + template: apcupsd_last_collected_secs + on: apcupsd.load + class: Latency + type: Power Supply +component: UPS device + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: APC UPS last collection + info: APC UPS number of seconds since the last successful data collection + to: sitemgr + +#Send out a warning when SELFTEST code is BT or NG. Code descriptions can be found at: +#http://www.apcupsd.org/manual/#:~:text=or%20N/A.-,SELFTEST,-The%20results%20of + template: apcupsd_selftest_warning + on: apcupsd.selftest + lookup: max -1s unaligned match-names of BT,NG + units: status + every: 10s + warn: $this == 1 + delay: up 0 down 15m multiplier 1.5 max 1h + info: APC UPS self-test failed due to insufficient battery capacity or due to overload. + to: sitemgr + +#Send out a warning when STATUS code is ONBATT,OVERLOAD,LOWBATT,REPLACEBATT,NOBATT,COMMLOST +#https://man.archlinux.org/man/apcaccess.8.en#:~:text=apcupsd%20was%20started-,STATUS,-%3A%20UPS%20status.%20One + + template: apcupsd_status_onbatt + on: apcupsd.status + lookup: max -1s unaligned match-names of ONBATT + units: status + every: 10s + warn: $this == 1 + delay: up 1m down 15m multiplier 1.5 max 1h + info: APC UPS has switched to battery power because the input power has failed + to: sitemgr + + template: apcupsd_status_overload + on: apcupsd.status + lookup: max -1s unaligned match-names of OVERLOAD + units: status + every: 10s + warn: $this == 1 + delay: up 0 down 15m multiplier 1.5 max 1h + info: APC UPS is overloaded and cannot supply enough power to the load + to: sitemgr + + template: apcupsd_status_lowbatt + on: apcupsd.status + lookup: max -1s unaligned match-names of LOWBATT + units: status + every: 10s + warn: $this == 1 + delay: up 0 down 15m multiplier 1.5 max 1h + info: APC UPS battery is low and needs to be recharged + to: sitemgr + + template: apcupsd_status_replacebatt + on: apcupsd.status + lookup: max -1s unaligned match-names of REPLACEBATT + units: status + every: 10s + warn: $this == 1 + delay: up 0 down 15m multiplier 1.5 max 1h + info: APC UPS battery has reached the end of its lifespan and needs to be replaced + to: sitemgr + + template: apcupsd_status_nobatt + on: apcupsd.status + lookup: max -1s unaligned match-names of NOBATT + units: status + every: 10s + warn: $this == 1 + delay: up 0 down 15m multiplier 1.5 max 1h + info: APC UPS has no battery + to: sitemgr + + template: apcupsd_status_commlost + on: apcupsd.status + lookup: max -1s unaligned match-names of COMMLOST + units: status + every: 10s + warn: $this == 1 + delay: up 0 down 15m multiplier 1.5 max 1h + info: APC UPS communication link is lost + to: sitemgr diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf new file mode 100644 index 00000000..44617342 --- /dev/null +++ b/health/health.d/bcache.conf @@ -0,0 +1,31 @@ + + template: bcache_cache_errors + on: disk.bcache_cache_read_races + class: Errors + type: System +component: Disk + lookup: sum -1m unaligned absolute + units: errors + every: 1m + warn: $this > 0 + delay: up 2m down 1h multiplier 1.5 max 2h + summary: Bcache cache read race errors + info: Number of times data was read from the cache, \ + the bucket was reused and invalidated in the last 10 minutes \ + (when this occurs the data is reread from the backing device) + to: silent + + template: bcache_cache_dirty + on: disk.bcache_cache_alloc + class: Utilization + type: System +component: Disk + calc: $dirty + $metadata + $undefined + units: % + every: 1m + warn: $this > 75 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: Bcache cache used space + info: Percentage of cache space used for dirty data and metadata \ + (this usually means your SSD cache is too small) + to: silent diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf new file mode 100644 index 00000000..0d37f28e --- /dev/null +++ b/health/health.d/beanstalkd.conf @@ -0,0 +1,41 @@ +# get the number of buried jobs in all queues + + template: beanstalk_server_buried_jobs + on: beanstalk.current_jobs + class: Workload + type: Messaging +component: Beanstalk + calc: $buried + units: jobs + every: 10s + warn: $this > 3 + delay: up 0 down 5m multiplier 1.2 max 1h + summary: Beanstalk buried jobs + info: Number of buried jobs across all tubes. \ + You need to manually kick them so they can be processed. \ + Presence of buried jobs in a tube does not affect new jobs. + to: sysadmin + +# get the number of buried jobs per queue + +#template: beanstalk_tube_buried_jobs +# on: beanstalk.jobs +# calc: $buried +# units: jobs +# every: 10s +# warn: $this > 0 +# crit: $this > 10 +# delay: up 0 down 5m multiplier 1.2 max 1h +# info: the number of jobs buried per tube +# to: sysadmin + +# get the current number of tubes + +#template: beanstalk_number_of_tubes +# on: beanstalk.current_tubes +# calc: $tubes +# every: 10s +# warn: $this < 5 +# delay: up 0 down 5m multiplier 1.2 max 1h +# info: the current number of tubes on the server +# to: sysadmin diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf new file mode 100644 index 00000000..b1c271df --- /dev/null +++ b/health/health.d/bind_rndc.conf @@ -0,0 +1,12 @@ + template: bind_rndc_stats_file_size + on: bind_rndc.stats_size + class: Utilization + type: DNS +component: BIND + units: megabytes + every: 60 + calc: $stats_size + warn: $this > 512 + summary: BIND statistics file size + info: BIND statistics-file size + to: sysadmin diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf new file mode 100644 index 00000000..092a5684 --- /dev/null +++ b/health/health.d/boinc.conf @@ -0,0 +1,70 @@ +# Alarms for various BOINC issues. + +# Warn on any compute errors encountered. + template: boinc_compute_errors + on: boinc.states + class: Errors + type: Computing +component: BOINC + os: * + hosts: * + lookup: average -10m unaligned of comperror + units: tasks + every: 1m + warn: $this > 0 + delay: up 1m down 5m multiplier 1.5 max 1h + summary: BOINC compute errors + info: Average number of compute errors over the last 10 minutes + to: sysadmin + +# Warn on lots of upload errors + template: boinc_upload_errors + on: boinc.states + class: Errors + type: Computing +component: BOINC + os: * + hosts: * + lookup: average -10m unaligned of upload_failed + units: tasks + every: 1m + warn: $this > 0 + delay: up 1m down 5m multiplier 1.5 max 1h + summary: BOINC failed uploads + info: Average number of failed uploads over the last 10 minutes + to: sysadmin + +# Warn on the task queue being empty + template: boinc_total_tasks + on: boinc.tasks + class: Utilization + type: Computing +component: BOINC + os: * + hosts: * + lookup: average -10m unaligned of total + units: tasks + every: 1m + warn: $this < 1 + delay: up 5m down 10m multiplier 1.5 max 1h + summary: BOINC total tasks + info: Average number of total tasks over the last 10 minutes + to: sysadmin + +# Warn on no active tasks with a non-empty queue + template: boinc_active_tasks + on: boinc.tasks + class: Utilization + type: Computing +component: BOINC + os: * + hosts: * + lookup: average -10m unaligned of active + calc: ($boinc_total_tasks >= 1) ? ($this) : (inf) + units: tasks + every: 1m + warn: $this < 1 + delay: up 5m down 10m multiplier 1.5 max 1h + summary: BOINC active tasks + info: Average number of active tasks over the last 10 minutes + to: sysadmin diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf new file mode 100644 index 00000000..1557a594 --- /dev/null +++ b/health/health.d/btrfs.conf @@ -0,0 +1,142 @@ + + template: btrfs_allocated + on: btrfs.disk + class: Utilization + type: System +component: File system + os: * + hosts: * + calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) + units: % + every: 10s + warn: $this > (($status == $CRITICAL) ? (95) : (98)) + delay: up 1m down 15m multiplier 1.5 max 1h + summary: BTRFS allocated space utilization + info: Percentage of allocated BTRFS physical disk space + to: silent + + template: btrfs_data + on: btrfs.data + class: Utilization + type: System +component: File system + os: * + hosts: * + calc: $used * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 + crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: BTRFS data space utilization + info: Utilization of BTRFS data space + to: sysadmin + + template: btrfs_metadata + on: btrfs.metadata + class: Utilization + type: System +component: File system + os: * + hosts: * + calc: ($used + $reserved) * 100 / ($used + $free + $reserved) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 + crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: BTRFS metadata space utilization + info: Utilization of BTRFS metadata space + to: sysadmin + + template: btrfs_system + on: btrfs.system + class: Utilization + type: System +component: File system + os: * + hosts: * + calc: $used * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (90) : (95)) && $btrfs_allocated > 98 + crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: BTRFS system space utilization + info: Utilization of BTRFS system space + to: sysadmin + + template: btrfs_device_read_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + units: errors + lookup: max -10m every 1m of read_errs + warn: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: BTRFS device read errors + info: Number of encountered BTRFS read errors + to: sysadmin + + template: btrfs_device_write_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + units: errors + lookup: max -10m every 1m of write_errs + crit: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: BTRFS device write errors + info: Number of encountered BTRFS write errors + to: sysadmin + + template: btrfs_device_flush_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + units: errors + lookup: max -10m every 1m of flush_errs + crit: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: BTRFS device flush errors + info: Number of encountered BTRFS flush errors + to: sysadmin + + template: btrfs_device_corruption_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + units: errors + lookup: max -10m every 1m of corruption_errs + warn: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: BTRFS device corruption errors + info: Number of encountered BTRFS corruption errors + to: sysadmin + + template: btrfs_device_generation_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + units: errors + lookup: max -10m every 1m of generation_errs + warn: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: BTRFS device generation errors + info: Number of encountered BTRFS generation errors + to: sysadmin diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf new file mode 100644 index 00000000..44d35133 --- /dev/null +++ b/health/health.d/ceph.conf @@ -0,0 +1,16 @@ +# low ceph disk available + + template: ceph_cluster_space_usage + on: ceph.general_usage + class: Utilization + type: Storage +component: Ceph + calc: $used * 100 / ($used + $avail) + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 5m multiplier 1.2 max 1h + summary: Ceph cluster disk space utilization + info: Ceph cluster disk space utilization + to: sysadmin diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf new file mode 100644 index 00000000..9c55633e --- /dev/null +++ b/health/health.d/cgroups.conf @@ -0,0 +1,72 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + template: cgroup_10min_cpu_usage + on: cgroup.cpu_limit + class: Utilization + type: Cgroups +component: CPU + os: linux + hosts: * + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: Cgroup ${label:cgroup_name} CPU utilization + info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes + to: silent + + template: cgroup_ram_in_use + on: cgroup.mem_usage + class: Utilization + type: Cgroups +component: Memory + os: linux + hosts: * + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Cgroup ${label:cgroup_name} memory utilization + info: Cgroup ${label:cgroup_name} memory utilization + to: silent + +# ---------------------------------K8s containers-------------------------------------------- + + template: k8s_cgroup_10min_cpu_usage + on: k8s.cgroup.cpu_limit + class: Utilization + type: Cgroups +component: CPU + os: linux + hosts: * + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + delay: down 15m multiplier 1.5 max 1h + summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization + info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + average CPU utilization over the last 10 minutes + to: silent + + template: k8s_cgroup_ram_in_use + on: k8s.cgroup.mem_usage + class: Utilization + type: Cgroups +component: Memory + os: linux + hosts: * + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + memory utilization + to: silent diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf new file mode 100644 index 00000000..60f17835 --- /dev/null +++ b/health/health.d/cockroachdb.conf @@ -0,0 +1,78 @@ + +# Capacity + + template: cockroachdb_used_storage_capacity + on: cockroachdb.storage_used_capacity_percentage + class: Utilization + type: Database +component: CockroachDB + calc: $total + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: CockroachDB storage space utilization + info: Storage capacity utilization + to: dba + + template: cockroachdb_used_usable_storage_capacity + on: cockroachdb.storage_used_capacity_percentage + class: Utilization + type: Database +component: CockroachDB + calc: $usable + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: CockroachDB usable storage space utilization + info: Storage usable space utilization + to: dba + +# Replication + + template: cockroachdb_unavailable_ranges + on: cockroachdb.ranges_replication_problem + class: Errors + type: Database +component: CockroachDB + calc: $unavailable + units: num + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + summary: CockroachDB unavailable replication + info: Number of ranges with fewer live replicas than needed for quorum + to: dba + + template: cockroachdb_underreplicated_ranges + on: cockroachdb.ranges_replication_problem + class: Errors + type: Database +component: CockroachDB + calc: $under_replicated + units: num + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + summary: CockroachDB under-replicated + info: Number of ranges with fewer live replicas than the replication target + to: dba + +# FD + + template: cockroachdb_open_file_descriptors_limit + on: cockroachdb.process_file_descriptors + class: Utilization + type: Database +component: CockroachDB + calc: $open/$sys_fd_softlimit * 100 + units: % + every: 10s + warn: $this > 80 + delay: down 15m multiplier 1.5 max 1h + summary: CockroachDB file descriptors utilization + info: Open file descriptors utilization (against softlimit) + to: dba diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf new file mode 100644 index 00000000..8b414a26 --- /dev/null +++ b/health/health.d/consul.conf @@ -0,0 +1,171 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: consul_license_expiration_time + on: consul.license_expiration_time + class: Errors + type: ServiceMesh +component: Consul + calc: $license_expiration + every: 60m + units: seconds + warn: $this < 14*24*60*60 + crit: $this < 7*24*60*60 + summary: Consul license expiration on ${label:node_name} + info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_autopilot_health_status + on: consul.autopilot_health_status + class: Errors + type: ServiceMesh +component: Consul + calc: $unhealthy + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: Consul datacenter ${label:datacenter} health + info: Datacenter ${label:datacenter} cluster is unhealthy as reported by server ${label:node_name} + to: sysadmin + + template: consul_autopilot_server_health_status + on: consul.autopilot_server_health_status + class: Errors + type: ServiceMesh +component: Consul + calc: $unhealthy + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} health + info: Server ${label:node_name} from datacenter ${label:datacenter} is unhealthy + to: sysadmin + + template: consul_raft_leader_last_contact_time + on: consul.raft_leader_last_contact_time + class: Errors + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.5 + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (150) : (200)) + crit: $this > (($status == $CRITICAL) ? (200) : (500)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul leader server ${label:node_name} last contact time + info: Median time elapsed since leader server ${label:node_name} datacenter ${label:datacenter} was last able to contact the follower nodes + to: sysadmin + + template: consul_raft_leadership_transitions + on: consul.raft_leadership_transitions_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: transitions + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} leadership transitions + info: There has been a leadership change and server ${label:node_name} datacenter ${label:datacenter} has become the leader + to: sysadmin + + template: consul_raft_thread_main_saturation + on: consul.raft_thread_main_saturation_perc + class: Utilization + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.9 + every: 10s + units: percentage + warn: $this > (($status >= $WARNING) ? (40) : (50)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} main Raft saturation + info: Average saturation of the main Raft goroutine on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_raft_thread_fsm_saturation + on: consul.raft_thread_fsm_saturation_perc + class: Utilization + type: ServiceMesh +component: Consul + lookup: average -1m unaligned of quantile_0.9 + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (40) : (50)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} FSM Raft saturation + info: Average saturation of the FSM Raft goroutine on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_client_rpc_requests_exceeded + on: consul.client_rpc_requests_exceeded_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: requests + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} RPC requests rate + info: Number of rate-limited RPC requests made by server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_client_rpc_requests_failed + on: consul.client_rpc_requests_failed_rate + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: requests + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} failed RPC requests + info: number of failed RPC requests made by server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_node_health_check_status + on: consul.node_health_check_status + class: Errors + type: ServiceMesh +component: Consul + calc: $warning + $critical + every: 10s + units: status + warn: $this != nan AND $this != 0 + delay: down 5m multiplier 1.5 max 1h + summary: Consul node health check ${label:check_name} on ${label:node_name} + info: Node health check ${label:check_name} has failed on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_service_health_check_status + on: consul.service_health_check_status + class: Errors + type: ServiceMesh +component: Consul + calc: $warning + $critical + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: Consul service health check ${label:check_name} service ${label:service_name} node ${label:node_name} + info: Service health check ${label:check_name} for service ${label:service_name} has failed on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin + + template: consul_gc_pause_time + on: consul.gc_pause_time + class: Errors + type: ServiceMesh +component: Consul + lookup: sum -1m unaligned + every: 10s + units: seconds + warn: $this > (($status >= $WARNING) ? (1) : (2)) + crit: $this > (($status >= $WARNING) ? (2) : (5)) + delay: down 5m multiplier 1.5 max 1h + summary: Consul server ${label:node_name} garbage collection pauses + info: Time spent in stop-the-world garbage collection pauses on server ${label:node_name} datacenter ${label:datacenter} + to: sysadmin diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf new file mode 100644 index 00000000..0b007d6b --- /dev/null +++ b/health/health.d/cpu.conf @@ -0,0 +1,69 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + template: 10min_cpu_usage + on: system.cpu + class: Utilization + type: System +component: CPU + os: linux + hosts: * + lookup: average -10m unaligned of user,system,softirq,irq,guest + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: System CPU utilization + info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) + to: silent + + template: 10min_cpu_iowait + on: system.cpu + class: Utilization + type: System +component: CPU + os: linux + hosts: * + lookup: average -10m unaligned of iowait + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (20) : (40)) + delay: up 30m down 30m multiplier 1.5 max 2h + summary: System CPU iowait time + info: Average CPU iowait time over the last 10 minutes + to: silent + + template: 20min_steal_cpu + on: system.cpu + class: Latency + type: System +component: CPU + os: linux + hosts: * + lookup: average -20m unaligned of steal + units: % + every: 5m + warn: $this > (($status >= $WARNING) ? (5) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System CPU steal time + info: Average CPU steal time over the last 20 minutes + to: silent + +## FreeBSD + template: 10min_cpu_usage + on: system.cpu + class: Utilization + type: System +component: CPU + os: freebsd + hosts: * + lookup: average -10m unaligned of user,system,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: System CPU utilization + info: Average CPU utilization over the last 10 minutes (excluding nice) + to: silent diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf new file mode 100644 index 00000000..0a70d2e8 --- /dev/null +++ b/health/health.d/dbengine.conf @@ -0,0 +1,68 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 10min_dbengine_global_fs_errors + on: netdata.dbengine_global_errors + class: Errors + type: Netdata +component: DB engine + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of fs_errors + units: errors + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + summary: Netdata DBengine filesystem errors + info: Number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc) + to: sysadmin + + alarm: 10min_dbengine_global_io_errors + on: netdata.dbengine_global_errors + class: Errors + type: Netdata +component: DB engine + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of io_errors + units: errors + every: 10s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + summary: Netdata DBengine IO errors + info: Number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc) + to: sysadmin + + alarm: 10min_dbengine_global_flushing_warnings + on: netdata.dbengine_global_errors + class: Errors + type: Netdata +component: DB engine + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of pg_cache_over_half_dirty_events + units: errors + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 3h + summary: Netdata DBengine global flushing warnings + info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \ + Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks. + to: sysadmin + + alarm: 10min_dbengine_global_flushing_errors + on: netdata.dbengine_long_term_page_stats + class: Errors + type: Netdata +component: DB engine + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of flushing_pressure_deletions + units: pages + every: 10s + crit: $this != 0 + delay: down 1h multiplier 1.5 max 3h + summary: Netdata DBengine global flushing errors + info: Number of pages deleted due to failure to flush data to disk in the last 10 minutes. \ + Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks. + to: sysadmin diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf new file mode 100644 index 00000000..2e417fd4 --- /dev/null +++ b/health/health.d/disks.conf @@ -0,0 +1,172 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + +# ----------------------------------------------------------------------------- +# low disk space + +# checking the latest collected values +# raise an alarm if the disk is low on +# available disk space + + template: disk_space_usage + on: disk.space + class: Utilization + type: System +component: Disk + os: linux freebsd + hosts: * +chart labels: mount_point=!/dev !/dev/* !/run !/run/* * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (80) : (90)) + crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: Disk ${label:mount_point} space usage + info: Total space utilization of disk ${label:mount_point} + to: sysadmin + + template: disk_inode_usage + on: disk.inodes + class: Utilization + type: System +component: Disk + os: linux freebsd + hosts: * +chart labels: mount_point=!/dev !/dev/* !/run !/run/* * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: up 1m down 15m multiplier 1.5 max 1h + summary: Disk ${label:mount_point} inode usage + info: Total inode utilization of disk ${label:mount_point} + to: sysadmin + + +# ----------------------------------------------------------------------------- +# disk fill rate + +# calculate the rate the disk fills +# use as base, the available space change +# during the last hour + +# this is just a calculation - it has no alarm +# we will use it in the next template to find +# the hours remaining + +template: disk_fill_rate + on: disk.space + os: linux freebsd + hosts: * + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: GB/hour + info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour + +# calculate the hours remaining +# if the disk continues to fill +# in this rate + +template: out_of_disk_space_time + on: disk.space + os: linux freebsd + hosts: * + calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:mount_point} estimation of lack of space + info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour + to: silent + + +# ----------------------------------------------------------------------------- +# disk inode fill rate + +# calculate the rate the disk inodes are allocated +# use as base, the available inodes change +# during the last hour + +# this is just a calculation - it has no alarm +# we will use it in the next template to find +# the hours remaining + +template: disk_inode_rate + on: disk.inodes + os: linux freebsd + hosts: * + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: inodes/hour + info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour + +# calculate the hours remaining +# if the disk inodes are allocated +# in this rate + +template: out_of_disk_inodes_time + on: disk.inodes + os: linux freebsd + hosts: * + calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:mount_point} estimation of lack of inodes + info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour + to: silent + + +# ----------------------------------------------------------------------------- +# disk congestion + +# raise an alarm if the disk is congested +# by calculating the average disk utilization +# for the last 10 minutes + + template: 10min_disk_utilization + on: disk.util + class: Utilization + type: System +component: Disk + os: linux freebsd + hosts: * + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:device} utilization + info: Average percentage of time ${label:device} disk was busy over the last 10 minutes + to: silent + + +# raise an alarm if the disk backlog +# is above 1000ms (1s) per second +# for 10 minutes +# (i.e. the disk cannot catch up) + + template: 10min_disk_backlog + on: disk.backlog + class: Latency + type: System +component: Disk + os: linux + hosts: * + lookup: average -10m unaligned + units: ms + every: 1m + warn: $this > 5000 * (($status >= $WARNING) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:device} backlog + info: Average backlog size of the ${label:device} disk over the last 10 minutes + to: silent diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf new file mode 100644 index 00000000..756c6a1b --- /dev/null +++ b/health/health.d/dns_query.conf @@ -0,0 +1,15 @@ +# detect dns query failure + + template: dns_query_query_status + on: dns_query.query_status + class: Errors + type: DNS +component: DNS + calc: $success + units: status + every: 10s + warn: $this != nan && $this != 1 + delay: up 30s down 5m multiplier 1.5 max 1h + summary: DNS query unsuccessful requests to ${label:server} + info: DNS request type ${label:record_type} to server ${label:server} is unsuccessful + to: sysadmin diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf new file mode 100644 index 00000000..f6ef0194 --- /dev/null +++ b/health/health.d/dnsmasq_dhcp.conf @@ -0,0 +1,15 @@ +# dhcp-range utilization + + template: dnsmasq_dhcp_dhcp_range_utilization + on: dnsmasq_dhcp.dhcp_range_utilization + class: Utilization + type: DHCP +component: Dnsmasq + every: 10s + units: % + calc: $used + warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) + delay: down 5m + summary: Dnsmasq DHCP range ${label:dhcp_range} utilization + info: DHCP range ${label:dhcp_range} utilization + to: sysadmin diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf new file mode 100644 index 00000000..668614d4 --- /dev/null +++ b/health/health.d/docker.conf @@ -0,0 +1,12 @@ + template: docker_container_unhealthy + on: docker.container_health_status + class: Errors + type: Containers +component: Docker + units: status + every: 10s + lookup: average -10s of unhealthy + warn: $this > 0 + summary: Docker container ${label:container_name} health + info: ${label:container_name} docker container health status is unhealthy + to: sysadmin diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf new file mode 100644 index 00000000..600840c5 --- /dev/null +++ b/health/health.d/elasticsearch.conf @@ -0,0 +1,78 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# 'red' is a threshold, can't lookup the 'red' dimension - using simple pattern is a workaround. + + template: elasticsearch_cluster_health_status_red + on: elasticsearch.cluster_health_status + class: Errors + type: SearchEngine +component: Elasticsearch + lookup: average -5s unaligned of *ed + every: 10s + units: status + crit: $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: Elasticsearch cluster ${label:cluster_name} status + info: Elasticsearch cluster ${label:cluster_name} health status is red. + to: sysadmin + +# the idea of '-10m' is to handle yellow status after node restart, +# (usually) no action is required because Elasticsearch will automatically restore the green status. + template: elasticsearch_cluster_health_status_yellow + on: elasticsearch.cluster_health_status + class: Errors + type: SearchEngine +component: Elasticsearch + lookup: average -10m unaligned of yellow + every: 1m + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: Elasticsearch cluster ${label:cluster_name} status + info: Elasticsearch cluster ${label:cluster_name} health status is yellow. + to: sysadmin + + template: elasticsearch_node_index_health_red + on: elasticsearch.node_index_health + class: Errors + type: SearchEngine +component: Elasticsearch + lookup: average -5s unaligned of *ed + every: 10s + units: status + warn: $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: Elasticsearch cluster ${label:cluster_name} index ${label:index} status + info: Elasticsearch cluster ${label:cluster_name} index ${label:index} health status is red. + to: sysadmin + +# don't convert 'lookup' value to seconds in 'calc' due to UI showing seconds as hh:mm:ss (0 as now). + + template: elasticsearch_node_indices_search_time_query + on: elasticsearch.node_indices_search_time + class: Workload + type: SearchEngine +component: Elasticsearch + lookup: average -10m unaligned of query + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (20 * 1000) : (30 * 1000)) + delay: down 5m multiplier 1.5 max 1h + summary: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} query performance + info: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} search performance is degraded, queries run slowly. + to: sysadmin + + template: elasticsearch_node_indices_search_time_fetch + on: elasticsearch.node_indices_search_time + class: Workload + type: SearchEngine +component: Elasticsearch + lookup: average -10m unaligned of fetch + every: 10s + units: milliseconds + warn: $this > (($status >= $WARNING) ? (3 * 1000) : (5 * 1000)) + crit: $this > (($status == $CRITICAL) ? (5 * 1000) : (30 * 1000)) + delay: down 5m multiplier 1.5 max 1h + summary: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} fetch performance + info: Elasticsearch cluster ${label:cluster_name} node ${label:node_name} search performance is degraded, fetches run slowly. + to: sysadmin diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf new file mode 100644 index 00000000..be8b1fe4 --- /dev/null +++ b/health/health.d/entropy.conf @@ -0,0 +1,20 @@ + +# check if entropy is too low +# the alarm is checked every 1 minute +# and examines the last hour of data + + alarm: lowest_entropy + on: system.entropy + class: Utilization + type: System +component: Cryptography + os: linux + hosts: * + lookup: min -5m unaligned + units: entries + every: 5m + warn: $this < (($status >= $WARNING) ? (200) : (100)) + delay: down 1h multiplier 1.5 max 2h + summary: System entropy pool number of entries + info: Minimum number of entries in the random numbers pool in the last 5 minutes + to: silent diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf new file mode 100644 index 00000000..c0320193 --- /dev/null +++ b/health/health.d/exporting.conf @@ -0,0 +1,29 @@ + + template: exporting_last_buffering + on: netdata.exporting_data_size + class: Latency + type: Netdata +component: Exporting engine + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: Netdata exporting data last successful buffering + info: Number of seconds since the last successful buffering of exporting data + to: dba + + template: exporting_metrics_sent + on: netdata.exporting_data_size + class: Workload + type: Netdata +component: Exporting engine + units: % + calc: abs($sent) * 100 / abs($buffered) + every: 10s + warn: $this != 100 + delay: down 5m multiplier 1.5 max 1h + summary: Netdata exporting metrics sent + info: Percentage of metrics sent to the external database server + to: dba diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf new file mode 100644 index 00000000..20a592d6 --- /dev/null +++ b/health/health.d/file_descriptors.conf @@ -0,0 +1,33 @@ + # you can disable an alarm notification by setting the 'to' line to: silent + + template: system_file_descriptors_utilization + on: system.file_nr_utilization + class: Utilization + type: System + component: Processes + hosts: * + lookup: max -1m unaligned + units: % + every: 1m + crit: $this > 90 + delay: down 15m multiplier 1.5 max 1h + summary: System open file descriptors utilization + info: System-wide utilization of open files + to: sysadmin + + template: apps_group_file_descriptors_utilization + on: app.fds_open_limit + class: Utilization + type: System +component: Process + os: linux + module: * + hosts: * + lookup: max -10s unaligned foreach * + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: App group ${label:app_group} file descriptors utilization + info: Open files percentage against the processes limits, among all PIDs in application group + to: sysadmin diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf new file mode 100644 index 00000000..78e1165d --- /dev/null +++ b/health/health.d/gearman.conf @@ -0,0 +1,14 @@ + + template: gearman_workers_queued + on: gearman.single_job + class: Latency + type: Computing +component: Gearman + lookup: average -10m unaligned match-names of Pending + units: workers + every: 10s + warn: $this > 30000 + delay: down 5m multiplier 1.5 max 1h + summary: Gearman queued jobs + info: Average number of queued jobs over the last 10 minutes + to: sysadmin diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf new file mode 100644 index 00000000..361b6b41 --- /dev/null +++ b/health/health.d/geth.conf @@ -0,0 +1,11 @@ +#chainhead_header is expected momenterarily to be ahead. If its considerably ahead (e.g more than 5 blocks), then the node is definitely out of sync. + template: geth_chainhead_diff_between_header_block + on: geth.chainhead + class: Workload + type: ethereum_node +component: geth + every: 10s + calc: $chain_head_block - $chain_head_header + units: blocks + warn: $this != 0 + delay: down 1m multiplier 1.5 max 1h diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf new file mode 100644 index 00000000..7796a1bc --- /dev/null +++ b/health/health.d/go.d.plugin.conf @@ -0,0 +1,18 @@ + +# make sure go.d.plugin data collection job is running + + template: go.d_job_last_collected_secs + on: netdata.go_plugin_execution_time + class: Errors + type: Netdata +component: go.d.plugin + module: !* * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: Go.d plugin last collection + info: Number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf new file mode 100644 index 00000000..66a488fa --- /dev/null +++ b/health/health.d/haproxy.conf @@ -0,0 +1,25 @@ + template: haproxy_backend_server_status + on: haproxy_hs.down + class: Errors + type: Web Proxy +component: HAProxy + units: failed servers + every: 10s + lookup: average -10s + crit: $this > 0 + summary: HAProxy server status + info: Average number of failed haproxy backend servers over the last 10 seconds + to: sysadmin + + template: haproxy_backend_status + on: haproxy_hb.down + class: Errors + type: Web Proxy +component: HAProxy + units: failed backend + every: 10s + lookup: average -10s + crit: $this > 0 + summary: HAProxy backend status + info: Average number of failed haproxy backends over the last 10 seconds + to: sysadmin diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf new file mode 100644 index 00000000..566e815a --- /dev/null +++ b/health/health.d/hdfs.conf @@ -0,0 +1,81 @@ + +# Common + + template: hdfs_capacity_usage + on: hdfs.capacity + class: Utilization + type: Storage +component: HDFS + calc: ($used) * 100 / ($used + $remaining) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: HDFS datanodes space utilization + info: summary datanodes space capacity utilization + to: sysadmin + + +# NameNode + + template: hdfs_missing_blocks + on: hdfs.blocks + class: Errors + type: Storage +component: HDFS + calc: $missing + units: missing blocks + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + summary: HDFS missing blocks + info: number of missing blocks + to: sysadmin + + + template: hdfs_stale_nodes + on: hdfs.data_nodes + class: Errors + type: Storage +component: HDFS + calc: $stale + units: dead nodes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + summary: HDFS stale datanodes + info: number of datanodes marked stale due to delayed heartbeat + to: sysadmin + + + template: hdfs_dead_nodes + on: hdfs.data_nodes + class: Errors + type: Storage +component: HDFS + calc: $dead + units: dead nodes + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + summary: HDFS dead datanodes + info: number of datanodes which are currently dead + to: sysadmin + + +# DataNode + + template: hdfs_num_failed_volumes + on: hdfs.num_failed_volumes + class: Errors + type: Storage +component: HDFS + calc: $fsds_num_failed_volumes + units: failed volumes + every: 10s + warn: $this > 0 + delay: down 15m multiplier 1.5 max 1h + summary: HDFS failed volumes + info: number of failed volumes + to: sysadmin diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf new file mode 100644 index 00000000..da5dec79 --- /dev/null +++ b/health/health.d/httpcheck.conf @@ -0,0 +1,73 @@ + +# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges + template: httpcheck_web_service_up + on: httpcheck.status + class: Utilization + type: Web Server +component: HTTP endpoint + lookup: average -1m unaligned percentage of success + calc: ($this < 75) ? (0) : ($this) + every: 5s + units: up/down + info: HTTP check endpoint ${label:url} liveness status + to: silent + + template: httpcheck_web_service_bad_content + on: httpcheck.status + class: Workload + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of bad_content + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} unexpected content + info: Percentage of HTTP responses from ${label:url} with unexpected content in the last 5 minutes + to: webmaster + + template: httpcheck_web_service_bad_status + on: httpcheck.status + class: Workload + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of bad_status + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} unexpected status + info: Percentage of HTTP responses from ${label:url} with unexpected status in the last 5 minutes + to: webmaster + + template: httpcheck_web_service_timeouts + on: httpcheck.status + class: Latency + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of timeout + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} timeouts + info: Percentage of timed-out HTTP requests to ${label:url} in the last 5 minutes + to: webmaster + + template: httpcheck_web_service_no_connection + on: httpcheck.status + class: Errors + type: Other +component: HTTP endpoint + lookup: average -5m unaligned percentage of no_connection + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} failed requests + info: Percentage of failed HTTP requests to ${label:url} in the last 5 minutes + to: webmaster diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf new file mode 100644 index 00000000..6d832bf0 --- /dev/null +++ b/health/health.d/ioping.conf @@ -0,0 +1,14 @@ + template: ioping_disk_latency + on: ioping.latency + class: Latency + type: System +component: Disk + lookup: average -10s unaligned of latency + units: microseconds + every: 10s + green: 10000 + warn: $this > $green + delay: down 30m multiplier 1.5 max 2h + summary: IO ping latency + info: Average I/O latency over the last 10 seconds + to: silent diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf new file mode 100644 index 00000000..f77f5606 --- /dev/null +++ b/health/health.d/ipc.conf @@ -0,0 +1,34 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: semaphores_used + on: system.ipc_semaphores + class: Utilization + type: System +component: IPC + os: linux + hosts: * + calc: $semaphores * 100 / $ipc_semaphores_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + delay: down 5m multiplier 1.5 max 1h + summary: IPC semaphores used + info: IPC semaphore utilization + to: sysadmin + + alarm: semaphore_arrays_used + on: system.ipc_semaphore_arrays + class: Utilization + type: System +component: IPC + os: linux + hosts: * + calc: $arrays * 100 / $ipc_semaphores_arrays_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + delay: down 5m multiplier 1.5 max 1h + summary: IPC semaphore arrays used + info: IPC semaphore arrays utilization + to: sysadmin diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf new file mode 100644 index 00000000..4dfee3c7 --- /dev/null +++ b/health/health.d/ipfs.conf @@ -0,0 +1,15 @@ + + template: ipfs_datastore_usage + on: ipfs.repo_size + class: Utilization + type: Data Sharing +component: IPFS + calc: $size * 100 / $avail + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: IPFS datastore utilization + info: IPFS datastore utilization + to: sysadmin diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf new file mode 100644 index 00000000..cec2320a --- /dev/null +++ b/health/health.d/ipmi.conf @@ -0,0 +1,28 @@ + template: ipmi_sensor_state + on: ipmi.sensor_state + class: Errors + type: System +component: IPMI + calc: $warning + $critical + units: state + every: 10s + warn: $warning > 0 + crit: $critical > 0 + delay: up 5m down 15m multiplier 1.5 max 1h + summary: IPMI sensor ${label:sensor} state + info: IPMI sensor ${label:sensor} (${label:component}) state + to: sysadmin + + alarm: ipmi_events + on: ipmi.events + class: Utilization + type: System +component: IPMI + calc: $events + units: events + every: 30s + warn: $this > 0 + delay: up 5m down 15m multiplier 1.5 max 1h + summary: IPMI entries in System Event Log + info: number of events in the IPMI System Event Log (SEL) + to: silent diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf new file mode 100644 index 00000000..d1f93969 --- /dev/null +++ b/health/health.d/isc_dhcpd.conf @@ -0,0 +1,10 @@ +# template: isc_dhcpd_leases_size +# on: isc_dhcpd.leases_total +# units: KB +# every: 60 +# calc: $leases_size +# warn: $this > 3072 +# crit: $this > 6144 +# delay: up 2m down 5m +# info: dhcpd.leases file too big! Module can slow down your server. +# to: sysadmin diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf new file mode 100644 index 00000000..8adf5f7d --- /dev/null +++ b/health/health.d/kubelet.conf @@ -0,0 +1,151 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- + +# True (1) if the node is experiencing a configuration-related error, false (0) otherwise. + + template: kubelet_node_config_error + on: k8s_kubelet.kubelet_node_config_error + class: Errors + type: Kubernetes +component: Kubelet + calc: $experiencing_error + units: bool + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet node config error + info: The node is experiencing a configuration-related error (0: false, 1: true) + to: sysadmin + +# Failed Token() requests to the alternate token source + + template: kubelet_token_requests + on: k8s_kubelet.kubelet_token_requests + class: Errors + type: Kubernetes +component: Kubelet + lookup: sum -10s of failed + units: requests + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet failed token requests + info: Number of failed Token() requests to the alternate token source + to: sysadmin + +# Docker and runtime operation errors + + template: kubelet_operations_error + on: k8s_kubelet.kubelet_operations_errors + class: Errors + type: Kubernetes +component: Kubelet + lookup: sum -1m + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (20)) + delay: up 30s down 1m multiplier 1.5 max 2h + summary: Kubelet runtime errors + info: Number of Docker or runtime operation errors + to: sysadmin + +# ----------------------------------------------------------------------------- + +# Pod Lifecycle Event Generator Relisting Latency + +# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99) +# 2. do the same for the last 10s +# 3. raise an alarm if the later is: +# - 2x the first for quantile 0.5 +# - 4x the first for quantile 0.9 +# - 8x the first for quantile 0.99 +# +# we assume the minimum latency is 1000 microseconds + +# quantile 0.5 + + template: kubelet_1m_pleg_relist_latency_quantile_05 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -1m unaligned of 0.5 + units: microseconds + every: 10s + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5) + + template: kubelet_10s_pleg_relist_latency_quantile_05 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -10s unaligned of 0.5 + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(100):(200)) + crit: $this > (($status >= $WARNING)?(200):(400)) + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet relisting latency (quantile 0.5) + info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.5) + to: sysadmin + +# quantile 0.9 + + template: kubelet_1m_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -1m unaligned of 0.9 + units: microseconds + every: 10s + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9) + + template: kubelet_10s_pleg_relist_latency_quantile_09 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -10s unaligned of 0.9 + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(400)) + crit: $this > (($status >= $WARNING)?(400):(800)) + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet relisting latency (quantile 0.9) + info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.9) + to: sysadmin + +# quantile 0.99 + + template: kubelet_1m_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -1m unaligned of 0.99 + units: microseconds + every: 10s + info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99) + + template: kubelet_10s_pleg_relist_latency_quantile_099 + on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds + class: Latency + type: Kubernetes +component: Kubelet + lookup: average -10s unaligned of 0.99 + calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(400):(800)) + crit: $this > (($status >= $WARNING)?(800):(1200)) + delay: down 1m multiplier 1.5 max 2h + summary: Kubelet relisting latency (quantile 0.99) + info: Ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \ + compared to the last minute (quantile 0.99) + to: sysadmin diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf new file mode 100644 index 00000000..b0d35e75 --- /dev/null +++ b/health/health.d/linux_power_supply.conf @@ -0,0 +1,15 @@ +# Alert on low battery capacity. + + template: linux_power_supply_capacity + on: powersupply.capacity + class: Utilization + type: Power Supply +component: Battery + calc: $capacity + units: % + every: 10s + warn: $this < 10 + delay: up 30s down 5m multiplier 1.2 max 1h + summary: Power supply capacity + info: Percentage of remaining power supply capacity + to: silent diff --git a/health/health.d/load.conf b/health/health.d/load.conf new file mode 100644 index 00000000..fd8bf939 --- /dev/null +++ b/health/health.d/load.conf @@ -0,0 +1,72 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# Calculate the base trigger point for the load average alarms. +# This is the maximum number of CPU's in the system over the past 1 +# minute, with a special case for a single CPU of setting the trigger at 2. + alarm: load_cpu_number + on: system.load + class: Utilization + type: System +component: Load + os: linux + hosts: * + calc: ($active_processors == nan or $active_processors == 0) ? (nan) : ( ($active_processors < 2) ? ( 2 ) : ( $active_processors ) ) + units: cpus + every: 1m + info: Number of active CPU cores in the system + +# Send alarms if the load average is unusually high. +# These intentionally _do not_ calculate the average over the sampled +# time period because the values being checked already are averages. + + alarm: load_average_15 + on: system.load + class: Utilization + type: System +component: Load + os: linux + hosts: * + lookup: max -1m unaligned of load15 + calc: ($load_cpu_number == nan) ? (nan) : ($this) + units: load + every: 1m + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) + delay: down 15m multiplier 1.5 max 1h + summary: Host load average (15 minutes) + info: System load average for the past 15 minutes + to: silent + + alarm: load_average_5 + on: system.load + class: Utilization + type: System +component: Load + os: linux + hosts: * + lookup: max -1m unaligned of load5 + calc: ($load_cpu_number == nan) ? (nan) : ($this) + units: load + every: 1m + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400) + delay: down 15m multiplier 1.5 max 1h + summary: System load average (5 minutes) + info: System load average for the past 5 minutes + to: silent + + alarm: load_average_1 + on: system.load + class: Utilization + type: System +component: Load + os: linux + hosts: * + lookup: max -1m unaligned of load1 + calc: ($load_cpu_number == nan) ? (nan) : ($this) + units: load + every: 1m + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800) + delay: down 15m multiplier 1.5 max 1h + summary: System load average (1 minute) + info: System load average for the past 1 minute + to: silent diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf new file mode 100644 index 00000000..90f97d85 --- /dev/null +++ b/health/health.d/mdstat.conf @@ -0,0 +1,43 @@ + + template: mdstat_disks + on: md.disks + class: Errors + type: System +component: RAID + units: failed devices + every: 10s + calc: $down + warn: $this > 0 + summary: MD array device ${label:device} down + info: Number of devices in the down state for the ${label:device} ${label:raid_level} array. \ + Any number > 0 indicates that the array is degraded. + to: sysadmin + + template: mdstat_mismatch_cnt + on: md.mismatch_cnt + class: Errors + type: System +component: RAID +chart labels: raid_level=!raid1 !raid10 * + units: unsynchronized blocks + calc: $count + every: 60s + warn: $this > 1024 + delay: up 30m + summary: MD array device ${label:device} unsynchronized blocks + info: Number of unsynchronized blocks for the ${label:device} ${label:raid_level} array + to: silent + + template: mdstat_nonredundant_last_collected + on: md.nonredundant + class: Latency + type: System +component: RAID + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + summary: MD array last collected + info: Number of seconds since the last successful data collection + to: sysadmin diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf new file mode 100644 index 00000000..118997a5 --- /dev/null +++ b/health/health.d/megacli.conf @@ -0,0 +1,76 @@ + +## Adapters (controllers) + + template: megacli_adapter_state + on: megacli.adapter_degraded + class: Errors + type: System +component: RAID + lookup: max -10s foreach * + units: boolean + every: 10s + crit: $this > 0 + delay: down 5m multiplier 2 max 10m + summary: MegaCLI adapter state + info: Adapter is in the degraded state (0: false, 1: true) + to: sysadmin + +## Physical Disks + + template: megacli_pd_predictive_failures + on: megacli.pd_predictive_failure + class: Errors + type: System +component: RAID + lookup: sum -10s foreach * + units: predictive failures + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: MegaCLI physical drive predictive failures + info: Number of physical drive predictive failures + to: sysadmin + + template: megacli_pd_media_errors + on: megacli.pd_media_error + class: Errors + type: System +component: RAID + lookup: sum -10s foreach * + units: media errors + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: MegaCLI physical drive errors + info: Number of physical drive media errors + to: sysadmin + +## Battery Backup Units (BBU) + + template: megacli_bbu_relative_charge + on: megacli.bbu_relative_charge + class: Workload + type: System +component: RAID + lookup: average -10s + units: percent + every: 10s + warn: $this <= (($status >= $WARNING) ? (85) : (80)) + crit: $this <= (($status == $CRITICAL) ? (50) : (40)) + summary: MegaCLI BBU charge state + info: Average battery backup unit (BBU) relative state of charge over the last 10 seconds + to: sysadmin + + template: megacli_bbu_cycle_count + on: megacli.bbu_cycle_count + class: Workload + type: System +component: RAID + lookup: average -10s + units: cycles + every: 10s + warn: $this >= 100 + crit: $this >= 500 + summary: MegaCLI BBU cycles count + info: Average battery backup unit (BBU) charge cycles count over the last 10 seconds + to: sysadmin diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf new file mode 100644 index 00000000..77ca0afa --- /dev/null +++ b/health/health.d/memcached.conf @@ -0,0 +1,50 @@ + +# detect if memcached cache is full + + template: memcached_cache_memory_usage + on: memcached.cache + class: Utilization + type: KV Storage +component: Memcached + calc: $used * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: up 0 down 15m multiplier 1.5 max 1h + summary: Memcached memory utilization + info: Cache memory utilization + to: dba + + +# find the rate memcached cache is filling + + template: memcached_cache_fill_rate + on: memcached.cache + class: Utilization + type: KV Storage +component: Memcached + lookup: min -10m at -50m unaligned of available + calc: ($this - $available) / (($now - $after) / 3600) + units: KB/hour + every: 1m + info: Average rate the cache fills up (positive), or frees up (negative) space over the last hour + + +# find the hours remaining until memcached cache is full + + template: memcached_out_of_cache_space_time + on: memcached.cache + class: Utilization + type: KV Storage +component: Memcached + calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.5 max 1h + summary: Memcached estimation of lack of cache space + info: Estimated time the cache will run out of space \ + if the system continues to add data at the same rate as the past hour + to: dba diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf new file mode 100644 index 00000000..5ab3d2d9 --- /dev/null +++ b/health/health.d/memory.conf @@ -0,0 +1,85 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 1hour_memory_hw_corrupted + on: mem.hwcorrupt + class: Errors + type: System +component: Memory + os: linux + hosts: * + calc: $HardwareCorrupted + units: MB + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + summary: System corrupted memory + info: Amount of memory corrupted due to a hardware failure + to: sysadmin + +## ECC Controller + + template: ecc_memory_mc_correctable + on: mem.edac_mc + class: Errors + type: System +component: Memory + os: linux + hosts: * + lookup: sum -10m unaligned of correctable, correctable_noinfo + units: errors + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + summary: System ECC memory ${label:controller} correctable errors + info: Memory controller ${label:controller} ECC correctable errors in the last 10 minutes + to: sysadmin + + template: ecc_memory_mc_uncorrectable + on: mem.edac_mc + class: Errors + type: System +component: Memory + os: linux + hosts: * + lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo + units: errors + every: 1m + crit: $this > 0 + delay: down 1h multiplier 1.5 max 1h + summary: System ECC memory ${label:controller} uncorrectable errors + info: Memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes + to: sysadmin + +## ECC DIMM + + template: ecc_memory_dimm_correctable + on: mem.edac_mc_dimm + class: Errors + type: System +component: Memory + os: linux + hosts: * + lookup: sum -10m unaligned of correctable + units: errors + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + summary: System ECC memory DIMM ${label:dimm} correctable errors + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes + to: sysadmin + + template: ecc_memory_dimm_uncorrectable + on: mem.edac_mc_dimm + class: Errors + type: System +component: Memory + os: linux + hosts: * + lookup: sum -10m unaligned of uncorrectable + units: errors + every: 1m + crit: $this > 0 + delay: down 1h multiplier 1.5 max 1h + summary: System ECC memory DIMM ${label:dimm} uncorrectable errors + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes + to: sysadmin diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf new file mode 100644 index 00000000..aef9b036 --- /dev/null +++ b/health/health.d/ml.conf @@ -0,0 +1,56 @@ +# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly +# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's +# native anomaly detection here: +# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal + +# some examples below are commented, you would need to uncomment and adjust as desired to enable them. + +# node level anomaly rate +# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate +# if node level anomaly rate is above 1% then warning (pick your own threshold that works best via trial and error). + template: ml_1min_node_ar + on: anomaly_detection.anomaly_rate + class: Workload + type: System +component: ML + os: * + hosts: * + lookup: average -1m of anomaly_rate + calc: $this + units: % + every: 30s + warn: $this > 1 + summary: ML node anomaly rate + info: Rolling 1min node level anomaly rate + to: silent + +# alert per dimension example +# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via tial and error). +# if anomaly rate is above 20% then critical (pick your own threshold that works best via tial and error). +# template: ml_5min_cpu_dims +# on: system.cpu +# os: linux +# hosts: * +# lookup: average -5m anomaly-bit foreach * +# calc: $this +# units: % +# every: 30s +# warn: $this > (($status >= $WARNING) ? (5) : (20)) +# crit: $this > (($status == $CRITICAL) ? (20) : (100)) +# info: rolling 5min anomaly rate for each system.cpu dimension + +# alert per chart example +# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via tial and error). +# if anomaly rate is above 20% then critical (pick your own threshold that works best via tial and error). +# template: ml_5min_cpu_chart +# on: system.cpu +# os: linux +# hosts: * +# lookup: average -5m anomaly-bit of * +# calc: $this +# units: % +# every: 30s +# warn: $this > (($status >= $WARNING) ? (5) : (20)) +# crit: $this > (($status == $CRITICAL) ? (20) : (100)) +# info: rolling 5min anomaly rate for system.cpu chart + diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf new file mode 100644 index 00000000..572560b4 --- /dev/null +++ b/health/health.d/mysql.conf @@ -0,0 +1,187 @@ + +# slow queries + + template: mysql_10s_slow_queries + on: mysql.queries + class: Latency + type: Database +component: MySQL + lookup: sum -10s of slow_queries + units: slow queries + every: 10s + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? (10) : (20)) + delay: down 5m multiplier 1.5 max 1h + summary: MySQL slow queries + info: Number of slow queries in the last 10 seconds + to: dba + + +# ----------------------------------------------------------------------------- +# lock waits + + template: mysql_10s_table_locks_immediate + on: mysql.table_locks + class: Utilization + type: Database +component: MySQL + lookup: sum -10s absolute of immediate + units: immediate locks + every: 10s + summary: MySQL table immediate locks + info: Number of table immediate locks in the last 10 seconds + to: dba + + template: mysql_10s_table_locks_waited + on: mysql.table_locks + class: Latency + type: Database +component: MySQL + lookup: sum -10s absolute of waited + units: waited locks + every: 10s + summary: MySQL table waited locks + info: Number of table waited locks in the last 10 seconds + to: dba + + template: mysql_10s_waited_locks_ratio + on: mysql.table_locks + class: Latency + type: Database +component: MySQL + calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (10) : (25)) + crit: $this > (($status == $CRITICAL) ? (25) : (50)) + delay: down 30m multiplier 1.5 max 1h + summary: MySQL waited table locks ratio + info: Ratio of waited table locks over the last 10 seconds + to: dba + + +# ----------------------------------------------------------------------------- +# connections + + template: mysql_connections + on: mysql.connections_active + class: Utilization + type: Database +component: MySQL + calc: $active * 100 / $limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h + summary: MySQL connections utilization + info: Client connections utilization + to: dba + + +# ----------------------------------------------------------------------------- +# replication + + template: mysql_replication + on: mysql.slave_status + class: Errors + type: Database +component: MySQL + calc: ($sql_running <= 0 OR $io_running <= 0)?0:1 + units: ok/failed + every: 10s + crit: $this == 0 + delay: down 5m multiplier 1.5 max 1h + summary: MySQL replication status + info: Replication status (0: stopped, 1: working) + to: dba + + template: mysql_replication_lag + on: mysql.slave_behind + class: Latency + type: Database +component: MySQL + calc: $seconds + units: seconds + every: 10s + warn: $this > (($status >= $WARNING) ? (5) : (10)) + crit: $this > (($status == $CRITICAL) ? (10) : (30)) + delay: down 15m multiplier 1.5 max 1h + summary: MySQL replication lag + info: Difference between the timestamp of the latest transaction processed by the SQL thread and \ + the timestamp of the same transaction when it was processed on the master + to: dba + + +# ----------------------------------------------------------------------------- +# galera cluster size + + template: mysql_galera_cluster_size_max_2m + on: mysql.galera_cluster_size + class: Utilization + type: Database +component: MySQL + lookup: max -2m at -1m unaligned + units: nodes + every: 10s + info: maximum galera cluster size in the last 2 minutes starting one minute ago + to: dba + + template: mysql_galera_cluster_size + on: mysql.galera_cluster_size + class: Utilization + type: Database +component: MySQL + calc: $nodes + units: nodes + every: 10s + warn: $this > $mysql_galera_cluster_size_max_2m + crit: $this < $mysql_galera_cluster_size_max_2m + delay: up 20s down 5m multiplier 1.5 max 1h + summary: MySQL galera cluster size + info: Current galera cluster size, compared to the maximum size in the last 2 minutes + to: dba + +# galera node state + + template: mysql_galera_cluster_state_warn + on: mysql.galera_cluster_state + class: Errors + type: Database +component: MySQL + calc: $donor + $joined + every: 10s + warn: $this != nan AND $this != 0 + delay: up 30s down 5m multiplier 1.5 max 1h + summary: MySQL galera node state + info: Galera node state is either Donor/Desynced or Joined. + to: dba + + template: mysql_galera_cluster_state_crit + on: mysql.galera_cluster_state + class: Errors + type: Database +component: MySQL + calc: $undefined + $joining + $error + every: 10s + crit: $this != nan AND $this != 0 + delay: up 30s down 5m multiplier 1.5 max 1h + summary: MySQL galera node state + info: Galera node state is either Undefined or Joining or Error. + to: dba + +# galera node status + + template: mysql_galera_cluster_status + on: mysql.galera_cluster_status + class: Errors + type: Database +component: MySQL + calc: $primary + every: 10s + crit: $this != nan AND $this != 1 + delay: up 30s down 5m multiplier 1.5 max 1h + summary: MySQL galera cluster status + info: Galera node is part of a nonoperational component. \ + This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations. + to: dba diff --git a/health/health.d/net.conf b/health/health.d/net.conf new file mode 100644 index 00000000..2dfe6bba --- /dev/null +++ b/health/health.d/net.conf @@ -0,0 +1,258 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# net traffic overflow + + template: interface_speed + on: net.net + class: Latency + type: System +component: Network + os: * + hosts: * + calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max / 1000) : ( nan ) + units: Mbit + every: 10s + info: Network interface ${label:device} current speed + + template: 1m_received_traffic_overflow + on: net.net + class: Workload + type: System +component: Network + os: linux + hosts: * + lookup: average -1m unaligned absolute of received + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + summary: System network interface ${label:device} inbound utilization + info: Average inbound utilization for the network interface ${label:device} over the last minute + to: silent + + template: 1m_sent_traffic_overflow + on: net.net + class: Workload + type: System +component: Network + os: linux + hosts: * + lookup: average -1m unaligned absolute of sent + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + summary: System network interface ${label:device} outbound utilization + info: Average outbound utilization for the network interface ${label:device} over the last minute + to: silent + +# ----------------------------------------------------------------------------- +# dropped packets + +# check if an interface is dropping packets +# the alarm is checked every 1 minute +# and examines the last 10 minutes of data +# +# it is possible to have expected packet drops on an interface for some network configurations +# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information + + template: net_interface_inbound_packets + on: net.packets + class: Workload + type: System +component: Network + os: * + hosts: * + lookup: sum -10m unaligned absolute of received + units: packets + every: 1m + summary: Network interface ${label:device} received packets + info: Received packets for the network interface ${label:device} in the last 10 minutes + + template: net_interface_outbound_packets + on: net.packets + class: Workload + type: System +component: Network + os: * + hosts: * + lookup: sum -10m unaligned absolute of sent + units: packets + every: 1m + summary: Network interface ${label:device} sent packets + info: Sent packets for the network interface ${label:device} in the last 10 minutes + + template: inbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System +component: Network + os: * + hosts: * +chart labels: device=!wl* * + lookup: sum -10m unaligned absolute of inbound + calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} inbound drops + info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + + template: outbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System +component: Network + os: * + hosts: * +chart labels: device=!wl* * + lookup: sum -10m unaligned absolute of outbound + calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} outbound drops + info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + + template: wifi_inbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System +component: Network + os: linux + hosts: * +chart labels: device=wl* + lookup: sum -10m unaligned absolute of received + calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} inbound drops ratio + info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + + template: wifi_outbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System +component: Network + os: linux + hosts: * +chart labels: device=wl* + lookup: sum -10m unaligned absolute of sent + calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} outbound drops ratio + info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + +# ----------------------------------------------------------------------------- +# interface errors + + template: interface_inbound_errors + on: net.errors + class: Errors + type: System +component: Network + os: freebsd + hosts: * + lookup: sum -10m unaligned absolute of inbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} inbound errors + info: Number of inbound errors for the network interface ${label:device} in the last 10 minutes + to: silent + + template: interface_outbound_errors + on: net.errors + class: Errors + type: System +component: Network + os: freebsd + hosts: * + lookup: sum -10m unaligned absolute of outbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} outbound errors + info: Number of outbound errors for the network interface ${label:device} in the last 10 minutes + to: silent + +# ----------------------------------------------------------------------------- +# FIFO errors + +# check if an interface is having FIFO +# buffer errors +# the alarm is checked every 1 minute +# and examines the last 10 minutes of data + + template: 10min_fifo_errors + on: net.fifo + class: Errors + type: System +component: Network + os: linux + hosts: * + lookup: sum -10m unaligned absolute + units: errors + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} FIFO errors + info: Number of FIFO errors for the network interface ${label:device} in the last 10 minutes + to: silent + +# ----------------------------------------------------------------------------- +# check for packet storms + +# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +# 2. do the same for the last 10s +# 3. raise an alarm if the later is 10x or 20x the first +# we assume the minimum packet storm should at least have +# 10000 packets/s, average of the last 10 seconds + + template: 1m_received_packets_rate + on: net.packets + class: Workload + type: System +component: Network + os: linux freebsd + hosts: * + lookup: average -1m unaligned of received + units: packets + every: 10s + info: Average number of packets received by the network interface ${label:device} over the last minute + + template: 10s_received_packets_storm + on: net.packets + class: Workload + type: System +component: Network + os: linux freebsd + hosts: * + lookup: average -10s unaligned of received + calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(5000)) + crit: $this > (($status == $CRITICAL)?(5000):(6000)) + options: no-clear-notification + summary: System network interface ${label:device} inbound packet storm + info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ + compared to the rate over the last minute + to: silent diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf new file mode 100644 index 00000000..417105d4 --- /dev/null +++ b/health/health.d/netfilter.conf @@ -0,0 +1,20 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: netfilter_conntrack_full + on: netfilter.conntrack_sockets + class: Workload + type: System +component: Network + os: linux + hosts: * + lookup: max -10s unaligned of connections + calc: $this * 100 / $netfilter_conntrack_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) + delay: down 5m multiplier 1.5 max 1h + summary: System Netfilter connection tracker utilization + info: Netfilter connection tracker table size utilization + to: sysadmin diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf new file mode 100644 index 00000000..aea402e8 --- /dev/null +++ b/health/health.d/nvme.conf @@ -0,0 +1,15 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: nvme_device_critical_warnings_state + on: nvme.device_critical_warnings_state + class: Errors + type: System +component: Disk + lookup: max -30s unaligned + units: state + every: 10s + crit: $this != nan AND $this != 0 + delay: down 5m multiplier 1.5 max 2h + summary: NVMe device ${label:device} state + info: NVMe device ${label:device} has critical warnings + to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf new file mode 100644 index 00000000..c4db835c --- /dev/null +++ b/health/health.d/pihole.conf @@ -0,0 +1,33 @@ + +# Blocklist last update time. +# Default update interval is a week. + + template: pihole_blocklist_last_update + on: pihole.blocklist_last_update + class: Errors + type: Ad Filtering +component: Pi-hole + every: 10s + units: seconds + calc: $ago + warn: $this > 60 * 60 * 24 * 30 + summary: Pi-hole blocklist last update + info: gravity.list (blocklist) file last update time + to: sysadmin + +# Pi-hole's ability to block unwanted domains. +# Should be enabled. The whole point of Pi-hole! + + template: pihole_status + on: pihole.unwanted_domains_blocking_status + class: Errors + type: Ad Filtering +component: Pi-hole + every: 10s + units: status + calc: $disabled + warn: $this != nan AND $this == 1 + delay: up 2m down 5m + summary: Pi-hole domains blocking status + info: Unwanted domains blocking is disabled + to: sysadmin diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf new file mode 100644 index 00000000..0e434420 --- /dev/null +++ b/health/health.d/ping.conf @@ -0,0 +1,50 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: ping_host_reachable + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -30s unaligned of loss + calc: $this != nan AND $this < 100 + units: up/down + every: 10s + crit: $this == 0 + delay: down 30m multiplier 1.5 max 2h + summary: Host ${label:host} ping status + info: Network host ${label:host} reachability status + to: sysadmin + + template: ping_packet_loss + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -10m unaligned of loss + green: 5 + red: 10 + units: % + every: 10s + warn: $this > $green + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + summary: Host ${label:host} ping packet loss + info: Packet loss percentage to the network host ${label:host} over the last 10 minutes + to: sysadmin + + template: ping_host_latency + on: ping.host_rtt + class: Latency + type: Other +component: Network + lookup: average -10s unaligned of avg + units: ms + every: 10s + green: 500 + red: 1000 + warn: $this > $green OR $max > $red + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + summary: Host ${label:host} ping latency + info: Average latency to the network host ${label:host} over the last 10 seconds + to: sysadmin diff --git a/health/health.d/plugin.conf b/health/health.d/plugin.conf new file mode 100644 index 00000000..8615a021 --- /dev/null +++ b/health/health.d/plugin.conf @@ -0,0 +1,12 @@ + template: plugin_availability_status + on: netdata.plugin_availability_status + class: Errors + type: Netdata + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : (20 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: Plugin ${label:_collect_plugin} availability status + info: the amount of time that ${label:_collect_plugin} did not report its availability status + to: sysadmin diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf new file mode 100644 index 00000000..281731c8 --- /dev/null +++ b/health/health.d/portcheck.conf @@ -0,0 +1,44 @@ + +# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges + template: portcheck_service_reachable + on: portcheck.status + class: Workload + type: Other +component: TCP endpoint + lookup: average -1m unaligned percentage of success + calc: ($this < 75) ? (0) : ($this) + every: 5s + units: up/down + summary: Portcheck status for ${label:host}:${label:port} + info: TCP host ${label:host} port ${label:port} liveness status + to: silent + + template: portcheck_connection_timeouts + on: portcheck.status + class: Errors + type: Other +component: TCP endpoint + lookup: average -5m unaligned percentage of timeout + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: Portcheck timeouts for ${label:host}:${label:port} + info: Percentage of timed-out TCP connections to host ${label:host} port ${label:port} in the last 5 minutes + to: sysadmin + + template: portcheck_connection_fails + on: portcheck.status + class: Errors + type: Other +component: TCP endpoint + lookup: average -5m unaligned percentage of no_connection,failed + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: Portcheck fails for ${label:host}:${label:port} + info: Percentage of failed TCP connections to host ${label:host} port ${label:port} in the last 5 minutes + to: sysadmin diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf new file mode 100644 index 00000000..de4c0078 --- /dev/null +++ b/health/health.d/postgres.conf @@ -0,0 +1,228 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: postgres_total_connection_utilization + on: postgres.connections_utilization + class: Utilization + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL connection utilization + info: Average total connection utilization over the last minute + to: dba + + template: postgres_acquired_locks_utilization + on: postgres.locks_utilization + class: Utilization + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (15) : (20)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL acquired locks utilization + info: Average acquired locks utilization over the last minute + to: dba + + template: postgres_txid_exhaustion_perc + on: postgres.txid_exhaustion_perc + class: Utilization + type: Database +component: PostgreSQL + hosts: * + calc: $txid_exhaustion + units: % + every: 1m + warn: $this > 90 + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL TXID exhaustion + info: Percent towards TXID wraparound + to: dba + +# Database alarms + + template: postgres_db_cache_io_ratio + on: postgres.db_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL DB ${label:database} cache hit ratio + info: Average cache hit ratio in db ${label:database} over the last minute + to: dba + + template: postgres_db_transactions_rollback_ratio + on: postgres.db_transactions_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -5m unaligned of rollback + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (2)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL DB ${label:database} aborted transactions + info: Average aborted transactions percentage in db ${label:database} over the last five minutes + to: dba + + template: postgres_db_deadlocks_rate + on: postgres.db_deadlocks_rate + class: Errors + type: Database +component: PostgreSQL + hosts: * + lookup: sum -1m unaligned of deadlocks + units: deadlocks + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL DB ${label:database} deadlocks rate + info: Number of deadlocks detected in db ${label:database} in the last minute + to: dba + +# Table alarms + + template: postgres_table_cache_io_ratio + on: postgres.table_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} cache hit ratio + info: Average cache hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_index_cache_io_ratio + on: postgres.table_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} index cache hit ratio + info: Average index cache hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_toast_cache_io_ratio + on: postgres.table_toast_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} toast cache hit ratio + info: Average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_toast_index_cache_io_ratio + on: postgres.table_toast_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + hosts: * + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} index toast hit ratio + info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_bloat_size_perc + on: postgres.table_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + hosts: * + calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? (70) : (80)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} bloat size + info: Bloat size percentage in db ${label:database} table ${label:table} + to: dba + + template: postgres_table_last_autovacuum_time + on: postgres.table_autovacuum_since_time + class: Errors + type: Database +component: PostgreSQL + hosts: !* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + summary: PostgreSQL table ${label:table} db ${label:database} last autovacuum + info: Time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon + to: dba + + template: postgres_table_last_autoanalyze_time + on: postgres.table_autoanalyze_since_time + class: Errors + type: Database +component: PostgreSQL + hosts: !* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + summary: PostgreSQL table ${label:table} db ${label:database} last autoanalyze + info: Time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon + to: dba + +# Index alarms + + template: postgres_index_bloat_size_perc + on: postgres.index_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + hosts: * + calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? (70) : (80)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} index bloat size + info: Bloat size percentage in db ${label:database} table ${label:table} index ${label:index} + to: dba diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf new file mode 100644 index 00000000..8f2e0fda --- /dev/null +++ b/health/health.d/processes.conf @@ -0,0 +1,17 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: active_processes + on: system.active_processes + class: Workload + type: System +component: Processes + hosts: * + calc: $active * 100 / $pidmax + units: % + every: 5s + warn: $this > (($status >= $WARNING) ? (85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) + delay: down 5m multiplier 1.5 max 1h + summary: System PIDs utilization + info: System process IDs (PID) space utilization + to: sysadmin diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf new file mode 100644 index 00000000..da27ad5b --- /dev/null +++ b/health/health.d/python.d.plugin.conf @@ -0,0 +1,18 @@ + +# make sure python.d.plugin data collection job is running + + template: python.d_job_last_collected_secs + on: netdata.pythond_runtime + class: Errors + type: Netdata +component: python.d.plugin + module: !* * + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: Python.d plugin last collection + info: Number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf new file mode 100644 index 00000000..970ea636 --- /dev/null +++ b/health/health.d/qos.conf @@ -0,0 +1,18 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# check if a QoS class is dropping packets +# the alarm is checked every 10 seconds +# and examines the last minute of data + +template: 10min_qos_packet_drops + on: tc.qos_dropped + os: linux + hosts: * + lookup: sum -5m unaligned absolute + every: 30s + warn: $this > 0 + units: packets + summary: QOS packet drops + info: Dropped packets in the last 5 minutes + to: silent diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf new file mode 100644 index 00000000..51f307ca --- /dev/null +++ b/health/health.d/ram.conf @@ -0,0 +1,82 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: ram_in_use + on: system.ram + class: Utilization + type: System +component: Memory + os: linux + hosts: * + calc: $used * 100 / ($used + $cached + $free + $buffers) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: System memory utilization + info: System memory utilization + to: sysadmin + + alarm: ram_available + on: mem.available + class: Utilization + type: System +component: Memory + os: linux + hosts: * + calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? (15) : (10)) + delay: down 15m multiplier 1.5 max 1h + summary: System available memory + info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping + to: silent + + alarm: oom_kill + on: mem.oom_kill + os: linux + hosts: * + lookup: sum -30m unaligned + units: kills + every: 5m + warn: $this > 0 + delay: down 10m + summary: System OOM kills + info: Number of out of memory kills in the last 30 minutes + to: silent + +## FreeBSD + alarm: ram_in_use + on: system.ram + class: Utilization + type: System +component: Memory + os: freebsd + hosts: * + calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: System memory utilization + info: System memory utilization + to: sysadmin + + alarm: ram_available + on: mem.available + class: Utilization + type: System +component: Memory + os: freebsd + hosts: * + calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? (15) : (10)) + delay: down 15m multiplier 1.5 max 1h + summary: System available memory + info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping + to: silent diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf new file mode 100644 index 00000000..7c2945e6 --- /dev/null +++ b/health/health.d/redis.conf @@ -0,0 +1,57 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: redis_connections_rejected + on: redis.connections + class: Errors + type: KV Storage +component: Redis + lookup: sum -1m unaligned of rejected + every: 10s + units: connections + warn: $this > 0 + summary: Redis rejected connections + info: Connections rejected because of maxclients limit in the last minute + delay: down 5m multiplier 1.5 max 1h + to: dba + + template: redis_bgsave_broken + on: redis.bgsave_health + class: Errors + type: KV Storage +component: Redis + every: 10s + crit: $last_bgsave != nan AND $last_bgsave != 0 + units: ok/failed + summary: Redis background save + info: Status of the last RDB save operation (0: ok, 1: error) + delay: down 5m multiplier 1.5 max 1h + to: dba + + template: redis_bgsave_slow + on: redis.bgsave_now + class: Latency + type: KV Storage +component: Redis + every: 10s + calc: $current_bgsave_time + warn: $this > 600 + crit: $this > 1200 + units: seconds + summary: Redis slow background save + info: Duration of the on-going RDB save operation + delay: down 5m multiplier 1.5 max 1h + to: dba + + template: redis_master_link_down + on: redis.master_link_down_since_time + class: Errors + type: KV Storage +component: Redis + every: 10s + calc: $time + units: seconds + crit: $this != nan AND $this > 0 + summary: Redis master link down + info: Time elapsed since the link between master and slave is down + delay: down 5m multiplier 1.5 max 1h + to: dba diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf new file mode 100644 index 00000000..c665430f --- /dev/null +++ b/health/health.d/retroshare.conf @@ -0,0 +1,17 @@ + +# make sure the DHT is fine when active + + template: retroshare_dht_working + on: retroshare.dht + class: Utilization + type: Data Sharing +component: Retroshare + calc: $dht_size_all + units: peers + every: 1m + warn: $this < (($status >= $WARNING) ? (120) : (100)) + crit: $this < (($status == $CRITICAL) ? (10) : (1)) + delay: up 0 down 15m multiplier 1.5 max 1h + summary: Retroshare DHT peers + info: Number of DHT peers + to: sysadmin diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf new file mode 100644 index 00000000..677e3cb4 --- /dev/null +++ b/health/health.d/riakkv.conf @@ -0,0 +1,98 @@ + +# Warn if a list keys operation is running. + template: riakkv_list_keys_active + on: riak.core.fsm_active + class: Utilization + type: Database +component: Riak KV + calc: $list_fsm_active + units: state machines + every: 10s + warn: $list_fsm_active > 0 + summary: Riak KV active list keys + info: Number of currently running list keys finite state machines + to: dba + + +## Timing healthchecks +# KV GET + template: riakkv_1h_kv_get_mean_latency + on: riak.kv.latency.get + class: Latency + type: Database +component: Riak KV + calc: $node_get_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: average time between reception of client GET request and \ + subsequent response to client over the last hour + + template: riakkv_kv_get_slow + on: riak.kv.latency.get + class: Latency + type: Database +component: Riak KV + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) ) + summary: Riak KV GET latency + info: Average time between reception of client GET request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + +# KV PUT + template: riakkv_1h_kv_put_mean_latency + on: riak.kv.latency.put + class: Latency + type: Database +component: Riak KV + calc: $node_put_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + summary: Riak KV PUT mean latency + info: Average time between reception of client PUT request and \ + subsequent response to the client over the last hour + + template: riakkv_kv_put_slow + on: riak.kv.latency.put + class: Latency + type: Database +component: Riak KV + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) ) + crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) ) + summary: Riak KV PUT latency + info: Average time between reception of client PUT request and \ + subsequent response to the client over the last 3 minutes, \ + compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + + +## VM healthchecks + +# Default Erlang VM process limit: 262144 +# On systems observed, this is < 2000, but may grow depending on load. + template: riakkv_vm_high_process_count + on: riak.vm + class: Utilization + type: Database +component: Riak KV + calc: $sys_process_count + units: processes + every: 10s + warn: $this > 10000 + crit: $this > 100000 + summary: Riak KV number of processes + info: Number of processes running in the Erlang VM + to: dba diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf new file mode 100644 index 00000000..b089cb85 --- /dev/null +++ b/health/health.d/scaleio.conf @@ -0,0 +1,33 @@ + +# make sure Storage Pool capacity utilization is under limit + + template: scaleio_storage_pool_capacity_utilization + on: scaleio.storage_pool_capacity_utilization + class: Utilization + type: Storage +component: ScaleIO + calc: $used + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (90)) + delay: down 15m multiplier 1.5 max 1h + summary: ScaleIO storage pool capacity utilization + info: Storage pool capacity utilization + to: sysadmin + + +# make sure Sdc is connected to MDM + + template: scaleio_sdc_mdm_connection_state + on: scaleio.sdc_mdm_connection_state + class: Utilization + type: Storage +component: ScaleIO + calc: $connected + every: 10s + warn: $this != 1 + delay: up 30s down 5m multiplier 1.5 max 1h + summary: ScaleIO SDC-MDM connection state + info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected) + to: sysadmin diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf new file mode 100644 index 00000000..8d7ba566 --- /dev/null +++ b/health/health.d/softnet.conf @@ -0,0 +1,57 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# check for common /proc/net/softnet_stat errors + + alarm: 1min_netdev_backlog_exceeded + on: system.softnet_stat + class: Errors + type: System +component: Network + os: linux + hosts: * + lookup: average -1m unaligned absolute of dropped + units: packets + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System netdev dropped packets + info: Average number of dropped packets in the last minute \ + due to exceeded net.core.netdev_max_backlog + to: silent + + alarm: 1min_netdev_budget_ran_outs + on: system.softnet_stat + class: Errors + type: System +component: Network + os: linux + hosts: * + lookup: average -1m unaligned absolute of squeezed + units: events + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System netdev budget run outs + info: Average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \ + net.core.netdev_budget_usecs with work remaining over the last minute \ + (this can be a cause for dropped packets) + to: silent + + alarm: 10min_netisr_backlog_exceeded + on: system.softnet_stat + class: Errors + type: System +component: Network + os: freebsd + hosts: * + lookup: average -1m unaligned absolute of qdrops + units: packets + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System netisr drops + info: Average number of drops in the last minute \ + due to exceeded sysctl net.route.netisr_maxqlen \ + (this can be a cause for dropped packets) + to: silent diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf new file mode 100644 index 00000000..e3973399 --- /dev/null +++ b/health/health.d/swap.conf @@ -0,0 +1,37 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 30min_ram_swapped_out + on: mem.swapio + class: Workload + type: System +component: Memory + os: linux freebsd + hosts: * + lookup: sum -30m unaligned absolute of out + # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 + calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) + units: % of RAM + every: 1m + warn: $this > (($status >= $WARNING) ? (20) : (30)) + delay: down 15m multiplier 1.5 max 1h + summary: System memory swapped out + info: Percentage of the system RAM swapped in the last 30 minutes + to: silent + + alarm: used_swap + on: mem.swap + class: Utilization + type: System +component: Memory + os: linux freebsd + hosts: * + calc: (($used + $free) > 0) ? ($used * 100 / ($used + $free)) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: up 30s down 15m multiplier 1.5 max 1h + summary: System swap memory utilization + info: Swap memory utilization + to: sysadmin diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf new file mode 100644 index 00000000..6c947d90 --- /dev/null +++ b/health/health.d/synchronization.conf @@ -0,0 +1,13 @@ + alarm: sync_freq + on: mem.sync + lookup: sum -1m of sync + units: calls + plugin: ebpf.plugin + every: 1m + warn: $this > 6 + delay: up 1m down 10m multiplier 1.5 max 1h + summary: Sync system call frequency + info: Number of sync() system calls. \ + Every call causes all pending modifications to filesystem metadata and \ + cached file data to be written to the underlying filesystems. + to: silent diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf new file mode 100644 index 00000000..ad53a0e1 --- /dev/null +++ b/health/health.d/systemdunits.conf @@ -0,0 +1,161 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +## Service units + template: systemd_service_unit_failed_state + on: systemd.service_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd service unit in the failed state + to: sysadmin + +## Socket units + template: systemd_socket_unit_failed_state + on: systemd.socket_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd socket unit in the failed state + to: sysadmin + +## Target units + template: systemd_target_unit_failed_state + on: systemd.target_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd target unit in the failed state + to: sysadmin + +## Path units + template: systemd_path_unit_failed_state + on: systemd.path_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd path unit in the failed state + to: sysadmin + +## Device units + template: systemd_device_unit_failed_state + on: systemd.device_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd device unit in the failed state + to: sysadmin + +## Mount units + template: systemd_mount_unit_failed_state + on: systemd.mount_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd mount units in the failed state + to: sysadmin + +## Automount units + template: systemd_automount_unit_failed_state + on: systemd.automount_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd automount unit in the failed state + to: sysadmin + +## Swap units + template: systemd_swap_unit_failed_state + on: systemd.swap_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd swap units in the failed state + to: sysadmin + +## Scope units + template: systemd_scope_unit_failed_state + on: systemd.scope_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd scope units in the failed state + to: sysadmin + +## Slice units + template: systemd_slice_unit_failed_state + on: systemd.slice_unit_state + class: Errors + type: Linux +component: Systemd units + module: !* * + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd slice units in the failed state + to: sysadmin diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf new file mode 100644 index 00000000..2b2f9740 --- /dev/null +++ b/health/health.d/tcp_conn.conf @@ -0,0 +1,23 @@ + +# +# ${tcp_max_connections} may be nan or -1 if the system +# supports dynamic threshold for TCP connections. +# In this case, the alarm will always be zero. +# + + alarm: tcp_connections + on: ip.tcpsock + class: Workload + type: System +component: Network + os: linux + hosts: * + calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) + crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 )) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP connections utilization + info: IPv4 TCP connections utilization + to: sysadmin diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf new file mode 100644 index 00000000..9d1104a5 --- /dev/null +++ b/health/health.d/tcp_listen.conf @@ -0,0 +1,100 @@ +# +# There are two queues involved when incoming TCP connections are handled +# (both at the kernel): +# +# SYN queue +# The SYN queue tracks TCP handshakes until connections are fully established. +# It overflows when too many incoming TCP connection requests hang in the +# half-open state and the server is not configured to fall back to SYN cookies. +# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends +# lots of SYN packets and never completes the handshakes). +# +# Accept queue +# The accept queue holds fully established TCP connections waiting to be handled +# by the listening application. It overflows when the server application fails +# to accept new connections at the rate they are coming in. +# +# +# ----------------------------------------------------------------------------- +# tcp accept queue (at the kernel) + + alarm: 1m_tcp_accept_queue_overflows + on: ip.tcp_accept_queue + class: Workload + type: System +component: Network + os: linux + hosts: * + lookup: average -60s unaligned absolute of ListenOverflows + units: overflows + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (1) : (5)) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP accept queue overflows + info: Average number of overflows in the TCP accept queue over the last minute + to: silent + +# THIS IS TOO GENERIC +# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 + alarm: 1m_tcp_accept_queue_drops + on: ip.tcp_accept_queue + class: Workload + type: System +component: Network + os: linux + hosts: * + lookup: average -60s unaligned absolute of ListenDrops + units: drops + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (1) : (5)) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP accept queue dropped packets + info: Average number of dropped packets in the TCP accept queue over the last minute + to: silent + + +# ----------------------------------------------------------------------------- +# tcp SYN queue (at the kernel) + +# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or +# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are +# enabled or not. In both cases this probably indicates a SYN flood attack, +# so i guess a notification should be sent. + + alarm: 1m_tcp_syn_queue_drops + on: ip.tcp_syn_queue + class: Workload + type: System +component: Network + os: linux + hosts: * + lookup: average -60s unaligned absolute of TCPReqQFullDrop + units: drops + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h + summary: System TCP SYN queue drops + info: Average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \ + (SYN cookies were not enabled) + to: silent + + alarm: 1m_tcp_syn_queue_cookies + on: ip.tcp_syn_queue + class: Workload + type: System +component: Network + os: linux + hosts: * + lookup: average -60s unaligned absolute of TCPReqQFullDoCookies + units: cookies + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h + summary: System TCP SYN queue cookies + info: Average number of sent SYN cookies due to the full TCP SYN queue over the last minute + to: silent + diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf new file mode 100644 index 00000000..4e422ec1 --- /dev/null +++ b/health/health.d/tcp_mem.conf @@ -0,0 +1,24 @@ +# +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# We give a warning when TCP is under memory pressure +# and a critical when TCP is 90% of its upper memory limit +# + + alarm: tcp_memory + on: ipv4.sockstat_tcp_mem + class: Utilization + type: System +component: Network + os: linux + hosts: * + calc: ${mem} * 100 / ${tcp_mem_high} + units: % + every: 10s + warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) + crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP memory utilization + info: TCP memory utilization + to: silent diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf new file mode 100644 index 00000000..8f665d50 --- /dev/null +++ b/health/health.d/tcp_orphans.conf @@ -0,0 +1,25 @@ + +# +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# The kernel may penalize orphans by 2x or even 4x +# so we alarm warning at 25% and critical at 50% +# + + alarm: tcp_orphans + on: ipv4.sockstat_tcp_sockets + class: Errors + type: System +component: Network + os: linux + hosts: * + calc: ${orphan} * 100 / ${tcp_max_orphans} + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) + crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 )) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP orphan sockets utilization + info: Orphan IPv4 TCP sockets utilization + to: silent diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf new file mode 100644 index 00000000..7c39db2d --- /dev/null +++ b/health/health.d/tcp_resets.conf @@ -0,0 +1,71 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# tcp resets this host sends + + alarm: 1m_ip_tcp_resets_sent + on: ip.tcphandshake + class: Errors + type: System +component: Network + os: linux + hosts: * + lookup: average -1m at -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + info: average number of sent TCP RESETS over the last minute + + alarm: 10s_ip_tcp_resets_sent + on: ip.tcphandshake + class: Errors + type: System +component: Network + os: linux + hosts: * + lookup: average -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_sent < 5)?(5):($1m_ip_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) + delay: up 20s down 60m multiplier 1.2 max 2h + options: no-clear-notification + summary: System TCP outbound resets + info: Average number of sent TCP RESETS over the last 10 seconds. \ + This can indicate a port scan, \ + or that a service running on this host has crashed. \ + Netdata will not send a clear notification for this alarm. + to: silent + +# ----------------------------------------------------------------------------- +# tcp resets this host receives + + alarm: 1m_ip_tcp_resets_received + on: ip.tcphandshake + class: Errors + type: System +component: Network + os: linux freebsd + hosts: * + lookup: average -1m at -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + info: average number of received TCP RESETS over the last minute + + alarm: 10s_ip_tcp_resets_received + on: ip.tcphandshake + class: Errors + type: System +component: Network + os: linux freebsd + hosts: * + lookup: average -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_received < 5)?(5):($1m_ip_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) + delay: up 20s down 60m multiplier 1.2 max 2h + options: no-clear-notification + summary: System TCP inbound resets + info: average number of received TCP RESETS over the last 10 seconds. \ + This can be an indication that a service this host needs has crashed. \ + Netdata will not send a clear notification for this alarm. + to: silent diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf new file mode 100644 index 00000000..65c9628b --- /dev/null +++ b/health/health.d/timex.conf @@ -0,0 +1,18 @@ + +# It can take several minutes before ntpd selects a server to synchronize with; +# try checking after 17 minutes (1024 seconds). + + alarm: system_clock_sync_state + on: system.clock_sync_state + os: linux + class: Errors + type: System +component: Clock + calc: $state + units: synchronization state + every: 10s + warn: $system.uptime.uptime > 17 * 60 AND $this == 0 + delay: down 5m + summary: System clock sync state + info: When set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server + to: silent diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf new file mode 100644 index 00000000..dc094840 --- /dev/null +++ b/health/health.d/udp_errors.conf @@ -0,0 +1,40 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# UDP receive buffer errors + + alarm: 1m_ipv4_udp_receive_buffer_errors + on: ipv4.udperrors + class: Errors + type: System +component: Network + os: linux freebsd + hosts: * + lookup: average -1m unaligned absolute of RcvbufErrors + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + summary: System UDP receive buffer errors + info: Average number of UDP receive buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h + to: silent + +# ----------------------------------------------------------------------------- +# UDP send buffer errors + + alarm: 1m_ipv4_udp_send_buffer_errors + on: ipv4.udperrors + class: Errors + type: System +component: Network + os: linux + hosts: * + lookup: average -1m unaligned absolute of SndbufErrors + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + summary: System UDP send buffer errors + info: Average number of UDP send buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h + to: silent diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf new file mode 100644 index 00000000..3c898f1d --- /dev/null +++ b/health/health.d/unbound.conf @@ -0,0 +1,30 @@ + +# make sure there is no overwritten/dropped queries in the request-list + + template: unbound_request_list_overwritten + on: unbound.request_list_jostle_list + class: Errors + type: DNS +component: Unbound + lookup: average -60s unaligned absolute match-names of overwritten + units: queries + every: 10s + warn: $this > 5 + delay: up 10 down 5m multiplier 1.5 max 1h + summary: Unbound overwritten queries + info: Number of overwritten queries in the request-list + to: sysadmin + + template: unbound_request_list_dropped + on: unbound.request_list_jostle_list + class: Errors + type: DNS +component: Unbound + lookup: average -60s unaligned absolute match-names of dropped + units: queries + every: 10s + warn: $this > 0 + delay: up 10 down 5m multiplier 1.5 max 1h + summary: Unbound dropped queries + info: Number of dropped queries in the request-list + to: sysadmin diff --git a/health/health.d/upsd.conf b/health/health.d/upsd.conf new file mode 100644 index 00000000..703a6488 --- /dev/null +++ b/health/health.d/upsd.conf @@ -0,0 +1,50 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: upsd_10min_ups_load + on: upsd.ups_load + class: Utilization + type: Power Supply +component: UPS + os: * + hosts: * + lookup: average -10m unaligned of load + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 10m multiplier 1.5 max 1h + summary: UPS ${label:ups_name} load + info: UPS ${label:ups_name} average load over the last 10 minutes + to: sitemgr + + template: upsd_ups_battery_charge + on: upsd.ups_battery_charge + class: Errors + type: Power Supply +component: UPS + os: * + hosts: * + lookup: average -60s unaligned of charge + units: % + every: 60s + warn: $this < 75 + crit: $this < 40 + delay: down 10m multiplier 1.5 max 1h + summary: UPS ${label:ups_name} battery charge + info: UPS ${label:ups_name} average battery charge over the last minute + to: sitemgr + + template: upsd_ups_last_collected_secs + on: upsd.ups_load + class: Latency + type: Power Supply +component: UPS device + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: UPS ${label:ups_name} last collected + info: UPS ${label:ups_name} number of seconds since the last successful data collection + to: sitemgr diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf new file mode 100644 index 00000000..3e20bfd1 --- /dev/null +++ b/health/health.d/vcsa.conf @@ -0,0 +1,230 @@ + +# Overall system health: +# - 0: all components are healthy. +# - 1: one or more components might become overloaded soon. +# - 2: one or more components in the appliance might be degraded. +# - 3: one or more components might be in an unusable status and the appliance might become unresponsive soon. +# - 4: no health data is available. + + template: vcsa_system_health_warn + on: vcsa.system_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $orange + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA system status + info: VCSA overall system status is orange. One or more components are degraded. + to: sysadmin + + template: vcsa_system_health_crit + on: vcsa.system_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $red + units: status + every: 10s + crit: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA system status + info: VCSA overall system status is red. One or more components are unavailable or will stop functioning soon. + to: sysadmin + +# Components health: +# - 0: healthy. +# - 1: healthy, but may have some problems. +# - 2: degraded, and may have serious problems. +# - 3: unavailable, or will stop functioning soon. +# - 4: no health data is available. + + template: vcsa_applmgmt_health_warn + on: vcsa.applmgmt_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $orange + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA ApplMgmt service status + info: VCSA ApplMgmt component status is orange. It is degraded, and may have serious problems. + to: silent + + template: vcsa_applmgmt_health_crit + on: vcsa.applmgmt_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $red + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA ApplMgmt service status + info: VCSA ApplMgmt component status is red. It is unavailable, or will stop functioning soon. + to: sysadmin + + template: vcsa_load_health_warn + on: vcsa.load_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $orange + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Load status + info: VCSA Load component status is orange. It is degraded, and may have serious problems. + to: silent + + template: vcsa_load_health_crit + on: vcsa.load_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $red + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Load status + info: VCSA Load component status is red. It is unavailable, or will stop functioning soon. + to: sysadmin + + template: vcsa_mem_health_warn + on: vcsa.mem_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $orange + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Memory status + info: VCSA Memory component status is orange. It is degraded, and may have serious problems. + to: silent + + template: vcsa_mem_health_crit + on: vcsa.mem_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $red + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Memory status + info: VCSA Memory component status is red. It is unavailable, or will stop functioning soon. + to: sysadmin + + template: vcsa_swap_health_warn + on: vcsa.swap_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $orange + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Swap status + info: VCSA Swap component status is orange. It is degraded, and may have serious problems. + to: silent + + template: vcsa_swap_health_crit + on: vcsa.swap_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $red + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Swap status + info: VCSA Swap component status is red. It is unavailable, or will stop functioning soon. + to: sysadmin + + template: vcsa_database_storage_health_warn + on: vcsa.database_storage_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $orange + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Database status + info: VCSA Database Storage component status is orange. It is degraded, and may have serious problems. + to: silent + + template: vcsa_database_storage_health_crit + on: vcsa.database_storage_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $red + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Database status + info: VCSA Database Storage component status is red. It is unavailable, or will stop functioning soon. + to: sysadmin + + template: vcsa_storage_health_warn + on: vcsa.storage_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $orange + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Storage status + info: VCSA Storage component status is orange. It is degraded, and may have serious problems. + to: silent + + template: vcsa_storage_health_crit + on: vcsa.storage_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $red + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA Storage status + info: VCSA Storage component status is red. It is unavailable, or will stop functioning soon. + to: sysadmin + +# Software updates health: +# - 0: no updates available. +# - 2: non-security updates are available. +# - 3: security updates are available. +# - 4: an error retrieving information on software updates. + + template: vcsa_software_packages_health_warn + on: vcsa.software_packages_health_status + class: Errors + type: Virtual Machine +component: VMware vCenter + calc: $orange + units: status + every: 10s + warn: $this == 1 + delay: down 1m multiplier 1.5 max 1h + summary: VCSA software status + info: VCSA software packages security updates are available. + to: silent diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf new file mode 100644 index 00000000..6ea9f99d --- /dev/null +++ b/health/health.d/vernemq.conf @@ -0,0 +1,391 @@ + +# Socket errors + + template: vernemq_socket_errors + on: vernemq.socket_errors + class: Errors + type: Messaging +component: VerneMQ + lookup: sum -1m unaligned absolute of socket_error + units: errors + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ socket errors + info: Number of socket errors in the last minute + to: sysadmin + +# Queues dropped/expired/unhandled PUBLISH messages + + template: vernemq_queue_message_drop + on: vernemq.queue_undelivered_messages + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute of queue_message_drop + units: dropped messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ dropped messages + info: Number of dropped messages due to full queues in the last minute + to: sysadmin + + template: vernemq_queue_message_expired + on: vernemq.queue_undelivered_messages + class: Latency + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute of queue_message_expired + units: expired messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ expired messages + info: number of messages which expired before delivery in the last minute + to: sysadmin + + template: vernemq_queue_message_unhandled + on: vernemq.queue_undelivered_messages + class: Latency + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute of queue_message_unhandled + units: unhandled messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unhandled messages + info: Number of unhandled messages (connections with clean session=true) in the last minute + to: sysadmin + +# Erlang VM + + template: vernemq_average_scheduler_utilization + on: vernemq.average_scheduler_utilization + class: Utilization + type: Messaging +component: VerneMQ + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: VerneMQ scheduler utilization + info: Average scheduler utilization over the last 10 minutes + to: sysadmin + +# Cluster communication and netsplits + + template: vernemq_cluster_dropped + on: vernemq.cluster_dropped + class: Errors + type: Messaging +component: VerneMQ + lookup: sum -1m unaligned + units: KiB + every: 1m + warn: $this > 0 + delay: up 5m down 5m multiplier 1.5 max 1h + summary: VerneMQ dropped traffic + info: Amount of traffic dropped during communication with the cluster nodes in the last minute + to: sysadmin + + template: vernemq_netsplits + on: vernemq.netsplits + class: Workload + type: Messaging +component: VerneMQ + lookup: sum -1m unaligned absolute of netsplit_detected + units: netsplits + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 2h + summary: VerneMQ netsplits + info: Number of detected netsplits (split brain situation) in the last minute + to: sysadmin + +# Unsuccessful CONNACK + + template: vernemq_mqtt_connack_sent_reason_unsuccessful + on: vernemq.mqtt_connack_sent_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unsuccessful CONNACK + info: Number of sent unsuccessful v3/v5 CONNACK packets in the last minute + to: sysadmin + +# Not normal DISCONNECT + + template: vernemq_mqtt_disconnect_received_reason_not_normal + on: vernemq.mqtt_disconnect_received_reason + class: Workload + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !normal_disconnect,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ received not normal DISCONNECT + info: Number of received not normal v5 DISCONNECT packets in the last minute + to: sysadmin + + template: vernemq_mqtt_disconnect_sent_reason_not_normal + on: vernemq.mqtt_disconnect_sent_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !normal_disconnect,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ sent not normal DISCONNECT + info: Number of sent not normal v5 DISCONNECT packets in the last minute + to: sysadmin + +# SUBSCRIBE errors and unauthorized attempts + + template: vernemq_mqtt_subscribe_error + on: vernemq.mqtt_subscribe_error + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute + units: failed ops + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ failed SUBSCRIBE + info: Number of failed v3/v5 SUBSCRIBE operations in the last minute + to: sysadmin + + template: vernemq_mqtt_subscribe_auth_error + on: vernemq.mqtt_subscribe_auth_error + class: Workload + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute + units: attempts + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unauthorized SUBSCRIBE + info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute + to: sysadmin + +# UNSUBSCRIBE errors + + template: vernemq_mqtt_unsubscribe_error + on: vernemq.mqtt_unsubscribe_error + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute + units: failed ops + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ failed UNSUBSCRIBE + info: Number of failed v3/v5 UNSUBSCRIBE operations in the last minute + to: sysadmin + +# PUBLISH errors and unauthorized attempts + + template: vernemq_mqtt_publish_errors + on: vernemq.mqtt_publish_errors + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute + units: failed ops + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ failed PUBLISH + info: Number of failed v3/v5 PUBLISH operations in the last minute + to: sysadmin + + template: vernemq_mqtt_publish_auth_errors + on: vernemq.mqtt_publish_auth_errors + class: Workload + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute + units: attempts + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unauthorized PUBLISH + info: Number of unauthorized v3/v5 PUBLISH attempts in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBACK + + template: vernemq_mqtt_puback_received_reason_unsuccessful + on: vernemq.mqtt_puback_received_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unsuccessful received PUBACK + info: Number of received unsuccessful v5 PUBACK packets in the last minute + to: sysadmin + + template: vernemq_mqtt_puback_sent_reason_unsuccessful + on: vernemq.mqtt_puback_sent_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unsuccessful sent PUBACK + info: Number of sent unsuccessful v5 PUBACK packets in the last minute + to: sysadmin + + template: vernemq_mqtt_puback_unexpected + on: vernemq.mqtt_puback_invalid_error + class: Workload + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute + units: messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unnexpected recieved PUBACK + info: Number of received unexpected v3/v5 PUBACK packets in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBREC + + template: vernemq_mqtt_pubrec_received_reason_unsuccessful + on: vernemq.mqtt_pubrec_received_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unsuccessful received PUBREC + info: Number of received unsuccessful v5 PUBREC packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubrec_sent_reason_unsuccessful + on: vernemq.mqtt_pubrec_sent_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unsuccessful sent PUBREC + info: Number of sent unsuccessful v5 PUBREC packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubrec_invalid_error + on: vernemq.mqtt_pubrec_invalid_error + class: Workload + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute + units: messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ invalid received PUBREC + info: Number of received invalid v3 PUBREC packets in the last minute + to: sysadmin + +# Unsuccessful PUBREL + + template: vernemq_mqtt_pubrel_received_reason_unsuccessful + on: vernemq.mqtt_pubrel_received_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unsuccessful received PUBREL + info: Number of received unsuccessful v5 PUBREL packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubrel_sent_reason_unsuccessful + on: vernemq.mqtt_pubrel_sent_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unsuccessful sent PUBREL + info: number of sent unsuccessful v5 PUBREL packets in the last minute + to: sysadmin + +# Unsuccessful and unexpected PUBCOMP + + template: vernemq_mqtt_pubcomp_received_reason_unsuccessful + on: vernemq.mqtt_pubcomp_received_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unsuccessful received PUBCOMP + info: Number of received unsuccessful v5 PUBCOMP packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful + on: vernemq.mqtt_pubcomp_sent_reason + class: Errors + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute match-names of !success,* + units: packets + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unsuccessful sent PUBCOMP + info: number of sent unsuccessful v5 PUBCOMP packets in the last minute + to: sysadmin + + template: vernemq_mqtt_pubcomp_unexpected + on: vernemq.mqtt_pubcomp_invalid_error + class: Workload + type: Messaging +component: VerneMQ + lookup: average -1m unaligned absolute + units: messages + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (5)) + delay: up 2m down 5m multiplier 1.5 max 2h + summary: VerneMQ unexpected received PUBCOMP + info: number of received unexpected v3/v5 PUBCOMP packets in the last minute + to: sysadmin diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf new file mode 100644 index 00000000..b8ad9aee --- /dev/null +++ b/health/health.d/vsphere.conf @@ -0,0 +1,70 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# -----------------------------------------------Virtual Machine-------------------------------------------------------- + + template: vsphere_vm_cpu_utilization + on: vsphere.vm_cpu_utilization + class: Utilization + type: Virtual Machine +component: CPU + hosts: * + lookup: average -10m unaligned match-names of used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere CPU utilization for VM ${label:vm} + info: CPU utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: silent + + template: vsphere_vm_mem_utilization + on: vsphere.vm_mem_utilization + class: Utilization + type: Virtual Machine +component: Memory + hosts: * + calc: $used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere memory utilization for VM ${label:vm} + info: Memory utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: silent + +# -----------------------------------------------ESXI host-------------------------------------------------------------- + + template: vsphere_host_cpu_utilization + on: vsphere.host_cpu_utilization + class: Utilization + type: Virtual Machine +component: CPU + hosts: * + lookup: average -10m unaligned match-names of used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere ESXi CPU utilization for host ${label:host} + info: CPU utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: sysadmin + + template: vsphere_host_mem_utilization + on: vsphere.host_mem_utilization + class: Utilization + type: Virtual Machine +component: Memory + hosts: * + calc: $used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere ESXi Ram utilization for host ${label:host} + info: Memory utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: sysadmin diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf new file mode 100644 index 00000000..78f1cc7f --- /dev/null +++ b/health/health.d/web_log.conf @@ -0,0 +1,205 @@ + +# unmatched lines + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $1m_total_requests > 120 +# +# i.e. when there are at least 120 requests during the last minute + + template: web_log_1m_total_requests + on: web_log.requests + class: Workload + type: Web Server +component: Web log + lookup: sum -1m unaligned + calc: ($this == 0)?(1):($this) + units: requests + every: 10s + info: number of HTTP requests in the last minute + + template: web_log_1m_unmatched + on: web_log.excluded_requests + class: Errors + type: Web Server +component: Web log + lookup: sum -1m unaligned of unmatched + calc: $this * 100 / $web_log_1m_total_requests + units: % + every: 10s + warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 ) + delay: up 1m down 5m multiplier 1.5 max 1h + summary: Web log unparsed + info: Percentage of unparsed log lines over the last minute + to: webmaster + +# ----------------------------------------------------------------------------- +# high level response code alarms + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $1m_requests > 120 +# +# i.e. when there are at least 120 requests during the last minute + + template: web_log_1m_requests + on: web_log.type_requests + class: Workload + type: Web Server +component: Web log + lookup: sum -1m unaligned + calc: ($this == 0)?(1):($this) + units: requests + every: 10s + info: number of HTTP requests in the last minute + + template: web_log_1m_successful + on: web_log.type_requests + class: Workload + type: Web Server +component: Web log + lookup: sum -1m unaligned of success + calc: $this * 100 / $web_log_1m_requests + units: % + every: 10s + warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 ) + crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + summary: Web log successful + info: Ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401) + to: webmaster + + template: web_log_1m_redirects + on: web_log.type_requests + class: Workload + type: Web Server +component: Web log + lookup: sum -1m unaligned of redirect + calc: $this * 100 / $web_log_1m_requests + units: % + every: 10s + warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? ( 1 ) : ( 20 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + summary: Web log redirects + info: Ratio of redirection HTTP requests over the last minute (3xx except 304) + to: webmaster + + template: web_log_1m_bad_requests + on: web_log.type_requests + class: Errors + type: Web Server +component: Web log + lookup: sum -1m unaligned of bad + calc: $this * 100 / $web_log_1m_requests + units: % + every: 10s + warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 10 ) : ( 30 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + summary: Web log bad requests + info: Ratio of client error HTTP requests over the last minute (4xx except 401) + to: webmaster + + template: web_log_1m_internal_errors + on: web_log.type_requests + class: Errors + type: Web Server +component: Web log + lookup: sum -1m unaligned of error + calc: $this * 100 / $web_log_1m_requests + units: % + every: 10s + warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING) ? ( 1 ) : ( 2 )) ) : ( 0 ) + crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 ) + delay: up 2m down 15m multiplier 1.5 max 1h + summary: Web log server errors + info: Ratio of server error HTTP requests over the last minute (5xx) + to: webmaster + +# ----------------------------------------------------------------------------- +# web slow + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $1m_requests > 120 +# +# i.e. when there are at least 120 requests during the last minute + + template: web_log_10m_response_time + on: web_log.request_processing_time + class: Latency + type: System +component: Web log + lookup: average -10m unaligned of avg + units: ms + every: 30s + info: average HTTP response time over the last 10 minutes + + template: web_log_web_slow + on: web_log.request_processing_time + class: Latency + type: Web Server +component: Web log + lookup: average -1m unaligned of avg + units: ms + every: 10s + green: 500 + red: 1000 + warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 ) + crit: ($web_log_1m_requests > 120) ? ($this > $red && $this > ($web_log_10m_response_time * 4) ) : ( 0 ) + delay: down 15m multiplier 1.5 max 1h + summary: Web log processing time + info: Average HTTP response time over the last 1 minute + options: no-clear-notification + to: webmaster + +# ----------------------------------------------------------------------------- +# web too many or too few requests + +# the following alarms trigger only when there are enough data. +# we assume there are enough data when: +# +# $5m_successful_old > 120 +# +# i.e. when there were at least 120 requests during the 5 minutes starting +# at -10m and ending at -5m + + template: web_log_5m_successful_old + on: web_log.type_requests + class: Workload + type: Web Server +component: Web log + lookup: average -5m at -5m unaligned of success + units: requests/s + every: 30s + info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago + + template: web_log_5m_successful + on: web_log.type_requests + class: Workload + type: Web Server +component: Web log + lookup: average -5m unaligned of success + units: requests/s + every: 30s + info: average number of successful HTTP requests over the last 5 minutes + + template: web_log_5m_requests_ratio + on: web_log.type_requests + class: Workload + type: Web Server +component: Web log + calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100) + units: % + every: 30s + warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0) + crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0) + delay: down 15m multiplier 1.5 max 1h + options: no-clear-notification + summary: Web log 5 minutes requests ratio + info: Ratio of successful HTTP requests over over the last 5 minutes, \ + compared with the previous 5 minutes \ + (clear notification for this alarm will not be sent) + to: webmaster diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf new file mode 100644 index 00000000..0a328b59 --- /dev/null +++ b/health/health.d/whoisquery.conf @@ -0,0 +1,14 @@ + + template: whoisquery_days_until_expiration + on: whoisquery.time_until_expiration + class: Utilization + type: Other +component: WHOIS + calc: $expiry + units: seconds + every: 60s + warn: $this < $days_until_expiration_warning*24*60*60 + crit: $this < $days_until_expiration_critical*24*60*60 + summary: Whois expiration time for domain ${label:domain} + info: Time until the domain name registration for ${label:domain} expires + to: webmaster diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf new file mode 100644 index 00000000..706fcbf2 --- /dev/null +++ b/health/health.d/windows.conf @@ -0,0 +1,126 @@ + +## CPU + + template: windows_10min_cpu_usage + on: windows.cpu_utilization_total + class: Utilization + type: Windows +component: CPU + os: * + hosts: * + lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: CPU utilization + info: Average CPU utilization over the last 10 minutes + to: silent + + +## Memory + + template: windows_ram_in_use + on: windows.memory_utilization + class: Utilization + type: Windows +component: Memory + os: * + hosts: * + calc: ($used) * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Ram utilization + info: Memory utilization + to: sysadmin + + +## Network + + template: windows_inbound_packets_discarded + on: windows.net_nic_discarded + class: Errors + type: Windows +component: Network + os: * + hosts: * + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Inbound network packets discarded + info: Number of inbound discarded packets for the network interface in the last 10 minutes + to: silent + + template: windows_outbound_packets_discarded + on: windows.net_nic_discarded + class: Errors + type: Windows +component: Network + os: * + hosts: * + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Outbound network packets discarded + info: Number of outbound discarded packets for the network interface in the last 10 minutes + to: silent + + template: windows_inbound_packets_errors + on: windows.net_nic_errors + class: Errors + type: Windows +component: Network + os: * + hosts: * + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Inbound network errors + info: Number of inbound errors for the network interface in the last 10 minutes + to: silent + + template: windows_outbound_packets_errors + on: windows.net_nic_errors + class: Errors + type: Windows +component: Network + os: * + hosts: * + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Outbound network errors + info: Number of outbound errors for the network interface in the last 10 minutes + to: silent + + +## Disk + + template: windows_disk_in_use + on: windows.logical_disk_space_usage + class: Utilization + type: Windows +component: Disk + os: * + hosts: * + calc: ($used) * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Disk space usage + info: Disk space utilization + to: sysadmin diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf new file mode 100644 index 00000000..d05f3ef0 --- /dev/null +++ b/health/health.d/x509check.conf @@ -0,0 +1,26 @@ + + template: x509check_days_until_expiration + on: x509check.time_until_expiration + class: Latency + type: Certificates +component: x509 certificates + calc: $expiry + units: seconds + every: 60s + warn: $this < $days_until_expiration_warning*24*60*60 + crit: $this < $days_until_expiration_critical*24*60*60 + summary: x509 certificate expiration for ${label:source} + info: Time until x509 certificate expires for ${label:source} + to: webmaster + + template: x509check_revocation_status + on: x509check.revocation_status + class: Errors + type: Certificates +component: x509 certificates + calc: $revoked + every: 60s + crit: $this != nan AND $this != 0 + summary: x509 certificate revocation status for ${label:source} + info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source} + to: webmaster diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf new file mode 100644 index 00000000..d2a56100 --- /dev/null +++ b/health/health.d/zfs.conf @@ -0,0 +1,44 @@ + + alarm: zfs_memory_throttle + on: zfs.memory_ops + class: Utilization + type: System +component: File system + lookup: sum -10m unaligned absolute of throttled + units: events + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 2h + summary: ZFS ARC growth throttling + info: number of times ZFS had to limit the ARC growth in the last 10 minutes + to: silent + +# ZFS pool state + + template: zfs_pool_state_warn + on: zfspool.state + class: Errors + type: System +component: File system + calc: $degraded + units: boolean + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is degraded + to: sysadmin + + template: zfs_pool_state_crit + on: zfspool.state + class: Errors + type: System +component: File system + calc: $faulted + $unavail + units: boolean + every: 10s + crit: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: Critical ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is faulted or unavail + to: sysadmin |