diff options
Diffstat (limited to '')
-rw-r--r-- | src/health/health.d/adaptec_raid.conf | 29 | ||||
-rw-r--r-- | src/health/health.d/anomalies.conf | 25 | ||||
-rw-r--r-- | src/health/health.d/apcupsd.conf (renamed from health/health.d/apcupsd.conf) | 4 | ||||
-rw-r--r-- | src/health/health.d/bcache.conf (renamed from health/health.d/bcache.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/beanstalkd.conf (renamed from health/health.d/beanstalkd.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/boinc.conf (renamed from health/health.d/boinc.conf) | 10 | ||||
-rw-r--r-- | src/health/health.d/btrfs.conf (renamed from health/health.d/btrfs.conf) | 19 | ||||
-rw-r--r-- | src/health/health.d/ceph.conf (renamed from health/health.d/ceph.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/cgroups.conf | 67 | ||||
-rw-r--r-- | src/health/health.d/clickhouse.conf | 140 | ||||
-rw-r--r-- | src/health/health.d/cockroachdb.conf (renamed from health/health.d/cockroachdb.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/consul.conf (renamed from health/health.d/consul.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/cpu.conf | 65 | ||||
-rw-r--r-- | src/health/health.d/dbengine.conf (renamed from health/health.d/dbengine.conf) | 9 | ||||
-rw-r--r-- | src/health/health.d/disks.conf | 161 | ||||
-rw-r--r-- | src/health/health.d/dns_query.conf (renamed from health/health.d/dns_query.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/dnsmasq_dhcp.conf (renamed from health/health.d/dnsmasq_dhcp.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/docker.conf (renamed from health/health.d/docker.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/elasticsearch.conf (renamed from health/health.d/elasticsearch.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/entropy.conf | 19 | ||||
-rw-r--r-- | src/health/health.d/exporting.conf (renamed from health/health.d/exporting.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/file_descriptors.conf | 30 | ||||
-rw-r--r-- | src/health/health.d/gearman.conf (renamed from health/health.d/gearman.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/geth.conf (renamed from health/health.d/geth.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/go.d.plugin.conf | 17 | ||||
-rw-r--r-- | src/health/health.d/haproxy.conf (renamed from health/health.d/haproxy.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/hdfs.conf (renamed from health/health.d/hdfs.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/httpcheck.conf | 88 | ||||
-rw-r--r-- | src/health/health.d/ioping.conf (renamed from health/health.d/ioping.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ipc.conf | 32 | ||||
-rw-r--r-- | src/health/health.d/ipfs.conf (renamed from health/health.d/ipfs.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ipmi.conf (renamed from health/health.d/ipmi.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/isc_dhcpd.conf | 15 | ||||
-rw-r--r-- | src/health/health.d/kubelet.conf (renamed from health/health.d/kubelet.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/linux_power_supply.conf (renamed from health/health.d/linux_power_supply.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/load.conf | 67 | ||||
-rw-r--r-- | src/health/health.d/lvm.conf | 31 | ||||
-rw-r--r-- | src/health/health.d/mdstat.conf (renamed from health/health.d/mdstat.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/megacli.conf | 77 | ||||
-rw-r--r-- | src/health/health.d/memcached.conf (renamed from health/health.d/memcached.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/memory.conf | 76 | ||||
-rw-r--r-- | src/health/health.d/ml.conf (renamed from health/health.d/ml.conf) | 7 | ||||
-rw-r--r-- | src/health/health.d/mysql.conf (renamed from health/health.d/mysql.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/net.conf | 239 | ||||
-rw-r--r-- | src/health/health.d/netfilter.conf | 18 | ||||
-rw-r--r-- | src/health/health.d/nvme.conf (renamed from health/health.d/nvme.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/pihole.conf (renamed from health/health.d/pihole.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/ping.conf | 50 | ||||
-rw-r--r-- | src/health/health.d/plugin.conf (renamed from health/health.d/plugin.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/portcheck.conf (renamed from health/health.d/portcheck.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/postgres.conf | 216 | ||||
-rw-r--r-- | src/health/health.d/processes.conf (renamed from health/health.d/processes.conf) | 1 | ||||
-rw-r--r-- | src/health/health.d/python.d.plugin.conf | 17 | ||||
-rw-r--r-- | src/health/health.d/qos.conf | 16 | ||||
-rw-r--r-- | src/health/health.d/ram.conf | 76 | ||||
-rw-r--r-- | src/health/health.d/redis.conf | 58 | ||||
-rw-r--r-- | src/health/health.d/retroshare.conf (renamed from health/health.d/retroshare.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/riakkv.conf (renamed from health/health.d/riakkv.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/scaleio.conf (renamed from health/health.d/scaleio.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/softnet.conf | 53 | ||||
-rw-r--r-- | src/health/health.d/storcli.conf | 61 | ||||
-rw-r--r-- | src/health/health.d/swap.conf | 34 | ||||
-rw-r--r-- | src/health/health.d/synchronization.conf (renamed from health/health.d/synchronization.conf) | 1 | ||||
-rw-r--r-- | src/health/health.d/systemdunits.conf | 177 | ||||
-rw-r--r-- | src/health/health.d/tcp_conn.conf | 21 | ||||
-rw-r--r-- | src/health/health.d/tcp_listen.conf | 93 | ||||
-rw-r--r-- | src/health/health.d/tcp_mem.conf | 22 | ||||
-rw-r--r-- | src/health/health.d/tcp_orphans.conf | 22 | ||||
-rw-r--r-- | src/health/health.d/tcp_resets.conf | 66 | ||||
-rw-r--r-- | src/health/health.d/timex.conf | 17 | ||||
-rw-r--r-- | src/health/health.d/udp_errors.conf | 37 | ||||
-rw-r--r-- | src/health/health.d/unbound.conf (renamed from health/health.d/unbound.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/upsd.conf | 46 | ||||
-rw-r--r-- | src/health/health.d/vcsa.conf (renamed from health/health.d/vcsa.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/vernemq.conf (renamed from health/health.d/vernemq.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/vsphere.conf | 66 | ||||
-rw-r--r-- | src/health/health.d/web_log.conf (renamed from health/health.d/web_log.conf) | 0 | ||||
-rw-r--r-- | src/health/health.d/whoisquery.conf | 14 | ||||
-rw-r--r-- | src/health/health.d/windows.conf | 108 | ||||
-rw-r--r-- | src/health/health.d/x509check.conf | 26 | ||||
-rw-r--r-- | src/health/health.d/zfs.conf | 90 |
81 files changed, 2584 insertions, 49 deletions
diff --git a/src/health/health.d/adaptec_raid.conf b/src/health/health.d/adaptec_raid.conf new file mode 100644 index 000000000..b01113b69 --- /dev/null +++ b/src/health/health.d/adaptec_raid.conf @@ -0,0 +1,29 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: adaptec_raid_ld_health_status + on: adaptecraid.logical_device_status + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of ok + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 1.5 max 1h + summary: Adaptec RAID LD (number ${label:ld_number}) health status + info: Adaptec RAID logical device (number ${label:ld_number} name ${label:ld_name}) health status is critical + to: sysadmin + + template: adaptec_raid_pd_health_state + on: adaptecraid.physical_device_state + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of ok + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 1.5 max 1h + summary: Adaptec RAID PD (number ${label:pd_number}) health state + info: Adaptec RAID physical device (number ${label:pd_number} location ${label:location}) health state is critical + to: sysadmin diff --git a/src/health/health.d/anomalies.conf b/src/health/health.d/anomalies.conf new file mode 100644 index 000000000..80d63bb8d --- /dev/null +++ b/src/health/health.d/anomalies.conf @@ -0,0 +1,25 @@ +## raise a warning alarm if an anomaly probability is consistently above 50% + +## "foreach" was removed, these alarms don't work anymore + +# template: anomalies_anomaly_probabilities +# on: anomalies.probability +# class: Errors +# type: Netdata +#component: ML +# lookup: average -2m foreach * +# every: 1m +# warn: $this > 50 +# info: average anomaly probability over the last 2 minutes + +# raise a warning alarm if an anomaly flag is consistently firing + +# template: anomalies_anomaly_flags +# on: anomalies.anomaly +# class: Errors +# type: Netdata +#component: ML +# 
lookup: sum -2m foreach * +# every: 1m +# warn: $this > 10 +# info: number of anomalies in the last 2 minutes diff --git a/health/health.d/apcupsd.conf b/src/health/health.d/apcupsd.conf index 90a72af19..5fd7aa112 100644 --- a/health/health.d/apcupsd.conf +++ b/src/health/health.d/apcupsd.conf @@ -5,8 +5,6 @@ class: Utilization type: Power Supply component: UPS - os: * - hosts: * lookup: average -10m unaligned of percentage units: % every: 1m @@ -23,8 +21,6 @@ component: UPS class: Errors type: Power Supply component: UPS - os: * - hosts: * lookup: average -60s unaligned of charge units: % every: 60s diff --git a/health/health.d/bcache.conf b/src/health/health.d/bcache.conf index 446173428..446173428 100644 --- a/health/health.d/bcache.conf +++ b/src/health/health.d/bcache.conf diff --git a/health/health.d/beanstalkd.conf b/src/health/health.d/beanstalkd.conf index 0d37f28e0..0d37f28e0 100644 --- a/health/health.d/beanstalkd.conf +++ b/src/health/health.d/beanstalkd.conf diff --git a/health/health.d/boinc.conf b/src/health/health.d/boinc.conf index 092a56845..6fd987de1 100644 --- a/health/health.d/boinc.conf +++ b/src/health/health.d/boinc.conf @@ -1,4 +1,4 @@ -# Alarms for various BOINC issues. +# you can disable an alarm notification by setting the 'to' line to: silent # Warn on any compute errors encountered. 
template: boinc_compute_errors @@ -6,8 +6,6 @@ class: Errors type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of comperror units: tasks every: 1m @@ -23,8 +21,6 @@ component: BOINC class: Errors type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of upload_failed units: tasks every: 1m @@ -40,8 +36,6 @@ component: BOINC class: Utilization type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of total units: tasks every: 1m @@ -57,8 +51,6 @@ component: BOINC class: Utilization type: Computing component: BOINC - os: * - hosts: * lookup: average -10m unaligned of active calc: ($boinc_total_tasks >= 1) ? ($this) : (inf) units: tasks diff --git a/health/health.d/btrfs.conf b/src/health/health.d/btrfs.conf index 1557a5941..f43f600c0 100644 --- a/health/health.d/btrfs.conf +++ b/src/health/health.d/btrfs.conf @@ -1,11 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent template: btrfs_allocated on: btrfs.disk class: Utilization type: System component: File system - os: * - hosts: * calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) units: % every: 10s @@ -20,8 +19,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -37,8 +34,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: ($used + $reserved) * 100 / ($used + $free + $reserved) units: % every: 10s @@ -54,8 +49,6 @@ component: File system class: Utilization type: System component: File system - os: * - hosts: * calc: $used * 100 / ($used + $free) units: % every: 10s @@ -71,8 +64,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of read_errs warn: $this > 0 @@ -86,8 +77,6 @@ 
component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of write_errs crit: $this > 0 @@ -101,8 +90,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of flush_errs crit: $this > 0 @@ -116,8 +103,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of corruption_errs warn: $this > 0 @@ -131,8 +116,6 @@ component: File system class: Errors type: System component: File system - os: * - hosts: * units: errors lookup: max -10m every 1m of generation_errs warn: $this > 0 diff --git a/health/health.d/ceph.conf b/src/health/health.d/ceph.conf index 44d351338..44d351338 100644 --- a/health/health.d/ceph.conf +++ b/src/health/health.d/ceph.conf diff --git a/src/health/health.d/cgroups.conf b/src/health/health.d/cgroups.conf new file mode 100644 index 000000000..52ca02624 --- /dev/null +++ b/src/health/health.d/cgroups.conf @@ -0,0 +1,67 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: cgroup_10min_cpu_usage + on: cgroup.cpu_limit + class: Utilization + type: Cgroups + component: CPU +host labels: _os=linux + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: Cgroup ${label:cgroup_name} CPU utilization + info: Cgroup ${label:cgroup_name} average CPU utilization over the last 10 minutes + to: silent + + template: cgroup_ram_in_use + on: cgroup.mem_usage + class: Utilization + type: Cgroups + component: Memory +host labels: _os=linux + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Cgroup ${label:cgroup_name} memory utilization + info: Cgroup ${label:cgroup_name} memory utilization + to: silent + +# ---------------------------------K8s containers-------------------------------------------- + + template: k8s_cgroup_10min_cpu_usage + on: k8s.cgroup.cpu_limit + class: Utilization + type: Cgroups + component: CPU +host labels: _os=linux + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + delay: down 15m multiplier 1.5 max 1h + summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} CPU utilization + info: Container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + average CPU utilization over the last 10 minutes + to: silent + + template: k8s_cgroup_ram_in_use + on: k8s.cgroup.mem_usage + class: Utilization + type: Cgroups + component: Memory +host labels: _os=linux + calc: ($ram) * 100 / $memory_limit + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Container ${label:k8s_container_name} pod ${label:k8s_pod_name} memory utilization + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + memory utilization + to: silent diff --git a/src/health/health.d/clickhouse.conf b/src/health/health.d/clickhouse.conf new file mode 100644 index 000000000..e24f71830 --- /dev/null +++ b/src/health/health.d/clickhouse.conf @@ -0,0 +1,140 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: clickhouse_restarted + on: clickhouse.uptime + class: Error + type: Database +component: ClickHouse + calc: $uptime + units: seconds + every: 10s + warn: $this > 1 AND $this < 180 + summary: ClickHouse restart detected + info: ClickHouse has recently been restarted + to: silent + + template: clickhouse_queries_preempted + on: clickhouse.queries_preempted + class: Workload + type: Database +component: ClickHouse + lookup: max -1m unaligned + units: preempted_queries + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse preempted queries detected + info: ClickHouse has queries that are stopped and waiting due to priority setting + to: dba + + template: clickhouse_long_running_query + on: clickhouse.longest_running_query_time + class: Latency + type: Database +component: ClickHouse + lookup: max -1m unaligned + units: seconds + every: 10s + warn: $this > (($status >= $WARNING) ? 
(300) : (600)) + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse long-running query detected + info: ClickHouse has a long-running query exceeding the threshold + to: dba + + template: clickhouse_rejected_inserts + on: clickhouse.rejected_inserts + class: Workload + type: Database +component: ClickHouse + lookup: sum -1m unaligned + units: rejected_inserts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse rejected INSERT queries detected + info: ClickHouse has INSERT queries that are rejected due to high number of active data parts for partition in a MergeTree + to: dba + + template: clickhouse_delayed_inserts + on: clickhouse.delayed_inserts + class: Workload + type: Database +component: ClickHouse + lookup: sum -1m unaligned + units: delayed_inserts + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse delayed INSERT queries detected + info: ClickHouse has INSERT queries that are throttled due to high number of active data parts for partition in a MergeTree + to: silent + + template: clickhouse_replication_lag + on: clickhouse.replicas_max_absolute_delay + class: Workload + type: Database +component: ClickHouse + lookup: avg -1m unaligned + units: seconds + every: 10s + warn: $this > (($status >= $WARNING) ? 
(250) : (300)) + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse high replication lag detected + info: ClickHouse is experiencing replication lag greater than 5 minutes + to: dba + + template: clickhouse_replicated_readonly_tables + on: clickhouse.replicated_readonly_tables + class: Error + type: Database +component: ClickHouse + lookup: max -1m unaligned + units: readonly_tables + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse replicated tables in readonly state detected + info: ClickHouse has replicated tables in readonly state due to ZooKeeper session loss/startup without ZooKeeper configured + to: dba + + template: clickhouse_max_part_count_for_partition + on: clickhouse.max_part_count_for_partition + class: Workload + type: Database +component: ClickHouse + lookup: avg -1m unaligned + units: parts + every: 10s + warn: $this > (($status >= $WARNING) ? (200) : (300)) + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse high parts/partition detected + info: ClickHouse high number of parts per partition + to: dba + + template: clickhouse_distributed_connections_failures + on: clickhouse.distributed_connections_fail_exhausted_retries + class: Error + type: Database +component: ClickHouse + lookup: sum -1m unaligned + units: failures + every: 10s + warn: $this > 0 + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse distributed connections failures detected + info: ClickHouse has failed distributed connections after exhausting all retry attempts + to: dba + + template: clickhouse_distributed_files_to_insert + on: clickhouse.distributed_files_to_insert + class: Workload + type: Database +component: ClickHouse + lookup: max -1m unaligned + units: files + every: 10s + warn: $this > (($status >= $WARNING) ? 
(40) : (80)) + delay: down 5m multiplier 1.5 max 1h + summary: ClickHouse high files to insert detected + info: ClickHouse high number of pending files to process for asynchronous insertion into Distributed tables + to: silent diff --git a/health/health.d/cockroachdb.conf b/src/health/health.d/cockroachdb.conf index 60f178354..60f178354 100644 --- a/health/health.d/cockroachdb.conf +++ b/src/health/health.d/cockroachdb.conf diff --git a/health/health.d/consul.conf b/src/health/health.d/consul.conf index 8b414a26d..8b414a26d 100644 --- a/health/health.d/consul.conf +++ b/src/health/health.d/consul.conf diff --git a/src/health/health.d/cpu.conf b/src/health/health.d/cpu.conf new file mode 100644 index 000000000..29f541e56 --- /dev/null +++ b/src/health/health.d/cpu.conf @@ -0,0 +1,65 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + template: 10min_cpu_usage + on: system.cpu + class: Utilization + type: System + component: CPU +host labels: _os=linux + lookup: average -10m unaligned of user,system,softirq,irq,guest + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: System CPU utilization + info: Average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) + to: sysadmin + + template: 10min_cpu_iowait + on: system.cpu + class: Utilization + type: System + component: CPU +host labels: _os=linux + lookup: average -10m unaligned of iowait + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (20) : (40)) + delay: up 30m down 30m multiplier 1.5 max 2h + summary: System CPU iowait time + info: Average CPU iowait time over the last 10 minutes + to: silent + + template: 20min_steal_cpu + on: system.cpu + class: Latency + type: System + component: CPU +host labels: _os=linux + lookup: average -20m unaligned of steal + units: % + every: 5m + warn: $this > (($status >= $WARNING) ? 
(5) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System CPU steal time + info: Average CPU steal time over the last 20 minutes + to: silent + +## FreeBSD + template: 10min_cpu_usage + on: system.cpu + class: Utilization + type: System + component: CPU +host labels: _os=freebsd + lookup: average -10m unaligned of user,system,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: System CPU utilization + info: Average CPU utilization over the last 10 minutes (excluding nice) + to: sysadmin diff --git a/health/health.d/dbengine.conf b/src/health/health.d/dbengine.conf index 0a70d2e8f..5585a9533 100644 --- a/health/health.d/dbengine.conf +++ b/src/health/health.d/dbengine.conf @@ -1,4 +1,3 @@ - # you can disable an alarm notification by setting the 'to' line to: silent alarm: 10min_dbengine_global_fs_errors @@ -6,8 +5,6 @@ class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of fs_errors units: errors every: 10s @@ -22,8 +19,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of io_errors units: errors every: 10s @@ -38,8 +33,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of pg_cache_over_half_dirty_events units: errors every: 10s @@ -55,8 +48,6 @@ component: DB engine class: Errors type: Netdata component: DB engine - os: linux freebsd macos - hosts: * lookup: sum -10m unaligned of flushing_pressure_deletions units: pages every: 10s diff --git a/src/health/health.d/disks.conf b/src/health/health.d/disks.conf new file mode 100644 index 000000000..fe96837fb --- /dev/null +++ b/src/health/health.d/disks.conf @@ -0,0 +1,161 @@ +# you can disable an alarm notification by setting the 'to' 
line to: silent + +# ----------------------------------------------------------------------------- +# low disk space + +# checking the latest collected values +# raise an alarm if the disk is low on +# available disk space + + template: disk_space_usage + on: disk.space + class: Utilization + type: System + component: Disk + host labels: _os=linux freebsd +chart labels: mount_point=!/dev !/dev/* !/run !/run/* * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (80) : (90)) + crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 + delay: up 1m down 15m multiplier 1.5 max 1h + summary: Disk ${label:mount_point} space usage + info: Total space utilization of disk ${label:mount_point} + to: sysadmin + + template: disk_inode_usage + on: disk.inodes + class: Utilization + type: System + component: Disk + host labels: _os=linux freebsd +chart labels: mount_point=!/dev !/dev/* !/run !/run/* * + calc: $used * 100 / ($avail + $used) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: up 1m down 15m multiplier 1.5 max 1h + summary: Disk ${label:mount_point} inode usage + info: Total inode utilization of disk ${label:mount_point} + to: sysadmin + + +# ----------------------------------------------------------------------------- +# disk fill rate + +# calculate the rate the disk fills +# use as base, the available space change +# during the last hour + +# this is just a calculation - it has no alarm +# we will use it in the next template to find +# the hours remaining + + template: disk_fill_rate + on: disk.space +host labels: _os=linux freebsd + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: GB/hour + info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour + +# calculate the hours remaining +# if the disk continues to fill in this rate + + template: out_of_disk_space_time + on: disk.space +host labels: _os=linux freebsd + calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:mount_point} estimation of lack of space + info: Estimated time the disk ${label:mount_point} will run out of space, if the system continues to add data with the rate of the last hour + to: silent + + +# ----------------------------------------------------------------------------- +# disk inode fill rate + +# calculate the rate the disk inodes are allocated +# use as base, the available inodes change +# during the last hour + +# this is just a calculation - it has no alarm +# we will use it in the next template to find +# the hours remaining + + template: disk_inode_rate + on: disk.inodes +host labels: _os=linux freebsd + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: inodes/hour + info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour + +# calculate the hours remaining +# if the disk inodes are allocated +# in this rate + + template: out_of_disk_inodes_time + on: disk.inodes +host labels: _os=linux freebsd + calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? 
(24) : (2)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:mount_point} estimation of lack of inodes + info: Estimated time the disk ${label:mount_point} will run out of inodes, if the system continues to allocate inodes with the rate of the last hour + to: silent + + +# ----------------------------------------------------------------------------- +# disk congestion + +# raise an alarm if the disk is congested +# by calculating the average disk utilization +# for the last 10 minutes + + template: 10min_disk_utilization + on: disk.util + class: Utilization + type: System + component: Disk +host labels: _os=linux freebsd + lookup: average -10m unaligned + units: % + every: 1m + warn: $this > 98 * (($status >= $WARNING) ? (0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:device} utilization + info: Average percentage of time ${label:device} disk was busy over the last 10 minutes + to: silent + + +# raise an alarm if the disk backlog +# is above 5000ms (5s) per second +# for 10 minutes +# (i.e. the disk cannot catch up) + + template: 10min_disk_backlog + on: disk.backlog + class: Latency + type: System + component: Disk +host labels: _os=linux freebsd + lookup: average -10m unaligned + units: ms + every: 1m + warn: $this > 5000 * (($status >= $WARNING) ? 
(0.7) : (1)) + delay: down 15m multiplier 1.2 max 1h + summary: Disk ${label:device} backlog + info: Average backlog size of the ${label:device} disk over the last 10 minutes + to: silent diff --git a/health/health.d/dns_query.conf b/src/health/health.d/dns_query.conf index 756c6a1b6..756c6a1b6 100644 --- a/health/health.d/dns_query.conf +++ b/src/health/health.d/dns_query.conf diff --git a/health/health.d/dnsmasq_dhcp.conf b/src/health/health.d/dnsmasq_dhcp.conf index f6ef01940..f6ef01940 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/src/health/health.d/dnsmasq_dhcp.conf diff --git a/health/health.d/docker.conf b/src/health/health.d/docker.conf index 668614d4d..668614d4d 100644 --- a/health/health.d/docker.conf +++ b/src/health/health.d/docker.conf diff --git a/health/health.d/elasticsearch.conf b/src/health/health.d/elasticsearch.conf index 600840c58..600840c58 100644 --- a/health/health.d/elasticsearch.conf +++ b/src/health/health.d/elasticsearch.conf diff --git a/src/health/health.d/entropy.conf b/src/health/health.d/entropy.conf new file mode 100644 index 000000000..f7671353c --- /dev/null +++ b/src/health/health.d/entropy.conf @@ -0,0 +1,19 @@ + +# check if entropy is too low +# the alarm is checked every 5 minutes +# and examines the last 5 minutes of data + + alarm: lowest_entropy + on: system.entropy + class: Utilization + type: System + component: Cryptography +host labels: _os=linux + lookup: min -5m unaligned + units: entries + every: 5m + warn: $this < (($status >= $WARNING) ? 
(200) : (100)) + delay: down 1h multiplier 1.5 max 2h + summary: System entropy pool number of entries + info: Minimum number of entries in the random numbers pool in the last 5 minutes + to: silent diff --git a/health/health.d/exporting.conf b/src/health/health.d/exporting.conf index c0320193c..c0320193c 100644 --- a/health/health.d/exporting.conf +++ b/src/health/health.d/exporting.conf diff --git a/src/health/health.d/file_descriptors.conf b/src/health/health.d/file_descriptors.conf new file mode 100644 index 000000000..b4b4500e3 --- /dev/null +++ b/src/health/health.d/file_descriptors.conf @@ -0,0 +1,30 @@ + # you can disable an alarm notification by setting the 'to' line to: silent + + template: system_file_descriptors_utilization + on: system.file_nr_utilization + class: Utilization + type: System + component: Processes + lookup: max -1m unaligned + units: % + every: 1m + crit: $this > 90 + delay: down 15m multiplier 1.5 max 1h + summary: System open file descriptors utilization + info: System-wide utilization of open files + to: sysadmin + + template: apps_group_file_descriptors_utilization + on: app.fds_open_limit + class: Utilization + type: System + component: Process +host labels: _os=linux + lookup: max -10s unaligned + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: App group ${label:app_group} file descriptors utilization + info: Open files percentage against the processes limits, among all PIDs in application group + to: sysadmin diff --git a/health/health.d/gearman.conf b/src/health/health.d/gearman.conf index 78e1165d1..78e1165d1 100644 --- a/health/health.d/gearman.conf +++ b/src/health/health.d/gearman.conf diff --git a/health/health.d/geth.conf b/src/health/health.d/geth.conf index 361b6b41f..361b6b41f 100644 --- a/health/health.d/geth.conf +++ b/src/health/health.d/geth.conf diff --git a/src/health/health.d/go.d.plugin.conf b/src/health/health.d/go.d.plugin.conf new file mode 100644 index 000000000..eb951448b --- /dev/null +++ b/src/health/health.d/go.d.plugin.conf @@ -0,0 +1,17 @@ +# make sure go.d.plugin data collection job is running + + template: go.d_job_last_collected_secs + on: netdata.go_plugin_execution_time + class: Errors + type: Netdata + component: go.d.plugin +host labels: _hostname=!* + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: Go.d plugin last collection + info: Number of seconds since the last successful data collection + to: webmaster diff --git a/health/health.d/haproxy.conf b/src/health/health.d/haproxy.conf index 66a488fa4..66a488fa4 100644 --- a/health/health.d/haproxy.conf +++ b/src/health/health.d/haproxy.conf diff --git a/health/health.d/hdfs.conf b/src/health/health.d/hdfs.conf index 566e815aa..566e815aa 100644 --- a/health/health.d/hdfs.conf +++ b/src/health/health.d/hdfs.conf diff --git a/src/health/health.d/httpcheck.conf b/src/health/health.d/httpcheck.conf new file mode 100644 index 000000000..3d32dedbf --- /dev/null +++ b/src/health/health.d/httpcheck.conf @@ -0,0 +1,88 @@ + +# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges + template: httpcheck_web_service_up + on: httpcheck.status + class: Utilization + type: Web Server +component: HTTP endpoint + lookup: average -1m unaligned percentage of success + calc: ($this < 75) ? 
(0) : ($this) + every: 5s + units: up/down + info: HTTP check endpoint ${label:url} liveness status + to: silent + + template: httpcheck_web_service_bad_content + on: httpcheck.status + class: Workload + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of bad_content + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} unexpected content + info: Percentage of HTTP responses from ${label:url} with unexpected content in the last 5 minutes + to: webmaster + + template: httpcheck_web_service_bad_status + on: httpcheck.status + class: Workload + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of bad_status + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} unexpected status + info: Percentage of HTTP responses from ${label:url} with unexpected status in the last 5 minutes + to: webmaster + + template: httpcheck_web_service_bad_header + on: httpcheck.status + class: Errors + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of bad_header + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} unexpected header + info: Percentage of HTTP responses from ${label:url} with unexpected header in the last 5 minutes + to: webmaster + + template: httpcheck_web_service_timeouts + on: httpcheck.status + class: Latency + type: Web Server +component: HTTP endpoint + lookup: average -5m unaligned percentage of timeout + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} timeouts + info: Percentage of timed-out HTTP requests to ${label:url} in the last 5 minutes + to: 
webmaster + + template: httpcheck_web_service_no_connection + on: httpcheck.status + class: Errors + type: Other +component: HTTP endpoint + lookup: average -5m unaligned percentage of no_connection + every: 10s + units: % + warn: $this >= 10 AND $this < 40 + crit: $this >= 40 + delay: down 5m multiplier 1.5 max 1h + summary: HTTP check for ${label:url} failed requests + info: Percentage of failed HTTP requests to ${label:url} in the last 5 minutes + to: webmaster diff --git a/health/health.d/ioping.conf b/src/health/health.d/ioping.conf index 6d832bf00..6d832bf00 100644 --- a/health/health.d/ioping.conf +++ b/src/health/health.d/ioping.conf diff --git a/src/health/health.d/ipc.conf b/src/health/health.d/ipc.conf new file mode 100644 index 000000000..f46cf4285 --- /dev/null +++ b/src/health/health.d/ipc.conf @@ -0,0 +1,32 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: semaphores_used + on: system.ipc_semaphores + class: Utilization + type: System + component: IPC +host labels: _os=linux + calc: $semaphores * 100 / $ipc_semaphores_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (70) : (80)) + delay: down 5m multiplier 1.5 max 1h + summary: IPC semaphores used + info: IPC semaphore utilization + to: sysadmin + + alarm: semaphore_arrays_used + on: system.ipc_semaphore_arrays + class: Utilization + type: System + component: IPC +host labels: _os=linux + calc: $arrays * 100 / $ipc_semaphores_arrays_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(70) : (80)) + delay: down 5m multiplier 1.5 max 1h + summary: IPC semaphore arrays used + info: IPC semaphore arrays utilization + to: sysadmin diff --git a/health/health.d/ipfs.conf b/src/health/health.d/ipfs.conf index 4dfee3c7f..4dfee3c7f 100644 --- a/health/health.d/ipfs.conf +++ b/src/health/health.d/ipfs.conf diff --git a/health/health.d/ipmi.conf b/src/health/health.d/ipmi.conf index cec2320a9..cec2320a9 100644 --- a/health/health.d/ipmi.conf +++ b/src/health/health.d/ipmi.conf diff --git a/src/health/health.d/isc_dhcpd.conf b/src/health/health.d/isc_dhcpd.conf new file mode 100644 index 000000000..3f6e9d5d4 --- /dev/null +++ b/src/health/health.d/isc_dhcpd.conf @@ -0,0 +1,15 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: isc_dhcpd_dhcp_pool_utilization + on: isc_dhcpd.dhcp_pool_utilization + class: Utilization + type: DHCP +component: DHCPd + every: 10s + units: % + calc: $used + warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) + delay: down 5m + summary: ISC DHCP pool ${label:dhcp_pool_name} utilization + info: ISC DHCP pool ${label:dhcp_pool_name} utilization + to: sysadmin diff --git a/health/health.d/kubelet.conf b/src/health/health.d/kubelet.conf index 8adf5f7d4..8adf5f7d4 100644 --- a/health/health.d/kubelet.conf +++ b/src/health/health.d/kubelet.conf diff --git a/health/health.d/linux_power_supply.conf b/src/health/health.d/linux_power_supply.conf index b0d35e752..b0d35e752 100644 --- a/health/health.d/linux_power_supply.conf +++ b/src/health/health.d/linux_power_supply.conf diff --git a/src/health/health.d/load.conf b/src/health/health.d/load.conf new file mode 100644 index 000000000..e639c9ad5 --- /dev/null +++ b/src/health/health.d/load.conf @@ -0,0 +1,67 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# Calculate the base trigger point for the load average alarms. 
+# This is the maximum number of CPUs in the system over the past 1 +# minute, with a special case for a single CPU of setting the trigger at 2. + alarm: load_cpu_number + on: system.load + class: Utilization + type: System + component: Load +host labels: _os=linux + calc: ($active_processors == nan or $active_processors == 0) ? (nan) : ( ($active_processors < 2) ? ( 2 ) : ( $active_processors ) ) + units: cpus + every: 1m + info: Number of active CPU cores in the system + +# Send alarms if the load average is unusually high. +# These intentionally _do not_ calculate the average over the sampled +# time period because the values being checked already are averages. + + alarm: load_average_15 + on: system.load + class: Utilization + type: System + component: Load +host labels: _os=linux + lookup: max -1m unaligned of load15 + calc: ($load_cpu_number == nan) ? (nan) : ($this) + units: load + every: 1m + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) + delay: down 15m multiplier 1.5 max 1h + summary: System load average (15 minutes) + info: System load average for the past 15 minutes + to: silent + + alarm: load_average_5 + on: system.load + class: Utilization + type: System + component: Load +host labels: _os=linux + lookup: max -1m unaligned of load5 + calc: ($load_cpu_number == nan) ? (nan) : ($this) + units: load + every: 1m + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400) + delay: down 15m multiplier 1.5 max 1h + summary: System load average (5 minutes) + info: System load average for the past 5 minutes + to: silent + + alarm: load_average_1 + on: system.load + class: Utilization + type: System + component: Load +host labels: _os=linux + lookup: max -1m unaligned of load1 + calc: ($load_cpu_number == nan) ? (nan) : ($this) + units: load + every: 1m + warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 
700 : 800) + delay: down 15m multiplier 1.5 max 1h + summary: System load average (1 minute) + info: System load average for the past 1 minute + to: silent diff --git a/src/health/health.d/lvm.conf b/src/health/health.d/lvm.conf new file mode 100644 index 000000000..570aa14d3 --- /dev/null +++ b/src/health/health.d/lvm.conf @@ -0,0 +1,31 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: lvm_lv_data_space_utilization + on: lvm.lv_data_space_utilization + class: Utilization + type: System + component: LVM + calc: $utilization + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: ($this > (($status == $CRITICAL) ? (90) : (98))) + delay: down 5m multiplier 1.5 max 1h + summary: LVM LV ${label:lv_name} on VG ${label:vg_name} high data space usage + info: LVM logical volume high data space usage (LV ${label:lv_name} VG ${label:vg_name} Type ${label:volume_type}) + to: sysadmin + + template: lvm_lv_metadata_space_utilization + on: lvm.lv_metadata_space_utilization + class: Utilization + type: System + component: LVM + calc: $utilization + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: ($this > (($status == $CRITICAL) ? 
(90) : (98))) + delay: down 5m multiplier 1.5 max 1h + summary: LVM LV ${label:lv_name} on VG ${label:vg_name} high metadata space usage + info: LVM logical volume high metadata space usage (LV ${label:lv_name} VG ${label:vg_name} Type ${label:volume_type}) + to: sysadmin diff --git a/health/health.d/mdstat.conf b/src/health/health.d/mdstat.conf index 90f97d851..90f97d851 100644 --- a/health/health.d/mdstat.conf +++ b/src/health/health.d/mdstat.conf diff --git a/src/health/health.d/megacli.conf b/src/health/health.d/megacli.conf new file mode 100644 index 000000000..27721fa9a --- /dev/null +++ b/src/health/health.d/megacli.conf @@ -0,0 +1,77 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# Adapters (controllers) + + template: megacli_adapter_health_state + on: megacli.adapter_health_state + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of optimal + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 2 max 10m + summary: MegaCLI adapter ${label:adapter_number} health + info: MegaCLI adapter ${label:adapter_number} is in the degraded state + to: sysadmin + + template: megacli_phys_drive_media_errors + on: megacli.phys_drive_media_errors + class: Errors + type: System +component: RAID + lookup: sum -10s + units: media errors + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: MegaCLI PD adapter ${label:adapter_number} slot ${label:slot_number} media errors + info: MegaCLI physical drive adapter ${label:adapter_number} slot ${label:slot_number} media errors + to: sysadmin + +# Physical Drives + + template: megacli_phys_drive_predictive_failures + on: megacli.phys_drive_predictive_failures + class: Errors + type: System +component: RAID + lookup: sum -10s + units: failures + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: MegaCLI PD adapter ${label:adapter_number} slot ${label:slot_number} predictive 
failures + info: MegaCLI physical drive (adapter ${label:adapter_number} slot ${label:slot_number}) predictive failures + to: sysadmin + +# Backup Battery Unit + + template: megacli_bbu_charge + on: megacli.bbu_charge + class: Workload + type: System +component: RAID + lookup: average -10s + units: percent + every: 10s + warn: $this <= (($status >= $WARNING) ? (85) : (80)) + crit: $this <= (($status == $CRITICAL) ? (50) : (40)) + summary: MegaCLI BBU charge + info: MegaCLI Backup Battery Unit (adapter ${label:adapter_number}) average charge over the last 10 seconds + to: sysadmin + + template: megacli_bbu_recharge_cycles + on: megacli.bbu_recharge_cycles + class: Workload + type: System +component: RAID + lookup: average -10s + units: cycles + every: 10s + warn: $this >= 100 + crit: $this >= 500 + summary: MegaCLI BBU recharge cycles + info: MegaCLI Backup Battery Unit (adapter ${label:adapter_number}) recharge cycles + to: sysadmin diff --git a/health/health.d/memcached.conf b/src/health/health.d/memcached.conf index 77ca0afa9..77ca0afa9 100644 --- a/health/health.d/memcached.conf +++ b/src/health/health.d/memcached.conf diff --git a/src/health/health.d/memory.conf b/src/health/health.d/memory.conf new file mode 100644 index 000000000..2b2b4e4da --- /dev/null +++ b/src/health/health.d/memory.conf @@ -0,0 +1,76 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 1hour_memory_hw_corrupted + on: mem.hwcorrupt + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $HardwareCorrupted + units: MB + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + summary: System corrupted memory + info: Amount of memory corrupted due to a hardware failure + to: sysadmin + +## ECC Controller + + template: ecc_memory_mc_correctable + on: mem.edac_mc_errors + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $correctable + $correctable_noinfo + units: errors + every: 1m + 
warn: $this > 0 + summary: System ECC memory ${label:controller} correctable errors + info: Memory controller ${label:controller} ECC correctable errors + to: sysadmin + + template: ecc_memory_mc_uncorrectable + on: mem.edac_mc_errors + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $uncorrectable + $uncorrectable_noinfo + units: errors + every: 1m + crit: $this > 0 + summary: System ECC memory ${label:controller} uncorrectable errors + info: Memory controller ${label:controller} ECC uncorrectable errors + to: sysadmin + +## ECC DIMM + + template: ecc_memory_dimm_correctable + on: mem.edac_mc_dimm_errors + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $correctable + units: errors + every: 1m + warn: $this > 0 + summary: System ECC memory DIMM ${label:dimm} correctable errors + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors + to: sysadmin + + template: ecc_memory_dimm_uncorrectable + on: mem.edac_mc_dimm_errors + class: Errors + type: System + component: Memory +host labels: _os=linux + calc: $uncorrectable + units: errors + every: 1m + crit: $this > 0 + summary: System ECC memory DIMM ${label:dimm} uncorrectable errors + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors + to: sysadmin diff --git a/health/health.d/ml.conf b/src/health/health.d/ml.conf index aef9b0368..b6a5df6dd 100644 --- a/health/health.d/ml.conf +++ b/src/health/health.d/ml.conf @@ -13,8 +13,6 @@ class: Workload type: System component: ML - os: * - hosts: * lookup: average -1m of anomaly_rate calc: $this units: % @@ -29,8 +27,6 @@ component: ML # if anomaly rate is above 20% then critical (pick your own threshold that works best via tial and error). 
# template: ml_5min_cpu_dims # on: system.cpu -# os: linux -# hosts: * # lookup: average -5m anomaly-bit foreach * # calc: $this # units: % @@ -44,8 +40,6 @@ component: ML # if anomaly rate is above 20% then critical (pick your own threshold that works best via tial and error). # template: ml_5min_cpu_chart # on: system.cpu -# os: linux -# hosts: * # lookup: average -5m anomaly-bit of * # calc: $this # units: % @@ -53,4 +47,3 @@ component: ML # warn: $this > (($status >= $WARNING) ? (5) : (20)) # crit: $this > (($status == $CRITICAL) ? (20) : (100)) # info: rolling 5min anomaly rate for system.cpu chart - diff --git a/health/health.d/mysql.conf b/src/health/health.d/mysql.conf index 572560b4e..572560b4e 100644 --- a/health/health.d/mysql.conf +++ b/src/health/health.d/mysql.conf diff --git a/src/health/health.d/net.conf b/src/health/health.d/net.conf new file mode 100644 index 000000000..448a3733d --- /dev/null +++ b/src/health/health.d/net.conf @@ -0,0 +1,239 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# net traffic overflow + + template: interface_speed + on: net.net + class: Latency + type: System +component: Network + calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max / 1000) : ( nan ) + units: Mbit + every: 10s + info: Network interface ${label:device} current speed + + template: 1m_received_traffic_overflow + on: net.net + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of received + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + summary: System network interface ${label:device} inbound utilization + info: Average inbound utilization for the network interface ${label:device} over the last minute + to: silent + + template: 1m_sent_traffic_overflow + on: net.net + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of sent + calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan ) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: up 1m down 1m multiplier 1.5 max 1h + summary: System network interface ${label:device} outbound utilization + info: Average outbound utilization for the network interface ${label:device} over the last minute + to: silent + +# ----------------------------------------------------------------------------- +# dropped packets + +# check if an interface is dropping packets +# the alarm is checked every 1 minute +# and examines the last 10 minutes of data +# +# it is possible to have expected packet drops on an interface for some network configurations +# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information + + template: net_interface_inbound_packets + on: net.packets + class: Workload + type: System +component: Network + lookup: sum -10m unaligned absolute of received + units: packets + every: 1m + summary: Network interface ${label:device} received packets + info: Received packets for the network interface ${label:device} in the last 10 minutes + + template: net_interface_outbound_packets + on: net.packets + class: Workload + type: System +component: Network + lookup: sum -10m unaligned absolute of sent + units: packets + every: 1m + summary: Network interface ${label:device} sent packets + info: Sent packets for the network interface ${label:device} in the last 10 minutes + + template: inbound_packets_dropped_ratio + on: net.drops + 
class: Errors + type: System + component: Network +chart labels: device=!wl* * + lookup: sum -10m unaligned absolute of inbound + calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} inbound drops + info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + + template: outbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System + component: Network +chart labels: device=!wl* * + lookup: sum -10m unaligned absolute of outbound + calc: (($net_interface_outbound_packets > 1000) ? ($this * 100 / $net_interface_outbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 2 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} outbound drops + info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + + template: wifi_inbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System + component: Network + host labels: _os=linux +chart labels: device=wl* + lookup: sum -10m unaligned absolute of inbound + calc: (($net_interface_inbound_packets > 10000) ? ($this * 100 / $net_interface_inbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} inbound drops ratio + info: Ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + + template: wifi_outbound_packets_dropped_ratio + on: net.drops + class: Errors + type: System + component: Network + host labels: _os=linux +chart labels: device=wl* + lookup: sum -10m unaligned absolute of outbound + calc: (($net_interface_outbound_packets > 1000) ? 
($this * 100 / $net_interface_outbound_packets) : (0)) + units: % + every: 1m + warn: $this >= 10 + delay: up 1m down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} outbound drops ratio + info: Ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes + to: silent + +# ----------------------------------------------------------------------------- +# interface errors + + template: interface_inbound_errors + on: net.errors + class: Errors + type: System + component: Network +host labels: _os=freebsd + lookup: sum -10m unaligned absolute of inbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} inbound errors + info: Number of inbound errors for the network interface ${label:device} in the last 10 minutes + to: silent + + template: interface_outbound_errors + on: net.errors + class: Errors + type: System + component: Network +host labels: _os=freebsd + lookup: sum -10m unaligned absolute of outbound + units: errors + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} outbound errors + info: Number of outbound errors for the network interface ${label:device} in the last 10 minutes + to: silent + +# ----------------------------------------------------------------------------- +# FIFO errors + +# check if an interface is having FIFO +# buffer errors +# the alarm is checked every 1 minute +# and examines the last 10 minutes of data + + template: 10min_fifo_errors + on: net.fifo + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: sum -10m unaligned absolute + units: errors + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 2h + summary: System network interface ${label:device} FIFO errors + info: Number of FIFO errors for the network interface ${label:device} in the last 10 minutes + to: silent + 
+# ----------------------------------------------------------------------------- +# check for packet storms + +# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +# 2. do the same for the last 10s +# 3. raise an alarm if the latter is 10x or 20x the first +# we assume the minimum packet storm should at least have +# 10000 packets/s, average of the last 10 seconds + + template: 1m_received_packets_rate + on: net.packets + class: Workload + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -1m unaligned of received + units: packets + every: 10s + info: Average number of packets received by the network interface ${label:device} over the last minute + + template: 10s_received_packets_storm + on: net.packets + class: Workload + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -10s unaligned of received + calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(5000)) + crit: $this > (($status == $CRITICAL)?(5000):(6000)) + options: no-clear-notification + summary: System network interface ${label:device} inbound packet storm + info: Ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ + compared to the rate over the last minute + to: silent diff --git a/src/health/health.d/netfilter.conf b/src/health/health.d/netfilter.conf new file mode 100644 index 000000000..e0a05c8de --- /dev/null +++ b/src/health/health.d/netfilter.conf @@ -0,0 +1,18 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: netfilter_conntrack_full + on: netfilter.conntrack_sockets + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: max -10s unaligned of connections + calc: $this * 100 / $netfilter_conntrack_max + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(85) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (95)) + delay: down 5m multiplier 1.5 max 1h + summary: System Netfilter connection tracker utilization + info: Netfilter connection tracker table size utilization + to: sysadmin diff --git a/health/health.d/nvme.conf b/src/health/health.d/nvme.conf index aea402e88..aea402e88 100644 --- a/health/health.d/nvme.conf +++ b/src/health/health.d/nvme.conf diff --git a/health/health.d/pihole.conf b/src/health/health.d/pihole.conf index c4db835ce..c4db835ce 100644 --- a/health/health.d/pihole.conf +++ b/src/health/health.d/pihole.conf diff --git a/src/health/health.d/ping.conf b/src/health/health.d/ping.conf new file mode 100644 index 000000000..a91b231c3 --- /dev/null +++ b/src/health/health.d/ping.conf @@ -0,0 +1,50 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: ping_host_reachable + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -30s unaligned of loss + calc: ($this == nan) ? 
(nan) : ($this < 100) + units: up/down + every: 10s + crit: $this == 0 + delay: down 30m multiplier 1.5 max 2h + summary: Host ${label:host} ping status + info: Network host ${label:host} reachability status + to: sysadmin + + template: ping_packet_loss + on: ping.host_packet_loss + class: Errors + type: Other +component: Network + lookup: average -10m unaligned of loss + green: 5 + red: 10 + units: % + every: 10s + warn: $this > $green + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + summary: Host ${label:host} ping packet loss + info: Packet loss percentage to the network host ${label:host} over the last 10 minutes + to: sysadmin + + template: ping_host_latency + on: ping.host_rtt + class: Latency + type: Other +component: Network + lookup: average -10s unaligned of avg + units: ms + every: 10s + green: 500 + red: 1000 + warn: $this > $green OR $max > $red + crit: $this > $red + delay: down 30m multiplier 1.5 max 2h + summary: Host ${label:host} ping latency + info: Average latency to the network host ${label:host} over the last 10 seconds + to: sysadmin diff --git a/health/health.d/plugin.conf b/src/health/health.d/plugin.conf index 8615a0213..8615a0213 100644 --- a/health/health.d/plugin.conf +++ b/src/health/health.d/plugin.conf diff --git a/health/health.d/portcheck.conf b/src/health/health.d/portcheck.conf index 281731c86..281731c86 100644 --- a/health/health.d/portcheck.conf +++ b/src/health/health.d/portcheck.conf diff --git a/src/health/health.d/postgres.conf b/src/health/health.d/postgres.conf new file mode 100644 index 000000000..17e418758 --- /dev/null +++ b/src/health/health.d/postgres.conf @@ -0,0 +1,216 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: postgres_total_connection_utilization + on: postgres.connections_utilization + class: Utilization + type: Database +component: PostgreSQL + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? 
(70) : (80)) + crit: $this > (($status == $CRITICAL) ? (80) : (90)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL connection utilization + info: Average total connection utilization over the last minute + to: dba + + template: postgres_acquired_locks_utilization + on: postgres.locks_utilization + class: Utilization + type: Database +component: PostgreSQL + lookup: average -1m unaligned of used + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (15) : (20)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL acquired locks utilization + info: Average acquired locks utilization over the last minute + to: dba + + template: postgres_txid_exhaustion_perc + on: postgres.txid_exhaustion_perc + class: Utilization + type: Database +component: PostgreSQL + calc: $txid_exhaustion + units: % + every: 1m + warn: $this > 90 + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL TXID exhaustion + info: Percent towards TXID wraparound + to: dba + +# Database alarms + + template: postgres_db_cache_io_ratio + on: postgres.db_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL DB ${label:database} cache hit ratio + info: Average cache hit ratio in db ${label:database} over the last minute + to: dba + + template: postgres_db_transactions_rollback_ratio + on: postgres.db_transactions_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -5m unaligned of rollback + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? 
(0) : (2)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL DB ${label:database} aborted transactions + info: Average aborted transactions percentage in db ${label:database} over the last five minutes + to: dba + + template: postgres_db_deadlocks_rate + on: postgres.db_deadlocks_rate + class: Errors + type: Database +component: PostgreSQL + lookup: sum -1m unaligned of deadlocks + units: deadlocks + every: 1m + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL DB ${label:database} deadlocks rate + info: Number of deadlocks detected in db ${label:database} in the last minute + to: dba + +# Table alarms + + template: postgres_table_cache_io_ratio + on: postgres.table_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} cache hit ratio + info: Average cache hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_index_cache_io_ratio + on: postgres.table_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? 
(60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} index cache hit ratio + info: Average index cache hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_toast_cache_io_ratio + on: postgres.table_toast_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} toast cache hit ratio + info: Average TOAST hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_toast_index_cache_io_ratio + on: postgres.table_toast_index_cache_io_ratio + class: Workload + type: Database +component: PostgreSQL + lookup: average -1m unaligned of miss + calc: 100 - $this + units: % + every: 1m + warn: $this < (($status >= $WARNING) ? (70) : (60)) + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} index toast hit ratio + info: average index TOAST hit ratio in db ${label:database} table ${label:table} over the last minute + to: dba + + template: postgres_table_bloat_size_perc + on: postgres.table_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + calc: ($table_size > (1024 * 1024 * 100)) ? ($bloat) : (0) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} bloat size + info: Bloat size percentage in db ${label:database} table ${label:table} + to: dba + + template: postgres_table_last_autovacuum_time + on: postgres.table_autovacuum_since_time + class: Errors + type: Database +component: PostgreSQL +host labels: _hostname=!* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + summary: PostgreSQL table ${label:table} db ${label:database} last autovacuum + info: Time elapsed since db ${label:database} table ${label:table} was vacuumed by the autovacuum daemon + to: dba + + template: postgres_table_last_autoanalyze_time + on: postgres.table_autoanalyze_since_time + class: Errors + type: Database +component: PostgreSQL +host labels: _hostname=!* + calc: $time + units: seconds + every: 1m + warn: $this != nan AND $this > (60 * 60 * 24 * 7) + summary: PostgreSQL table ${label:table} db ${label:database} last autoanalyze + info: Time elapsed since db ${label:database} table ${label:table} was analyzed by the autovacuum daemon + to: dba + +# Index alarms + + template: postgres_index_bloat_size_perc + on: postgres.index_bloat_size_perc + class: Errors + type: Database +component: PostgreSQL + calc: ($index_size > (1024 * 1024 * 10)) ? ($bloat) : (0) + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (60) : (70)) + crit: $this > (($status == $CRITICAL) ? 
(70) : (80)) + delay: down 15m multiplier 1.5 max 1h + summary: PostgreSQL table ${label:table} db ${label:database} index bloat size + info: Bloat size percentage in db ${label:database} table ${label:table} index ${label:index} + to: dba diff --git a/health/health.d/processes.conf b/src/health/health.d/processes.conf index 8f2e0fda5..2029c76e4 100644 --- a/health/health.d/processes.conf +++ b/src/health/health.d/processes.conf @@ -5,7 +5,6 @@ class: Workload type: System component: Processes - hosts: * calc: $active * 100 / $pidmax units: % every: 5s diff --git a/src/health/health.d/python.d.plugin.conf b/src/health/health.d/python.d.plugin.conf new file mode 100644 index 000000000..f962b07f2 --- /dev/null +++ b/src/health/health.d/python.d.plugin.conf @@ -0,0 +1,17 @@ +# make sure python.d.plugin data collection job is running + + template: python.d_job_last_collected_secs + on: netdata.pythond_runtime + class: Errors + type: Netdata + component: python.d.plugin +host labels: _hostname=!* + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: Python.d plugin last collection + info: Number of seconds since the last successful data collection + to: webmaster diff --git a/src/health/health.d/qos.conf b/src/health/health.d/qos.conf new file mode 100644 index 000000000..f524a1578 --- /dev/null +++ b/src/health/health.d/qos.conf @@ -0,0 +1,16 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# check if a QoS class is dropping packets +# the alarm is checked every 10 seconds +# and examines the last minute of data + + template: 10min_qos_packet_drops + on: tc.qos_dropped +host labels: _os=linux + lookup: sum -5m unaligned absolute + every: 30s + warn: $this > 0 + units: packets + summary: QOS packet drops + info: Dropped packets in the last 5 minutes + to: silent diff --git a/src/health/health.d/ram.conf b/src/health/health.d/ram.conf new file mode 100644 index 000000000..573bc0aca --- /dev/null +++ b/src/health/health.d/ram.conf @@ -0,0 +1,76 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: ram_in_use + on: system.ram + class: Utilization + type: System + component: Memory +host labels: _os=linux + calc: $used * 100 / ($used + $cached + $free + $buffers) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: System memory utilization + info: System memory utilization + to: sysadmin + + alarm: ram_available + on: mem.available + class: Utilization + type: System + component: Memory +host labels: _os=linux + calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? 
(15) : (10)) + delay: down 15m multiplier 1.5 max 1h + summary: System available memory + info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping + to: silent + + alarm: oom_kill + on: mem.oom_kill +host labels: _os=linux + lookup: sum -30m unaligned + units: kills + every: 5m + warn: $this > 0 + delay: down 10m + summary: System OOM kills + info: Number of out of memory kills in the last 30 minutes + to: silent + +## FreeBSD + alarm: ram_in_use + on: system.ram + class: Utilization + type: System + component: Memory +host labels: _os=freebsd + calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: System memory utilization + info: System memory utilization + to: sysadmin + + alarm: ram_available + on: mem.available + class: Utilization + type: System + component: Memory +host labels: _os=freebsd + calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers) + units: % + every: 10s + warn: $this < (($status >= $WARNING) ? 
(15) : (10)) + delay: down 15m multiplier 1.5 max 1h + summary: System available memory + info: Percentage of estimated amount of RAM available for userspace processes, without causing swapping + to: silent diff --git a/src/health/health.d/redis.conf b/src/health/health.d/redis.conf new file mode 100644 index 000000000..4f82830a9 --- /dev/null +++ b/src/health/health.d/redis.conf @@ -0,0 +1,58 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: redis_connections_rejected + on: redis.connections + class: Errors + type: KV Storage +component: Redis + lookup: sum -1m unaligned of rejected + every: 10s + units: connections + warn: $this > 0 + summary: Redis rejected connections + info: Connections rejected because of maxclients limit in the last minute + delay: down 5m multiplier 1.5 max 1h + to: dba + + template: redis_bgsave_broken + on: redis.bgsave_health + class: Errors + type: KV Storage +component: Redis + every: 10s + calc: $last_bgsave != nan AND $last_bgsave != 0 + crit: $this + units: ok/failed + summary: Redis background save + info: Status of the last RDB save operation (0: ok, 1: error) + delay: down 5m multiplier 1.5 max 1h + to: dba + + template: redis_bgsave_slow + on: redis.bgsave_now + class: Latency + type: KV Storage +component: Redis + every: 10s + calc: $current_bgsave_time + warn: $this > 600 + crit: $this > 1200 + units: seconds + summary: Redis slow background save + info: Duration of the on-going RDB save operation + delay: down 5m multiplier 1.5 max 1h + to: dba + + template: redis_master_link_down + on: redis.master_link_down_since_time + class: Errors + type: KV Storage +component: Redis + every: 10s + calc: $time + units: seconds + crit: $this != nan AND $this > 0 + summary: Redis master link down + info: Time elapsed since the link between master and slave is down + delay: down 5m multiplier 1.5 max 1h + to: dba diff --git a/health/health.d/retroshare.conf b/src/health/health.d/retroshare.conf 
index c665430fa..c665430fa 100644 --- a/health/health.d/retroshare.conf +++ b/src/health/health.d/retroshare.conf diff --git a/health/health.d/riakkv.conf b/src/health/health.d/riakkv.conf index 677e3cb4f..677e3cb4f 100644 --- a/health/health.d/riakkv.conf +++ b/src/health/health.d/riakkv.conf diff --git a/health/health.d/scaleio.conf b/src/health/health.d/scaleio.conf index b089cb85e..b089cb85e 100644 --- a/health/health.d/scaleio.conf +++ b/src/health/health.d/scaleio.conf diff --git a/src/health/health.d/softnet.conf b/src/health/health.d/softnet.conf new file mode 100644 index 000000000..03a4ceebd --- /dev/null +++ b/src/health/health.d/softnet.conf @@ -0,0 +1,53 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# check for common /proc/net/softnet_stat errors + + alarm: 1min_netdev_backlog_exceeded + on: system.softnet_stat + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of dropped + units: packets + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System netdev dropped packets + info: Average number of dropped packets in the last minute \ + due to exceeded net.core.netdev_max_backlog + to: silent + + alarm: 1min_netdev_budget_ran_outs + on: system.softnet_stat + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of squeezed + units: events + every: 10s + warn: $this > (($status >= $WARNING) ? 
(0) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System netdev budget run outs + info: Average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \ + net.core.netdev_budget_usecs with work remaining over the last minute \ + (this can be a cause for dropped packets) + to: silent + + alarm: 10min_netisr_backlog_exceeded + on: system.softnet_stat + class: Errors + type: System + component: Network +host labels: _os=freebsd + lookup: average -1m unaligned absolute of qdrops + units: packets + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + delay: down 1h multiplier 1.5 max 2h + summary: System netisr drops + info: Average number of drops in the last minute \ + due to exceeded sysctl net.route.netisr_maxqlen \ + (this can be a cause for dropped packets) + to: silent diff --git a/src/health/health.d/storcli.conf b/src/health/health.d/storcli.conf new file mode 100644 index 000000000..be71b517e --- /dev/null +++ b/src/health/health.d/storcli.conf @@ -0,0 +1,61 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# Controllers + + template: storcli_controller_health_status + on: storcli.controller_health_status + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of healthy + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 2 max 10m + summary: RAID controller ${label:controller_number} health + info: RAID controller ${label:controller_number} is unhealthy + to: sysadmin + + template: storcli_controller_bbu_status + on: storcli.controller_bbu_status + class: Errors + type: System +component: RAID + lookup: average -1m unaligned percentage of healthy,na + units: % + every: 10s + crit: $this < 100 + delay: down 5m multiplier 2 max 10m + summary: RAID controller ${label:controller_number} BBU health + info: RAID controller ${label:controller_number} BBU is unhealthy + to: sysadmin + +# Physical Drives + + template: storcli_phys_drive_errors 
+ on: storcli.phys_drive_errors + class: Errors + type: System +component: RAID + lookup: sum -10s + units: errors + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors + info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} errors + to: sysadmin + + template: storcli_phys_drive_predictive_failures + on: storcli.phys_drive_predictive_failures + class: Errors + type: System +component: RAID + lookup: sum -10s + units: failures + every: 10s + warn: $this > 0 + delay: up 1m down 5m multiplier 2 max 10m + summary: RAID PD c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures + info: RAID physical drive c${label:controller_number}/e${label:enclosure_number}/s${label:slot_number} predictive failures + to: sysadmin diff --git a/src/health/health.d/swap.conf b/src/health/health.d/swap.conf new file mode 100644 index 000000000..297aebd1e --- /dev/null +++ b/src/health/health.d/swap.conf @@ -0,0 +1,34 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 30min_ram_swapped_out + on: mem.swapio + class: Workload + type: System + component: Memory +host labels: _os=linux freebsd + lookup: sum -30m unaligned absolute of out + # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 + calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) + units: % of RAM + every: 1m + warn: $this > (($status >= $WARNING) ? (20) : (30)) + delay: down 15m multiplier 1.5 max 1h + summary: System memory swapped out + info: Percentage of the system RAM swapped in the last 30 minutes + to: silent + + alarm: used_swap + on: mem.swap + class: Utilization + type: System + component: Memory +host labels: _os=linux freebsd + calc: (($used + $free) > 0) ? 
($used * 100 / ($used + $free)) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: up 30s down 15m multiplier 1.5 max 1h + summary: System swap memory utilization + info: Swap memory utilization + to: sysadmin diff --git a/health/health.d/synchronization.conf b/src/health/health.d/synchronization.conf index 6c947d90b..28b1817ac 100644 --- a/health/health.d/synchronization.conf +++ b/src/health/health.d/synchronization.conf @@ -2,7 +2,6 @@ on: mem.sync lookup: sum -1m of sync units: calls - plugin: ebpf.plugin every: 1m warn: $this > 6 delay: up 1m down 10m multiplier 1.5 max 1h diff --git a/src/health/health.d/systemdunits.conf b/src/health/health.d/systemdunits.conf new file mode 100644 index 000000000..bb5c627e8 --- /dev/null +++ b/src/health/health.d/systemdunits.conf @@ -0,0 +1,177 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +## Service units + template: systemd_service_unit_failed_state + on: systemd.service_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd service unit in the failed state + to: sysadmin + +## Socket units + template: systemd_socket_unit_failed_state + on: systemd.socket_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd socket unit in the failed state + to: sysadmin + +## Target units + template: systemd_target_unit_failed_state + on: systemd.target_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + 
units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd target unit in the failed state + to: sysadmin + +## Path units + template: systemd_path_unit_failed_state + on: systemd.path_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd path unit in the failed state + to: sysadmin + +## Device units + template: systemd_device_unit_failed_state + on: systemd.device_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd device unit in the failed state + to: sysadmin + +## Mount units + template: systemd_mount_unit_failed_state + on: systemd.mount_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd mount units in the failed state + to: sysadmin + +## Automount units + template: systemd_automount_unit_failed_state + on: systemd.automount_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd automount unit in the failed state + to: sysadmin + +## Swap units + template: systemd_swap_unit_failed_state + on: systemd.swap_unit_state + class: Errors + type: Linux + component: 
Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd swap units in the failed state + to: sysadmin + +## Scope units + template: systemd_scope_unit_failed_state + on: systemd.scope_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd scope units in the failed state + to: sysadmin + +## Slice units + template: systemd_slice_unit_failed_state + on: systemd.slice_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd slice units in the failed state + to: sysadmin + +## Timer units + template: systemd_timer_unit_failed_state + on: systemd.timer_unit_state + class: Errors + type: Linux + component: Systemd units +chart labels: unit_name=!* + calc: $failed + units: state + every: 10s + warn: $this != nan AND $this == 1 + delay: down 5m multiplier 1.5 max 1h + summary: systemd unit ${label:unit_name} state + info: systemd timer unit in the failed state + to: sysadmin diff --git a/src/health/health.d/tcp_conn.conf b/src/health/health.d/tcp_conn.conf new file mode 100644 index 000000000..fe4b98db0 --- /dev/null +++ b/src/health/health.d/tcp_conn.conf @@ -0,0 +1,21 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ${tcp_max_connections} may be nan or -1 if the system +# supports dynamic threshold for TCP connections. +# In this case, the alarm will always be zero. 
+ + alarm: tcp_connections + on: ip.tcpsock + class: Workload + type: System + component: Network +host labels: _os=linux + calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0 + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 )) + crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 )) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP connections utilization + info: IPv4 TCP connections utilization + to: sysadmin diff --git a/src/health/health.d/tcp_listen.conf b/src/health/health.d/tcp_listen.conf new file mode 100644 index 000000000..bdcce79d4 --- /dev/null +++ b/src/health/health.d/tcp_listen.conf @@ -0,0 +1,93 @@ +# There are two queues involved when incoming TCP connections are handled +# (both at the kernel): +# +# SYN queue +# The SYN queue tracks TCP handshakes until connections are fully established. +# It overflows when too many incoming TCP connection requests hang in the +# half-open state and the server is not configured to fall back to SYN cookies. +# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends +# lots of SYN packets and never completes the handshakes). +# +# Accept queue +# The accept queue holds fully established TCP connections waiting to be handled +# by the listening application. It overflows when the server application fails +# to accept new connections at the rate they are coming in. +# +# +# ----------------------------------------------------------------------------- +# tcp accept queue (at the kernel) + + alarm: 1m_tcp_accept_queue_overflows + on: ip.tcp_accept_queue + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -60s unaligned absolute of ListenOverflows + units: overflows + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? 
(1) : (5)) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP accept queue overflows + info: Average number of overflows in the TCP accept queue over the last minute + to: silent + +# THIS IS TOO GENERIC +# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 + alarm: 1m_tcp_accept_queue_drops + on: ip.tcp_accept_queue + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -60s unaligned absolute of ListenDrops + units: drops + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (1) : (5)) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP accept queue dropped packets + info: Average number of dropped packets in the TCP accept queue over the last minute + to: silent + +# ----------------------------------------------------------------------------- +# tcp SYN queue (at the kernel) + +# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or +# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are +# enabled or not. In both cases this probably indicates a SYN flood attack, +# so i guess a notification should be sent. + + alarm: 1m_tcp_syn_queue_drops + on: ip.tcp_syn_queue + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -60s unaligned absolute of TCPReqQFullDrop + units: drops + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? (0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h + summary: System TCP SYN queue drops + info: Average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \ + (SYN cookies were not enabled) + to: silent + + alarm: 1m_tcp_syn_queue_cookies + on: ip.tcp_syn_queue + class: Workload + type: System + component: Network +host labels: _os=linux + lookup: average -60s unaligned absolute of TCPReqQFullDoCookies + units: cookies + every: 10s + warn: $this > 1 + crit: $this > (($status == $CRITICAL) ? 
(0) : (5)) + delay: up 10 down 5m multiplier 1.5 max 1h + summary: System TCP SYN queue cookies + info: Average number of sent SYN cookies due to the full TCP SYN queue over the last minute + to: silent diff --git a/src/health/health.d/tcp_mem.conf b/src/health/health.d/tcp_mem.conf new file mode 100644 index 000000000..b9350e3cd --- /dev/null +++ b/src/health/health.d/tcp_mem.conf @@ -0,0 +1,22 @@ +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# We give a warning when TCP is under memory pressure +# and a critical when TCP is 90% of its upper memory limit +# + + alarm: tcp_memory + on: ipv4.sockstat_tcp_mem + class: Utilization + type: System + component: Network +host labels: _os=linux + calc: ${mem} * 100 / ${tcp_mem_high} + units: % + every: 10s + warn: ${mem} > (($status >= $WARNING ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure} )) + crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP memory utilization + info: TCP memory utilization + to: silent diff --git a/src/health/health.d/tcp_orphans.conf b/src/health/health.d/tcp_orphans.conf new file mode 100644 index 000000000..7b2d95edb --- /dev/null +++ b/src/health/health.d/tcp_orphans.conf @@ -0,0 +1,22 @@ +# check +# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html +# +# The kernel may penalize orphans by 2x or even 4x +# so we alarm warning at 25% and critical at 50% +# + + alarm: tcp_orphans + on: ipv4.sockstat_tcp_sockets + class: Errors + type: System + component: Network +host labels: _os=linux + calc: ${orphan} * 100 / ${tcp_max_orphans} + units: % + every: 10s + warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 )) + crit: $this > (($status == $CRITICAL) ? 
( 25 ) : ( 50 )) + delay: up 0 down 5m multiplier 1.5 max 1h + summary: System TCP orphan sockets utilization + info: Orphan IPv4 TCP sockets utilization + to: silent diff --git a/src/health/health.d/tcp_resets.conf b/src/health/health.d/tcp_resets.conf new file mode 100644 index 000000000..63f798d78 --- /dev/null +++ b/src/health/health.d/tcp_resets.conf @@ -0,0 +1,66 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# tcp resets this host sends + + alarm: 1m_ip_tcp_resets_sent + on: ip.tcphandshake + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -1m at -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + info: average number of sent TCP RESETS over the last minute + + alarm: 10s_ip_tcp_resets_sent + on: ip.tcphandshake + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -10s unaligned absolute of OutRsts + units: tcp resets/s + every: 10s + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_sent < 5)?(5):($1m_ip_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) + delay: up 20s down 60m multiplier 1.2 max 2h + options: no-clear-notification + summary: System TCP outbound resets + info: Average number of sent TCP RESETS over the last 10 seconds. \ + This can indicate a port scan, \ + or that a service running on this host has crashed. \ + Netdata will not send a clear notification for this alarm. 
+ to: silent + +# ----------------------------------------------------------------------------- +# tcp resets this host receives + + alarm: 1m_ip_tcp_resets_received + on: ip.tcphandshake + class: Errors + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -1m at -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + info: average number of received TCP RESETS over the last minute + + alarm: 10s_ip_tcp_resets_received + on: ip.tcphandshake + class: Errors + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -10s unaligned absolute of AttemptFails + units: tcp resets/s + every: 10s + warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ip_tcp_resets_received < 5)?(5):($1m_ip_tcp_resets_received)) * (($status >= $WARNING) ? (1) : (10))) + delay: up 20s down 60m multiplier 1.2 max 2h + options: no-clear-notification + summary: System TCP inbound resets + info: average number of received TCP RESETS over the last 10 seconds. \ + This can be an indication that a service this host needs has crashed. \ + Netdata will not send a clear notification for this alarm. + to: silent diff --git a/src/health/health.d/timex.conf b/src/health/health.d/timex.conf new file mode 100644 index 000000000..053dc9290 --- /dev/null +++ b/src/health/health.d/timex.conf @@ -0,0 +1,17 @@ +# It can take several minutes before ntpd selects a server to synchronize with; +# try checking after 17 minutes (1024 seconds). 
+ + alarm: system_clock_sync_state + on: system.clock_sync_state + class: Errors + type: System + component: Clock +host labels: _os=linux + calc: $state + units: synchronization state + every: 10s + warn: $system.uptime.uptime > 17 * 60 AND $this == 0 + delay: down 5m + summary: System clock sync state + info: When set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server + to: silent diff --git a/src/health/health.d/udp_errors.conf b/src/health/health.d/udp_errors.conf new file mode 100644 index 000000000..745c11e21 --- /dev/null +++ b/src/health/health.d/udp_errors.conf @@ -0,0 +1,37 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + +# ----------------------------------------------------------------------------- +# UDP receive buffer errors + + alarm: 1m_ipv4_udp_receive_buffer_errors + on: ipv4.udperrors + class: Errors + type: System + component: Network +host labels: _os=linux freebsd + lookup: average -1m unaligned absolute of RcvbufErrors + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? (0) : (10)) + summary: System UDP receive buffer errors + info: Average number of UDP receive buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h + to: silent + +# ----------------------------------------------------------------------------- +# UDP send buffer errors + + alarm: 1m_ipv4_udp_send_buffer_errors + on: ipv4.udperrors + class: Errors + type: System + component: Network +host labels: _os=linux + lookup: average -1m unaligned absolute of SndbufErrors + units: errors + every: 10s + warn: $this > (($status >= $WARNING) ? 
(0) : (10)) + summary: System UDP send buffer errors + info: Average number of UDP send buffer errors over the last minute + delay: up 1m down 60m multiplier 1.2 max 2h + to: silent diff --git a/health/health.d/unbound.conf b/src/health/health.d/unbound.conf index 3c898f1d5..3c898f1d5 100644 --- a/health/health.d/unbound.conf +++ b/src/health/health.d/unbound.conf diff --git a/src/health/health.d/upsd.conf b/src/health/health.d/upsd.conf new file mode 100644 index 000000000..17eb5263d --- /dev/null +++ b/src/health/health.d/upsd.conf @@ -0,0 +1,46 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: upsd_10min_ups_load + on: upsd.ups_load + class: Utilization + type: Power Supply +component: UPS + lookup: average -10m unaligned of load + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 10m multiplier 1.5 max 1h + summary: UPS ${label:ups_name} load + info: UPS ${label:ups_name} average load over the last 10 minutes + to: sitemgr + + template: upsd_ups_battery_charge + on: upsd.ups_battery_charge + class: Errors + type: Power Supply +component: UPS + lookup: average -60s unaligned of charge + units: % + every: 60s + warn: $this < 75 + crit: $this < 40 + delay: down 10m multiplier 1.5 max 1h + summary: UPS ${label:ups_name} battery charge + info: UPS ${label:ups_name} average battery charge over the last minute + to: sitemgr + + template: upsd_ups_last_collected_secs + on: upsd.ups_load + class: Latency + type: Power Supply +component: UPS device + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + summary: UPS ${label:ups_name} last collected + info: UPS ${label:ups_name} number of seconds since the last successful data collection + to: sitemgr diff --git a/health/health.d/vcsa.conf b/src/health/health.d/vcsa.conf index 3e20bfd1e..3e20bfd1e 100644 --- a/health/health.d/vcsa.conf +++ b/src/health/health.d/vcsa.conf diff --git a/health/health.d/vernemq.conf b/src/health/health.d/vernemq.conf index 6ea9f99dc..6ea9f99dc 100644 --- a/health/health.d/vernemq.conf +++ b/src/health/health.d/vernemq.conf diff --git a/src/health/health.d/vsphere.conf b/src/health/health.d/vsphere.conf new file mode 100644 index 000000000..e22f0b620 --- /dev/null +++ b/src/health/health.d/vsphere.conf @@ -0,0 +1,66 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +# -----------------------------------------------Virtual Machine-------------------------------------------------------- + + template: vsphere_vm_cpu_utilization + on: vsphere.vm_cpu_utilization + class: Utilization + type: Virtual Machine +component: CPU + lookup: average -10m unaligned match-names of used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere CPU utilization for VM ${label:vm} + info: CPU utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: silent + + template: vsphere_vm_mem_utilization + on: vsphere.vm_mem_utilization + class: Utilization + type: Virtual Machine +component: Memory + calc: $used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere memory utilization for VM ${label:vm} + info: Memory utilization VM ${label:vm} host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: silent + +# -----------------------------------------------ESXI host-------------------------------------------------------------- + + template: vsphere_host_cpu_utilization + on: vsphere.host_cpu_utilization + class: Utilization + type: Virtual Machine +component: CPU + lookup: average -10m unaligned match-names of used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere ESXi CPU utilization for host ${label:host} + info: CPU utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: sysadmin + + template: vsphere_host_mem_utilization + on: vsphere.host_mem_utilization + class: Utilization + type: Virtual Machine +component: Memory + calc: $used + units: % + every: 20s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: vSphere ESXi Ram utilization for host ${label:host} + info: Memory utilization ESXi host ${label:host} cluster ${label:cluster} datacenter ${label:datacenter} + to: sysadmin diff --git a/health/health.d/web_log.conf b/src/health/health.d/web_log.conf index 78f1cc7f5..78f1cc7f5 100644 --- a/health/health.d/web_log.conf +++ b/src/health/health.d/web_log.conf diff --git a/src/health/health.d/whoisquery.conf b/src/health/health.d/whoisquery.conf new file mode 100644 index 000000000..6d87ad280 --- /dev/null +++ b/src/health/health.d/whoisquery.conf @@ -0,0 +1,14 @@ + + template: whoisquery_days_until_expiration + on: whoisquery.time_until_expiration + class: Utilization + type: Other +component: WHOIS + calc: $expiry / 86400 + units: days + every: 60s + warn: $this < $days_until_expiration_warning + crit: $this < $days_until_expiration_critical + summary: Whois expiration time for domain ${label:domain} + info: Time until the domain name registration for ${label:domain} expires + to: webmaster diff --git a/src/health/health.d/windows.conf b/src/health/health.d/windows.conf new file mode 100644 index 000000000..9dfda50c1 --- /dev/null +++ b/src/health/health.d/windows.conf @@ -0,0 +1,108 @@ +## CPU + + template: windows_10min_cpu_usage + on: windows.cpu_utilization_total + class: Utilization + type: Windows +component: CPU + lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + summary: CPU utilization + info: Average CPU utilization over the last 10 minutes + to: silent + +## Memory + + template: windows_ram_in_use + on: windows.memory_utilization + class: Utilization + type: Windows +component: Memory + calc: ($used) * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? 
(80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Ram utilization + info: Memory utilization + to: sysadmin + +## Network + + template: windows_inbound_packets_discarded + on: windows.net_nic_discarded + class: Errors + type: Windows +component: Network + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Inbound network packets discarded + info: Number of inbound discarded packets for the network interface in the last 10 minutes + to: silent + + template: windows_outbound_packets_discarded + on: windows.net_nic_discarded + class: Errors + type: Windows +component: Network + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Outbound network packets discarded + info: Number of outbound discarded packets for the network interface in the last 10 minutes + to: silent + + template: windows_inbound_packets_errors + on: windows.net_nic_errors + class: Errors + type: Windows +component: Network + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Inbound network errors + info: Number of inbound errors for the network interface in the last 10 minutes + to: silent + + template: windows_outbound_packets_errors + on: windows.net_nic_errors + class: Errors + type: Windows +component: Network + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + summary: Outbound network errors + info: Number of outbound errors for the network interface in the last 10 minutes + to: silent + +## Disk + + template: windows_disk_in_use + on: windows.logical_disk_space_usage + class: Utilization + type: Windows +component: 
Disk + calc: ($used) * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + summary: Disk space usage + info: Disk space utilization + to: sysadmin diff --git a/src/health/health.d/x509check.conf b/src/health/health.d/x509check.conf new file mode 100644 index 000000000..1d40c8602 --- /dev/null +++ b/src/health/health.d/x509check.conf @@ -0,0 +1,26 @@ + + template: x509check_days_until_expiration + on: x509check.time_until_expiration + class: Latency + type: Certificates +component: x509 certificates + calc: $expiry / 86400 + units: days + every: 60s + warn: $this < $days_until_expiration_warning + crit: $this < $days_until_expiration_critical + summary: x509 certificate expiration for ${label:source} + info: Time until x509 certificate expires for ${label:source} + to: webmaster + + template: x509check_revocation_status + on: x509check.revocation_status + class: Errors + type: Certificates +component: x509 certificates + calc: $revoked + every: 60s + crit: $this != nan AND $this != 0 + summary: x509 certificate revocation status for ${label:source} + info: x509 certificate revocation status (0: revoked, 1: valid) for ${label:source} + to: webmaster diff --git a/src/health/health.d/zfs.conf b/src/health/health.d/zfs.conf new file mode 100644 index 000000000..9c1f0018b --- /dev/null +++ b/src/health/health.d/zfs.conf @@ -0,0 +1,90 @@ + + alarm: zfs_memory_throttle + on: zfs.memory_ops + class: Utilization + type: System +component: File system + lookup: sum -10m unaligned absolute of throttled + units: events + every: 1m + warn: $this > 0 + delay: down 1h multiplier 1.5 max 2h + summary: ZFS ARC growth throttling + info: number of times ZFS had to limit the ARC growth in the last 10 minutes + to: silent + +# ZFS pool state + + template: zfs_pool_state_warn + on: zfspool.state + class: Errors + type: System +component: File 
system + calc: $degraded + units: boolean + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is degraded + to: sysadmin + + template: zfs_pool_state_crit + on: zfspool.state + class: Errors + type: System +component: File system + calc: $faulted + $unavail + units: boolean + every: 10s + crit: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: Critical ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is faulted or unavail + to: sysadmin + + +## go.d/zfspool + + template: zfs_pool_space_utilization + on: zfspool.pool_space_utilization + class: Utilization + type: System +component: File system + calc: $utilization + units: % + every: 1m + warn: $this > (($status >= $WARNING ) ? (85) : (90)) + crit: $this > (($status >= $WARNING ) ? (90) : (98)) + delay: down 1m multiplier 1.5 max 1h + summary: ZFS pool ${label:pool} space utilization + info: ZFS pool ${label:pool} is nearing capacity. Current space usage is above the threshold. + to: sysadmin + + template: zfs_pool_health_state_warn + on: zfspool.pool_health_state + class: Errors + type: System +component: File system + calc: $degraded + units: boolean + every: 10s + warn: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is degraded + to: sysadmin + + template: zfs_pool_health_state_crit + on: zfspool.pool_health_state + class: Errors + type: System +component: File system + calc: $faulted + $unavail + units: boolean + every: 10s + crit: $this > 0 + delay: down 1m multiplier 1.5 max 1h + summary: Critical ZFS pool ${label:pool} state + info: ZFS pool ${label:pool} state is faulted or unavail + to: sysadmin |