diff options
Diffstat (limited to 'health/health.d')
-rw-r--r-- | health/health.d/dbengine.conf | 26
-rw-r--r-- | health/health.d/disks.conf | 4
-rw-r--r-- | health/health.d/dnsmasq_dhcp.conf | 12
-rw-r--r-- | health/health.d/pihole.conf | 67
-rw-r--r-- | health/health.d/processes.conf | 27
-rw-r--r-- | health/health.d/ram.conf | 2
-rw-r--r-- | health/health.d/riakkv.conf | 80
-rw-r--r-- | health/health.d/wmi.conf | 130
-rw-r--r-- | health/health.d/x509check.conf | 14
9 files changed, 359 insertions, 3 deletions
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf new file mode 100644 index 000000000..7a623ba2b --- /dev/null +++ b/health/health.d/dbengine.conf @@ -0,0 +1,26 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: 10min_dbengine_global_fs_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of FS errors + units: errors + every: 10s + crit: $this > 0 + delay: down 15m multiplier 1.5 max 1h + info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc) + to: sysadmin + + alarm: 10min_dbengine_global_io_errors + on: netdata.dbengine_global_errors + os: linux freebsd macos + hosts: * + lookup: sum -10m unaligned of I/O errors + units: errors + every: 10s + crit: $this > 0 + delay: down 1h multiplier 1.5 max 3h + info: number of IO errors dbengine came across the last 10 minutes (out of space, bad disk etc) + to: sysadmin diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 26f85848a..9c194ced2 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -13,7 +13,7 @@ template: disk_space_usage on: disk.space os: linux freebsd hosts: * -families: * +families: !/dev !/dev/* !/run !/run/* * calc: $used * 100 / ($avail + $used) units: % every: 1m @@ -27,7 +27,7 @@ template: disk_inode_usage on: disk.inodes os: linux freebsd hosts: * -families: * +families: !/dev !/dev/* !/run !/run/* * calc: $used * 100 / ($avail + $used) units: % every: 1m diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf new file mode 100644 index 000000000..b7eb4e0a3 --- /dev/null +++ b/health/health.d/dnsmasq_dhcp.conf @@ -0,0 +1,12 @@ + # dhcp-range utilization + + template: dnsmasq_dhcp_dhcp_range_utilization + on: dnsmasq_dhcp.dhcp_range_utilization + every: 10s + units: % + calc: $used + warn: $this > ( ($status >= $WARNING ) ? 
( 80 ) : ( 90 ) ) + crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) ) + delay: down 5m + info: dhcp-range utilization above threshold! + to: sysadmin diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf new file mode 100644 index 000000000..4a1217239 --- /dev/null +++ b/health/health.d/pihole.conf @@ -0,0 +1,67 @@ + + # Make sure Pi-hole is responding. + +template: pihole_last_collected_secs + on: pihole.dns_queries_total + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + # Blocked DNS queries. + + template: pihole_blocked_queries + on: pihole.dns_queries_percentage + every: 10s + units: % + calc: $blocked + warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) ) + crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) ) + delay: up 2m down 5m + info: percentage of blocked dns queries for the last 24 hour + to: sysadmin + + + # Blocklist last update time. + # Default update interval is a week. + + template: pihole_blocklist_last_update + on: pihole.blocklist_last_update + every: 10s + units: seconds + calc: $ago + warn: $this > 60 * 60 * 24 * 8 + crit: $this > 60 * 60 * 24 * 8 * 2 + info: blocklist last update time + to: sysadmin + + + # Gravity file check (gravity.list). + + template: pihole_blocklist_gravity_file + on: pihole.blocklist_last_update + every: 10s + units: boolean + calc: $file_exists + crit: $this != 1 + delay: up 2m down 5m + info: gravity file existence + to: sysadmin + + + # Pi-hole's ability to block unwanted domains. + # Should be enabled. The whole point of Pi-hole! 
+ + template: pihole_status + on: pihole.unwanted_domains_blocking_status + every: 10s + units: boolean + calc: $enabled + warn: $this != 1 + delay: up 2m down 5m + info: unwanted domains blocking status + to: sysadmin diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf new file mode 100644 index 000000000..d96998fdf --- /dev/null +++ b/health/health.d/processes.conf @@ -0,0 +1,27 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + alarm: active_processes_limit_freebsd + on: system.active_processes + os: freebsd + hosts: * + calc: $active + units: processes + every: 5s + warn: $this > (($status >= $WARNING) ? (75000) : (80000)) + crit: $this > (($status == $CRITICAL) ? (85000) : (90000)) + delay: down 5m multiplier 1.5 max 1h + info: the number of active processes + to: sysadmin + + alarm: active_processes_limit + on: system.active_processes + os: linux + hosts: * + calc: $active + units: processes + every: 5s + warn: $this > (($status >= $WARNING) ? (25000) : (26000)) + crit: $this > (($status == $CRITICAL) ? (28000) : (30000)) + delay: down 5m multiplier 1.5 max 1h + info: number of active processes + to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 93883f73b..4e41bb496 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -27,7 +27,7 @@ on: mem.available os: linux hosts: * - calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) + calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers) units: % every: 10s warn: $this < (($status >= $WARNING) ? (15) : (10)) diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf new file mode 100644 index 000000000..745302778 --- /dev/null +++ b/health/health.d/riakkv.conf @@ -0,0 +1,80 @@ +# Ensure that Riak is running. 
template: riak_last_collected_secs +template: riak_last_collected_secs + on: riak.kv.throughput + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: dba + +# Warn if a list keys operation is running. +template: riak_list_keys_active + on: riak.core.fsm_active + calc: $list_fsm_active + units: state machines + every: 10s + warn: $list_fsm_active > 0 + info: number of currently running list keys finite state machines + to: dba + + +## Timing healthchecks +# KV GET +template: 1h_kv_get_mean_latency + on: riak.kv.latency.get + calc: $node_get_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: mean average KV GET latency over the last hour + +template: riak_kv_get_slow + on: riak.kv.latency.get + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($1h_kv_get_mean_latency * 2) ) + crit: ($this > ($1h_kv_get_mean_latency * 3) ) + info: average KV GET time over the last 3 minutes, compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + +# KV PUT +template: 1h_kv_put_mean_latency + on: riak.kv.latency.put + calc: $node_put_fsm_time_mean + lookup: average -1h unaligned of time + every: 30s + units: ms + info: mean average KV PUT latency over the last hour + +template: riak_kv_put_slow + on: riak.kv.latency.put + calc: $mean + lookup: average -3m unaligned of time + units: ms + every: 10s + warn: ($this > ($1h_kv_put_mean_latency * 2) ) + crit: ($this > ($1h_kv_put_mean_latency * 3) ) + info: average KV PUT time over the last 3 minutes, compared to the average over the last hour + delay: down 5m multiplier 1.5 max 1h + to: dba + + +## VM healthchecks + +# Default Erlang VM 
process limit: 262144 +# On systems observed, this is < 2000, but may grow depending on load. +template: riak_vm_high_process_count + on: riak.vm + calc: $sys_process_count + units: processes + every: 10s + warn: $this > 10000 + crit: $this > 100000 + info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144) + to: dba diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf new file mode 100644 index 000000000..0441fc1f3 --- /dev/null +++ b/health/health.d/wmi.conf @@ -0,0 +1,130 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + +## Availability + +template: wmi_last_collected_secs + on: cpu.collector_duration + calc: $now - $last_collected_t + units: seconds ago + every: 10s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sysadmin + +## CPU + +template: wmi_10min_cpu_usage + on: wmi.cpu_utilization_total + os: linux + hosts: * + lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (75) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 15m multiplier 1.5 max 1h + info: cpu utilization for the last 10 minutes + to: sysadmin + + +## Memory + +template: wmi_ram_in_use + on: wmi.memory_utilization + os: linux + hosts: * + calc: ($used) * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used RAM + to: sysadmin + +template: wmi_swap_in_use + on: wmi.memory_swap_utilization + os: linux + hosts: * + calc: ($used) * 100 / ($used + $available) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? (90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used Swap + to: sysadmin + + +## Network + +template: inbound_packets_discarded + on: wmi.net_discarded + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound discarded packets in the last 10 minutes + to: sysadmin + +template: outbound_packets_discarded + on: wmi.net_discarded + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound discarded packets in the last 10 minutes + to: sysadmin + +template: inbound_packets_errors + on: wmi.net_errors + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of inbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface inbound errors in the last 10 minutes + to: sysadmin + +template: outbound_packets_errors + on: wmi.net_errors + os: linux + hosts: * +families: * + lookup: sum -10m unaligned absolute match-names of outbound + units: packets + every: 1m + warn: $this >= 5 + delay: down 1h multiplier 1.5 max 2h + info: interface outbound errors in the last 10 minutes + to: sysadmin + + +## Disk + +template: wmi_disk_in_use + on: wmi.logical_disk_utilization + os: linux + hosts: * + calc: ($used) * 100 / ($used + $free) + units: % + every: 10s + warn: $this > (($status >= $WARNING) ? (80) : (90)) + crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + delay: down 15m multiplier 1.5 max 1h + info: used disk space + to: sysadmin diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf index dc0e6c695..a56f48fc3 100644 --- a/health/health.d/x509check.conf +++ b/health/health.d/x509check.conf @@ -1,4 +1,18 @@ +# make sure x509check is running + +template: x509check_last_collected_secs + on: x509check.time_until_expiration + calc: $now - $last_collected_t + units: seconds ago + every: 60s + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: webmaster + + template: x509check_days_until_expiration on: x509check.time_until_expiration calc: $expiry |