summary refs log tree commit diff stats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d')
-rw-r--r-- health/health.d/dbengine.conf 26
-rw-r--r-- health/health.d/disks.conf 4
-rw-r--r-- health/health.d/dnsmasq_dhcp.conf 12
-rw-r--r-- health/health.d/pihole.conf 67
-rw-r--r-- health/health.d/processes.conf 27
-rw-r--r-- health/health.d/ram.conf 2
-rw-r--r-- health/health.d/riakkv.conf 80
-rw-r--r-- health/health.d/wmi.conf 130
-rw-r--r-- health/health.d/x509check.conf 14
9 files changed, 359 insertions, 3 deletions
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
new file mode 100644
index 000000000..7a623ba2b
--- /dev/null
+++ b/health/health.d/dbengine.conf
@@ -0,0 +1,26 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: 10min_dbengine_global_fs_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+ lookup: sum -10m unaligned of FS errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
+ to: sysadmin
+
+ alarm: 10min_dbengine_global_io_errors
+ on: netdata.dbengine_global_errors
+ os: linux freebsd macos
+ hosts: *
+ lookup: sum -10m unaligned of I/O errors
+ units: errors
+ every: 10s
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 3h
+ info: number of IO errors dbengine came across the last 10 minutes (out of space, bad disk etc)
+ to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 26f85848a..9c194ced2 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -13,7 +13,7 @@ template: disk_space_usage
on: disk.space
os: linux freebsd
hosts: *
-families: *
+families: !/dev !/dev/* !/run !/run/* *
calc: $used * 100 / ($avail + $used)
units: %
every: 1m
@@ -27,7 +27,7 @@ template: disk_inode_usage
on: disk.inodes
os: linux freebsd
hosts: *
-families: *
+families: !/dev !/dev/* !/run !/run/* *
calc: $used * 100 / ($avail + $used)
units: %
every: 1m
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
new file mode 100644
index 000000000..b7eb4e0a3
--- /dev/null
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -0,0 +1,12 @@
+ # dhcp-range utilization
+
+ template: dnsmasq_dhcp_dhcp_range_utilization
+ on: dnsmasq_dhcp.dhcp_range_utilization
+ every: 10s
+ units: %
+ calc: $used
+ warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
+ crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+ delay: down 5m
+ info: dhcp-range utilization above threshold!
+ to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
new file mode 100644
index 000000000..4a1217239
--- /dev/null
+++ b/health/health.d/pihole.conf
@@ -0,0 +1,67 @@
+
+ # Make sure Pi-hole is responding.
+
+template: pihole_last_collected_secs
+ on: pihole.dns_queries_total
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+ # Blocked DNS queries.
+
+ template: pihole_blocked_queries
+ on: pihole.dns_queries_percentage
+ every: 10s
+ units: %
+ calc: $blocked
+ warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+ crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+ delay: up 2m down 5m
+ info: percentage of blocked dns queries for the last 24 hour
+ to: sysadmin
+
+
+ # Blocklist last update time.
+ # Default update interval is a week.
+
+ template: pihole_blocklist_last_update
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: seconds
+ calc: $ago
+ warn: $this > 60 * 60 * 24 * 8
+ crit: $this > 60 * 60 * 24 * 8 * 2
+ info: blocklist last update time
+ to: sysadmin
+
+
+ # Gravity file check (gravity.list).
+
+ template: pihole_blocklist_gravity_file
+ on: pihole.blocklist_last_update
+ every: 10s
+ units: boolean
+ calc: $file_exists
+ crit: $this != 1
+ delay: up 2m down 5m
+ info: gravity file existence
+ to: sysadmin
+
+
+ # Pi-hole's ability to block unwanted domains.
+ # Should be enabled. The whole point of Pi-hole!
+
+ template: pihole_status
+ on: pihole.unwanted_domains_blocking_status
+ every: 10s
+ units: boolean
+ calc: $enabled
+ warn: $this != 1
+ delay: up 2m down 5m
+ info: unwanted domains blocking status
+ to: sysadmin
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
new file mode 100644
index 000000000..d96998fdf
--- /dev/null
+++ b/health/health.d/processes.conf
@@ -0,0 +1,27 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ alarm: active_processes_limit_freebsd
+ on: system.active_processes
+ os: freebsd
+ hosts: *
+ calc: $active
+ units: processes
+ every: 5s
+ warn: $this > (($status >= $WARNING) ? (75000) : (80000))
+ crit: $this > (($status == $CRITICAL) ? (85000) : (90000))
+ delay: down 5m multiplier 1.5 max 1h
+ info: the number of active processes
+ to: sysadmin
+
+ alarm: active_processes_limit
+ on: system.active_processes
+ os: linux
+ hosts: *
+ calc: $active
+ units: processes
+ every: 5s
+ warn: $this > (($status >= $WARNING) ? (25000) : (26000))
+ crit: $this > (($status == $CRITICAL) ? (28000) : (30000))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of active processes
+ to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 93883f73b..4e41bb496 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -27,7 +27,7 @@
on: mem.available
os: linux
hosts: *
- calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+ calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
new file mode 100644
index 000000000..745302778
--- /dev/null
+++ b/health/health.d/riakkv.conf
@@ -0,0 +1,80 @@
+# Ensure that Riak is running.
+template: riak_last_collected_secs
+ on: riak.kv.throughput
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: dba
+
+# Warn if a list keys operation is running.
+template: riak_list_keys_active
+ on: riak.core.fsm_active
+ calc: $list_fsm_active
+ units: state machines
+ every: 10s
+ warn: $list_fsm_active > 0
+ info: number of currently running list keys finite state machines
+ to: dba
+
+
+## Timing healthchecks
+# KV GET
+template: 1h_kv_get_mean_latency
+ on: riak.kv.latency.get
+ calc: $node_get_fsm_time_mean
+ lookup: average -1h unaligned of time
+ every: 30s
+ units: ms
+ info: mean average KV GET latency over the last hour
+
+template: riak_kv_get_slow
+ on: riak.kv.latency.get
+ calc: $mean
+ lookup: average -3m unaligned of time
+ units: ms
+ every: 10s
+ warn: ($this > ($1h_kv_get_mean_latency * 2) )
+ crit: ($this > ($1h_kv_get_mean_latency * 3) )
+ info: average KV GET time over the last 3 minutes, compared to the average over the last hour
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+# KV PUT
+template: 1h_kv_put_mean_latency
+ on: riak.kv.latency.put
+ calc: $node_put_fsm_time_mean
+ lookup: average -1h unaligned of time
+ every: 30s
+ units: ms
+ info: mean average KV PUT latency over the last hour
+
+template: riak_kv_put_slow
+ on: riak.kv.latency.put
+ calc: $mean
+ lookup: average -3m unaligned of time
+ units: ms
+ every: 10s
+ warn: ($this > ($1h_kv_put_mean_latency * 2) )
+ crit: ($this > ($1h_kv_put_mean_latency * 3) )
+ info: average KV PUT time over the last 3 minutes, compared to the average over the last hour
+ delay: down 5m multiplier 1.5 max 1h
+ to: dba
+
+
+## VM healthchecks
+
+# Default Erlang VM process limit: 262144
+# On systems observed, the actual process count is < 2000, but may grow depending on load.
+template: riak_vm_high_process_count
+ on: riak.vm
+ calc: $sys_process_count
+ units: processes
+ every: 10s
+ warn: $this > 10000
+ crit: $this > 100000
+ info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144)
+ to: dba
diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf
new file mode 100644
index 000000000..0441fc1f3
--- /dev/null
+++ b/health/health.d/wmi.conf
@@ -0,0 +1,130 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+## Availability
+
+template: wmi_last_collected_secs
+ on: cpu.collector_duration
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: sysadmin
+
+## CPU
+
+template: wmi_10min_cpu_usage
+ on: wmi.cpu_utilization_total
+ os: linux
+ hosts: *
+ lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (75) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ delay: down 15m multiplier 1.5 max 1h
+ info: cpu utilization for the last 10 minutes
+ to: sysadmin
+
+
+## Memory
+
+template: wmi_ram_in_use
+ on: wmi.memory_utilization
+ os: linux
+ hosts: *
+ calc: ($used) * 100 / ($used + $available)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used RAM
+ to: sysadmin
+
+template: wmi_swap_in_use
+ on: wmi.memory_swap_utilization
+ os: linux
+ hosts: *
+ calc: ($used) * 100 / ($used + $available)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used Swap
+ to: sysadmin
+
+
+## Network
+
+template: inbound_packets_discarded
+ on: wmi.net_discarded
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of inbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound discarded packets in the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_discarded
+ on: wmi.net_discarded
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of outbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound discarded packets in the last 10 minutes
+ to: sysadmin
+
+template: inbound_packets_errors
+ on: wmi.net_errors
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of inbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface inbound errors in the last 10 minutes
+ to: sysadmin
+
+template: outbound_packets_errors
+ on: wmi.net_errors
+ os: linux
+ hosts: *
+families: *
+ lookup: sum -10m unaligned absolute match-names of outbound
+ units: packets
+ every: 1m
+ warn: $this >= 5
+ delay: down 1h multiplier 1.5 max 2h
+ info: interface outbound errors in the last 10 minutes
+ to: sysadmin
+
+
+## Disk
+
+template: wmi_disk_in_use
+ on: wmi.logical_disk_utilization
+ os: linux
+ hosts: *
+ calc: ($used) * 100 / ($used + $free)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (80) : (90))
+ crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: used disk space
+ to: sysadmin
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
index dc0e6c695..a56f48fc3 100644
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@@ -1,4 +1,18 @@
+# make sure x509check is running
+
+template: x509check_last_collected_secs
+ on: x509check.time_until_expiration
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 60s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
+
+
template: x509check_days_until_expiration
on: x509check.time_until_expiration
calc: $expiry