9 files changed, 359 insertions, 3 deletions
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
new file mode 100644
index 00000000..7a623ba2
--- /dev/null
+++ b/health/health.d/dbengine.conf
@@ -0,0 +1,26 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+   alarm: 10min_dbengine_global_fs_errors
+      on: netdata.dbengine_global_errors
+      os: linux freebsd macos
+   hosts: *
+  lookup: sum -10m unaligned of FS errors
+   units: errors
+   every: 10s
+    crit: $this > 0
+   delay: down 15m multiplier 1.5 max 1h
+    info: number of File-System errors dbengine came across in the last 10 minutes (too many open files, wrong permissions etc)
+      to: sysadmin
+
+   alarm: 10min_dbengine_global_io_errors
+      on: netdata.dbengine_global_errors
+      os: linux freebsd macos
+   hosts: *
+  lookup: sum -10m unaligned of I/O errors
+   units: errors
+   every: 10s
+    crit: $this > 0
+   delay: down 1h multiplier 1.5 max 3h
+    info: number of I/O errors dbengine came across in the last 10 minutes (out of space, bad disk etc)
+      to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 26f85848..9c194ced 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -13,7 +13,7 @@ template: disk_space_usage
       on: disk.space
       os: linux freebsd
    hosts: *
-families: *
+families: !/dev !/dev/* !/run !/run/* *
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
@@ -27,7 +27,7 @@ template: disk_inode_usage
       on: disk.inodes
       os: linux freebsd
    hosts: *
-families: *
+families: !/dev !/dev/* !/run !/run/* *
     calc: $used * 100 / ($avail + $used)
    units: %
    every: 1m
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
new file mode 100644
index 00000000..b7eb4e0a
--- /dev/null
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -0,0 +1,12 @@
+ # dhcp-range utilization
+
+ template: dnsmasq_dhcp_dhcp_range_utilization
+      on: dnsmasq_dhcp.dhcp_range_utilization
+   every: 10s
+   units: %
+    calc: $used
+    warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
+    crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+   delay: down 5m
+    info: dhcp-range utilization above threshold!
+      to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
new file mode 100644
index 00000000..4a121723
--- /dev/null
+++ b/health/health.d/pihole.conf
@@ -0,0 +1,67 @@
+
+ # Make sure Pi-hole is responding.
+
+template: pihole_last_collected_secs
+      on: pihole.dns_queries_total
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
+  # Blocked DNS queries.
+
+ template: pihole_blocked_queries
+       on: pihole.dns_queries_percentage
+    every: 10s
+    units: %
+     calc: $blocked
+     warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
+     crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+    delay: up 2m down 5m
+     info: percentage of blocked dns queries for the last 24 hours
+       to: sysadmin
+
+
+  # Blocklist last update time.
+  # Default update interval is a week.
+
+ template: pihole_blocklist_last_update
+       on: pihole.blocklist_last_update
+    every: 10s
+    units: seconds
+     calc: $ago
+     warn: $this > 60 * 60 * 24 * 8
+     crit: $this > 60 * 60 * 24 * 8 * 2
+     info: blocklist last update time
+       to: sysadmin
+
+
+  # Gravity file check (gravity.list).
+
+ template: pihole_blocklist_gravity_file
+       on: pihole.blocklist_last_update
+    every: 10s
+    units: boolean
+     calc: $file_exists
+     crit: $this != 1
+    delay: up 2m down 5m
+     info: gravity file existence
+       to: sysadmin
+
+
+  # Pi-hole's ability to block unwanted domains.
+  # Should be enabled. The whole point of Pi-hole!
+
+ template: pihole_status
+       on: pihole.unwanted_domains_blocking_status
+    every: 10s
+    units: boolean
+     calc: $enabled
+     warn: $this != 1
+    delay: up 2m down 5m
+     info: unwanted domains blocking status
+       to: sysadmin
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
new file mode 100644
index 00000000..d96998fd
--- /dev/null
+++ b/health/health.d/processes.conf
@@ -0,0 +1,27 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+   alarm: active_processes_limit_freebsd
+      on: system.active_processes
+      os: freebsd
+   hosts: *
+    calc: $active
+   units: processes
+   every: 5s
+    warn: $this > (($status >= $WARNING)  ? (75000) : (80000))
+    crit: $this > (($status == $CRITICAL) ? (85000) : (90000))
+   delay: down 5m multiplier 1.5 max 1h
+    info: the number of active processes
+      to: sysadmin
+
+   alarm: active_processes_limit
+      on: system.active_processes
+      os: linux
+   hosts: *
+    calc: $active
+   units: processes
+   every: 5s
+    warn: $this > (($status >= $WARNING)  ? (25000) : (26000))
+    crit: $this > (($status == $CRITICAL) ? (28000) : (30000))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of active processes
+      to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 93883f73..4e41bb49 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -27,7 +27,7 @@
       on: mem.available
       os: linux
    hosts: *
-    calc: ($avail + $used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+    calc: ($avail + $system.ram.used_ram_to_ignore) * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
    units: %
    every: 10s
     warn: $this < (($status >= $WARNING)  ? (15) : (10))
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
new file mode 100644
index 00000000..74530277
--- /dev/null
+++ b/health/health.d/riakkv.conf
@@ -0,0 +1,80 @@
+# Ensure that Riak is running.
+template: riak_last_collected_secs
+      on: riak.kv.throughput
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: dba
+
+# Warn if a list keys operation is running.
+template: riak_list_keys_active
+      on: riak.core.fsm_active
+    calc: $list_fsm_active
+   units: state machines
+   every: 10s
+    warn: $list_fsm_active > 0
+    info: number of currently running list keys finite state machines
+      to: dba
+
+
+## Timing healthchecks
+# KV GET
+template: 1h_kv_get_mean_latency
+      on: riak.kv.latency.get
+    calc: $node_get_fsm_time_mean
+  lookup: average -1h unaligned of time
+   every: 30s
+   units: ms
+    info: mean average KV GET latency over the last hour
+
+template: riak_kv_get_slow
+      on: riak.kv.latency.get
+    calc: $mean
+  lookup: average -3m unaligned of time
+   units: ms
+   every: 10s
+    warn: ($this > ($1h_kv_get_mean_latency * 2) )
+    crit: ($this > ($1h_kv_get_mean_latency * 3) )
+    info: average KV GET time over the last 3 minutes, compared to the average over the last hour
+   delay: down 5m multiplier 1.5 max 1h
+      to: dba
+
+# KV PUT
+template: 1h_kv_put_mean_latency
+      on: riak.kv.latency.put
+    calc: $node_put_fsm_time_mean
+  lookup: average -1h unaligned of time
+   every: 30s
+   units: ms
+    info: mean average KV PUT latency over the last hour
+
+template: riak_kv_put_slow
+      on: riak.kv.latency.put
+    calc: $mean
+  lookup: average -3m unaligned of time
+   units: ms
+   every: 10s
+    warn: ($this > ($1h_kv_put_mean_latency * 2) )
+    crit: ($this > ($1h_kv_put_mean_latency * 3) )
+    info: average KV PUT time over the last 3 minutes, compared to the average over the last hour
+   delay: down 5m multiplier 1.5 max 1h
+      to: dba
+
+
+## VM healthchecks
+
+# Default Erlang VM process limit: 262144
+# On systems observed, the actual process count is < 2000, but may grow depending on load.
+template: riak_vm_high_process_count
+      on: riak.vm
+    calc: $sys_process_count
+   units: processes
+   every: 10s
+    warn: $this > 10000
+    crit: $this > 100000
+    info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144)
+      to: dba
diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf
new file mode 100644
index 00000000..0441fc1f
--- /dev/null
+++ b/health/health.d/wmi.conf
@@ -0,0 +1,130 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+## Availability
+
+template: wmi_last_collected_secs
+      on: cpu.collector_duration
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: sysadmin
+
+## CPU
+
+template: wmi_10min_cpu_usage
+      on: wmi.cpu_utilization_total
+      os: linux
+   hosts: *
+  lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
+   units: %
+   every: 1m
+    warn: $this > (($status >= $WARNING)  ? (75) : (85))
+    crit: $this > (($status == $CRITICAL) ? (85) : (95))
+   delay: down 15m multiplier 1.5 max 1h
+    info: cpu utilization for the last 10 minutes
+      to: sysadmin
+
+
+## Memory
+
+template: wmi_ram_in_use
+      on: wmi.memory_utilization
+      os: linux
+   hosts: *
+    calc: ($used) * 100 / ($used + $available)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: used RAM
+      to: sysadmin
+
+template: wmi_swap_in_use
+      on: wmi.memory_swap_utilization
+      os: linux
+   hosts: *
+    calc: ($used) * 100 / ($used + $available)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: used Swap
+      to: sysadmin
+
+
+## Network
+
+template: inbound_packets_discarded
+      on: wmi.net_discarded
+      os: linux
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute match-names of inbound
+   units: packets
+   every: 1m
+    warn: $this >= 5
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface inbound discarded packets in the last 10 minutes
+      to: sysadmin
+
+template: outbound_packets_discarded
+      on: wmi.net_discarded
+      os: linux
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute match-names of outbound
+   units: packets
+   every: 1m
+    warn: $this >= 5
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface outbound discarded packets in the last 10 minutes
+      to: sysadmin
+
+template: inbound_packets_errors
+      on: wmi.net_errors
+      os: linux
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute match-names of inbound
+   units: packets
+   every: 1m
+    warn: $this >= 5
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface inbound errors in the last 10 minutes
+      to: sysadmin
+
+template: outbound_packets_errors
+      on: wmi.net_errors
+      os: linux
+   hosts: *
+families: *
+  lookup: sum -10m unaligned absolute match-names of outbound
+   units: packets
+   every: 1m
+    warn: $this >= 5
+   delay: down 1h multiplier 1.5 max 2h
+    info: interface outbound errors in the last 10 minutes
+      to: sysadmin
+
+
+## Disk
+
+template: wmi_disk_in_use
+      on: wmi.logical_disk_utilization
+      os: linux
+   hosts: *
+    calc: ($used) * 100 / ($used + $free)
+   units: %
+   every: 10s
+    warn: $this > (($status >= $WARNING)  ? (80) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
+   delay: down 15m multiplier 1.5 max 1h
+    info: used disk space
+      to: sysadmin
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
index dc0e6c69..a56f48fc 100644
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@@ -1,4 +1,18 @@
 
+# make sure x509check is running
+
+template: x509check_last_collected_secs
+      on: x509check.time_until_expiration
+    calc: $now - $last_collected_t
+   units: seconds ago
+   every: 60s
+    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+   delay: down 5m multiplier 1.5 max 1h
+    info: number of seconds since the last successful data collection
+      to: webmaster
+
+
 template: x509check_days_until_expiration
       on: x509check.time_until_expiration
    calc:  $expiry