76 files changed, 4710 insertions, 0 deletions
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
new file mode 100644
index 0000000..1d823ad
--- /dev/null
+++ b/health/health.d/adaptec_raid.conf
@@ -0,0 +1,30 @@
+
+# logical device status check
+
+ template: adaptec_raid_ld_status
+       on: adaptec_raid.ld_status
+    class: Errors
+     type: System
+component: RAID
+   lookup: max -10s foreach *
+    units: bool
+    every: 10s
+     crit: $this > 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: logical device status is failed or degraded
+       to: sysadmin
+
+# physical device state check
+
+ template: adaptec_raid_pd_state
+       on: adaptec_raid.pd_state
+    class: Errors
+     type: System
+component: RAID
+   lookup: max -10s foreach *
+    units: bool
+    every: 10s
+     crit: $this > 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: physical device state is not online
+       to: sysadmin
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
new file mode 100644
index 0000000..269ae54
--- /dev/null
+++ b/health/health.d/anomalies.conf
@@ -0,0 +1,23 @@
+# raise a warning alarm if an anomaly probability is consistently above 50%
+
+ template: anomalies_anomaly_probabilities
+       on: anomalies.probability
+    class: Errors
+     type: Netdata
+component: ML
+   lookup: average -2m foreach *
+    every: 1m
+     warn: $this > 50
+     info: average anomaly probability over the last 2 minutes
+
+# raise a warning alarm if an anomaly flag is consistently firing
+
+ template: anomalies_anomaly_flags
+       on: anomalies.anomaly
+    class: Errors
+     type: Netdata
+component: ML
+   lookup: sum -2m foreach *
+    every: 1m
+     warn: $this > 10
+     info: number of anomalies in the last 2 minutes
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
new file mode 100644
index 0000000..65f1a69
--- /dev/null
+++ b/health/health.d/apcupsd.conf
@@ -0,0 +1,49 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: apcupsd_10min_ups_load
+       on: apcupsd.load
+    class: Utilization
+     type: Power Supply
+component: UPS
+       os: *
+    hosts: *
+   lookup: average -10m unaligned of percentage
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 10m multiplier 1.5 max 1h
+     info: average UPS load over the last 10 minutes
+       to: sitemgr
+
+# Discussion in https://github.com/netdata/netdata/pull/3928:
+# Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
+ template: apcupsd_ups_charge
+       on: apcupsd.charge
+    class: Errors
+     type: Power Supply
+component: UPS
+       os: *
+    hosts: *
+   lookup: average -60s unaligned of charge
+    units: %
+    every: 60s
+     warn: $this < 100
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 10m multiplier 1.5 max 1h
+     info: average UPS charge over the last minute
+       to: sitemgr
+
+ template: apcupsd_last_collected_secs
+       on: apcupsd.load
+    class: Latency
+     type: Power Supply
+component: UPS device
+     calc: $now - $last_collected_t
+    every: 10s
+    units: seconds ago
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sitemgr
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
new file mode 100644
index 0000000..49cb5ad
--- /dev/null
+++ b/health/health.d/bcache.conf
@@ -0,0 +1,30 @@
+
+ template: bcache_cache_errors
+       on: disk.bcache_cache_read_races
+    class: Errors
+     type: System
+component: Disk
+   lookup: sum -1m unaligned absolute
+    units: errors
+    every: 1m
+     warn: $this > 0
+    delay: up 2m down 1h multiplier 1.5 max 2h
+     info: number of times data was read from the cache, \
+           the bucket was reused and invalidated in the last minute \
+           (when this occurs the data is reread from the backing device)
+       to: sysadmin
+
+ template: bcache_cache_dirty
+       on: disk.bcache_cache_alloc
+    class: Utilization
+     type: System
+component: Disk
+     calc: $dirty + $metadata + $undefined
+    units: %
+    every: 1m
+     warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
+     crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: percentage of cache space used for dirty data and metadata \
+           (this usually means your SSD cache is too small)
+       to: sysadmin
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
new file mode 100644
index 0000000..13ac8c1
--- /dev/null
+++ b/health/health.d/beanstalkd.conf
@@ -0,0 +1,41 @@
+# get the number of buried jobs in all queues
+
+ template: beanstalk_server_buried_jobs
+       on: beanstalk.current_jobs
+    class: Workload
+     type: Messaging
+component: Beanstalk
+     calc: $buried
+    units: jobs
+    every: 10s
+     warn: $this > 0
+     crit: $this > 10
+    delay: up 0 down 5m multiplier 1.2 max 1h
+     info: number of buried jobs across all tubes. \
+           You need to manually kick them so they can be processed. \
+           Presence of buried jobs in a tube does not affect new jobs.
+       to: sysadmin
+      
+# get the number of buried jobs per queue
+
+#template: beanstalk_tube_buried_jobs
+#      on: beanstalk.jobs
+#    calc: $buried
+#   units: jobs
+#   every: 10s
+#    warn: $this > 0
+#    crit: $this > 10
+#   delay: up 0 down 5m multiplier 1.2 max 1h
+#    info: the number of jobs buried per tube
+#      to: sysadmin
+
+# get the current number of tubes
+
+#template: beanstalk_number_of_tubes
+#      on: beanstalk.current_tubes
+#    calc: $tubes
+#   every: 10s
+#    warn: $this < 5
+#   delay: up 0 down 5m multiplier 1.2 max 1h
+#    info: the current number of tubes on the server
+#      to: sysadmin
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
new file mode 100644
index 0000000..7c09225
--- /dev/null
+++ b/health/health.d/bind_rndc.conf
@@ -0,0 +1,12 @@
+ template: bind_rndc_stats_file_size
+       on: bind_rndc.stats_size
+    class: Utilization
+     type: DNS
+component: BIND
+    units: megabytes
+    every: 60
+     calc: $stats_size
+     warn: $this > 512
+     crit: $this > 1024
+     info: BIND statistics-file size
+       to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
new file mode 100644
index 0000000..7d7a4fd
--- /dev/null
+++ b/health/health.d/boinc.conf
@@ -0,0 +1,74 @@
+# Alarms for various BOINC issues.
+
+# Warn on any compute errors encountered.
+ template: boinc_compute_errors
+       on: boinc.states
+    class: Errors
+     type: Computing
+component: BOINC
+       os: *
+    hosts: *
+ families: *
+   lookup: average -10m unaligned of comperror
+    units: tasks
+    every: 1m
+     warn: $this > 0
+     crit: $this > 1
+    delay: up 1m down 5m multiplier 1.5 max 1h
+     info: average number of compute errors over the last 10 minutes
+       to: sysadmin
+
+# Warn on lots of upload errors
+ template: boinc_upload_errors
+       on: boinc.states
+    class: Errors
+     type: Computing
+component: BOINC
+       os: *
+    hosts: *
+ families: *
+   lookup: average -10m unaligned of upload_failed
+    units: tasks
+    every: 1m
+     warn: $this > 0
+     crit: $this > 1
+    delay: up 1m down 5m multiplier 1.5 max 1h
+     info: average number of failed uploads over the last 10 minutes
+       to: sysadmin
+
+# Warn on the task queue being empty
+ template: boinc_total_tasks
+       on: boinc.tasks
+    class: Utilization
+     type: Computing
+component: BOINC
+       os: *
+    hosts: *
+ families: *
+   lookup: average -10m unaligned of total
+    units: tasks
+    every: 1m
+     warn: $this < 1
+     crit: $this < 0.1
+    delay: up 5m down 10m multiplier 1.5 max 1h
+     info: average number of total tasks over the last 10 minutes
+       to: sysadmin
+
+# Warn on no active tasks with a non-empty queue
+ template: boinc_active_tasks
+       on: boinc.tasks
+    class: Utilization
+     type: Computing
+component: BOINC
+       os: *
+    hosts: *
+ families: *
+   lookup: average -10m unaligned of active
+     calc: ($boinc_total_tasks >= 1) ? ($this) : (inf)
+    units: tasks
+    every: 1m
+     warn: $this < 1
+     crit: $this < 0.1
+    delay: up 5m down 10m multiplier 1.5 max 1h
+     info: average number of active tasks over the last 10 minutes
+       to: sysadmin
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
new file mode 100644
index 0000000..8d197aa
--- /dev/null
+++ b/health/health.d/btrfs.conf
@@ -0,0 +1,68 @@
+
+ template: btrfs_allocated
+       on: btrfs.disk
+    class: Utilization
+     type: System
+component: File system
+       os: *
+    hosts: *
+ families: *
+     calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (90) : (95))
+     crit: $this > (($status == $CRITICAL) ? (95) : (98))
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: percentage of allocated BTRFS physical disk space
+       to: sysadmin
+
+ template: btrfs_data
+       on: btrfs.data
+    class: Utilization
+     type: System
+component: File system
+       os: *
+    hosts: *
+ families: *
+     calc: $used * 100 / ($used + $free)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
+     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: utilization of BTRFS data space
+       to: sysadmin
+
+ template: btrfs_metadata
+       on: btrfs.metadata
+    class: Utilization
+     type: System
+component: File system
+       os: *
+    hosts: *
+ families: *
+     calc: ($used + $reserved) * 100 / ($used + $free + $reserved)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
+     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: utilization of BTRFS metadata space
+       to: sysadmin
+
+ template: btrfs_system
+       on: btrfs.system
+    class: Utilization
+     type: System
+component: File system
+       os: *
+    hosts: *
+ families: *
+     calc: $used * 100 / ($used + $free)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
+     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: utilization of BTRFS system space
+       to: sysadmin
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
new file mode 100644
index 0000000..1f9da25
--- /dev/null
+++ b/health/health.d/ceph.conf
@@ -0,0 +1,15 @@
+# low ceph disk available
+
+ template: ceph_cluster_space_usage
+       on: ceph.general_usage
+    class: Utilization
+     type: Storage
+component: Ceph
+     calc: $used * 100 / ($used + $avail)
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING ) ? (85) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 5m multiplier 1.2 max 1h
+     info: cluster disk space utilization
+       to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
new file mode 100644
index 0000000..4bfe38b
--- /dev/null
+++ b/health/health.d/cgroups.conf
@@ -0,0 +1,141 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: cgroup_10min_cpu_usage
+       on: cgroup.cpu_limit
+    class: Utilization
+     type: Cgroups
+component: CPU
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average cgroup CPU utilization over the last 10 minutes
+       to: sysadmin
+
+ template: cgroup_ram_in_use
+       on: cgroup.mem_usage
+    class: Utilization
+     type: Cgroups
+component: Memory
+       os: linux
+    hosts: *
+     calc: ($ram) * 100 / $memory_limit
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: cgroup memory utilization
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: cgroup_1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is 50x (warning) or 60x (critical) the former
+# the 1m baseline is floored at 1000 packets/s, so a packet storm needs
+# at least 50000 packets/s, average of the last 10 seconds
+
+ template: cgroup_1m_received_packets_rate
+       on: cgroup.net_packets
+    class: Workload
+     type: Cgroups
+component: Network
+    hosts: *
+   lookup: average -1m unaligned of received
+    units: packets
+    every: 10s
+     info: average number of packets received by the network interface $family over the last minute
+
+ template: cgroup_10s_received_packets_storm
+       on: cgroup.net_packets
+    class: Workload
+     type: Cgroups
+component: Network
+    hosts: *
+   lookup: average -10s unaligned of received
+     calc: $this * 100 / (($cgroup_1m_received_packets_rate < 1000)?(1000):($cgroup_1m_received_packets_rate))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(200):(5000))
+     crit: $this > (($status == $CRITICAL)?(5000):(6000))
+  options: no-clear-notification
+     info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+           compared to the rate over the last minute
+       to: sysadmin
+
+# ---------------------------------K8s containers--------------------------------------------
+
+ template: k8s_cgroup_10min_cpu_usage
+       on: k8s.cgroup.cpu_limit
+    class: Utilization
+     type: Cgroups
+component: CPU
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average cgroup CPU utilization over the last 10 minutes
+       to: sysadmin
+
+ template: k8s_cgroup_ram_in_use
+       on: k8s.cgroup.mem_usage
+    class: Utilization
+     type: Cgroups
+component: Memory
+       os: linux
+    hosts: *
+     calc: ($ram) * 100 / $memory_limit
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: cgroup memory utilization
+       to: sysadmin
+
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: k8s_cgroup_1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is 50x (warning) or 60x (critical) the former
+# the 1m baseline is floored at 1000 packets/s, so a packet storm needs
+# at least 50000 packets/s, average of the last 10 seconds
+
+ template: k8s_cgroup_1m_received_packets_rate
+       on: k8s.cgroup.net_packets
+    class: Workload
+     type: Cgroups
+component: Network
+    hosts: *
+   lookup: average -1m unaligned of received
+    units: packets
+    every: 10s
+     info: average number of packets received by the network interface $family over the last minute
+
+ template: k8s_cgroup_10s_received_packets_storm
+       on: k8s.cgroup.net_packets
+    class: Workload
+     type: Cgroups
+component: Network
+    hosts: *
+   lookup: average -10s unaligned of received
+     calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(200):(5000))
+     crit: $this > (($status == $CRITICAL)?(5000):(6000))
+  options: no-clear-notification
+     info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+           compared to the rate over the last minute
+       to: sysadmin
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
new file mode 100644
index 0000000..1f22784
--- /dev/null
+++ b/health/health.d/cockroachdb.conf
@@ -0,0 +1,73 @@
+
+# Capacity
+
+ template: cockroachdb_used_storage_capacity
+       on: cockroachdb.storage_used_capacity_percentage
+    class: Utilization
+     type: Database
+component: CockroachDB
+     calc: $capacity_used_percent
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: storage capacity utilization
+       to: dba
+
+ template: cockroachdb_used_usable_storage_capacity
+       on: cockroachdb.storage_used_capacity_percentage
+    class: Utilization
+     type: Database
+component: CockroachDB
+     calc: $capacity_usable_used_percent
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: storage usable space utilization
+       to: dba
+
+# Replication
+
+ template: cockroachdb_unavailable_ranges
+       on: cockroachdb.ranges_replication_problem
+    class: Errors
+     type: Database
+component: CockroachDB
+     calc: $ranges_unavailable
+    units: num
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of ranges with fewer live replicas than needed for quorum
+       to: dba
+
+ template: cockroachdb_underreplicated_ranges
+       on: cockroachdb.ranges_replication_problem
+    class: Errors
+     type: Database
+component: CockroachDB
+     calc: $ranges_underreplicated
+    units: num
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of ranges with fewer live replicas than the replication target
+       to: dba
+
+# FD
+
+ template: cockroachdb_open_file_descriptors_limit
+       on: cockroachdb.process_file_descriptors
+    class: Utilization
+     type: Database
+component: CockroachDB
+     calc: $sys_fd_open/$sys_fd_softlimit * 100
+    units: %
+    every: 10s
+     warn: $this > 80
+    delay: down 15m multiplier 1.5 max 1h
+     info: open file descriptors utilization (against softlimit)
+       to: dba
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
new file mode 100644
index 0000000..ad69528
--- /dev/null
+++ b/health/health.d/cpu.conf
@@ -0,0 +1,67 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: 10min_cpu_usage
+       on: system.cpu
+    class: Utilization
+     type: System
+component: CPU
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned of user,system,softirq,irq,guest
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
+       to: sysadmin
+
+ template: 10min_cpu_iowait
+       on: system.cpu
+    class: Utilization
+     type: System
+component: CPU
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned of iowait
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (20) : (40))
+     crit: $this > (($status == $CRITICAL) ? (40) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU iowait time over the last 10 minutes
+       to: sysadmin
+
+ template: 20min_steal_cpu
+       on: system.cpu
+    class: Latency
+     type: System
+component: CPU
+       os: linux
+    hosts: *
+   lookup: average -20m unaligned of steal
+    units: %
+    every: 5m
+     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+     crit: $this > (($status == $CRITICAL) ? (20) : (30))
+    delay: down 1h multiplier 1.5 max 2h
+     info: average CPU steal time over the last 20 minutes
+       to: sysadmin
+
+## FreeBSD
+ template: 10min_cpu_usage
+       on: system.cpu
+    class: Utilization
+     type: System
+component: CPU
+       os: freebsd
+    hosts: *
+   lookup: average -10m unaligned of user,system,interrupt
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU utilization over the last 10 minutes (excluding nice)
+       to: sysadmin
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
new file mode 100644
index 0000000..65c41b8
--- /dev/null
+++ b/health/health.d/dbengine.conf
@@ -0,0 +1,64 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+    alarm: 10min_dbengine_global_fs_errors
+       on: netdata.dbengine_global_errors
+    class: Errors
+     type: Netdata
+component: DB engine
+       os: linux freebsd macos
+    hosts: *
+   lookup: sum -10m unaligned of fs_errors
+    units: errors
+    every: 10s
+     crit: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc)
+       to: sysadmin
+
+    alarm: 10min_dbengine_global_io_errors
+       on: netdata.dbengine_global_errors
+    class: Errors
+     type: Netdata
+component: DB engine
+       os: linux freebsd macos
+    hosts: *
+   lookup: sum -10m unaligned of io_errors
+    units: errors
+    every: 10s
+     crit: $this > 0
+    delay: down 1h multiplier 1.5 max 3h
+     info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc)
+       to: sysadmin
+
+    alarm: 10min_dbengine_global_flushing_warnings
+       on: netdata.dbengine_global_errors
+    class: Errors
+     type: Netdata
+component: DB engine
+       os: linux freebsd macos
+    hosts: *
+   lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
+    units: errors
+    every: 10s
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 3h
+     info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
+           Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
+       to: sysadmin
+
+    alarm: 10min_dbengine_global_flushing_errors
+       on: netdata.dbengine_long_term_page_stats
+    class: Errors
+     type: Netdata
+component: DB engine
+       os: linux freebsd macos
+    hosts: *
+   lookup: sum -10m unaligned of flushing_pressure_deletions
+    units: pages
+    every: 10s
+     crit: $this != 0
+    delay: down 1h multiplier 1.5 max 3h
+     info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
+           Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
+       to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
new file mode 100644
index 0000000..5daff61
--- /dev/null
+++ b/health/health.d/disks.conf
@@ -0,0 +1,173 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+
+# -----------------------------------------------------------------------------
+# low disk space
+
+# checking the latest collected values
+# raise an alarm if the disk is low on
+# available disk space
+
+ template: disk_space_usage
+       on: disk.space
+    class: Utilization
+     type: System
+component: Disk
+       os: linux freebsd
+    hosts: *
+ families: !/dev !/dev/* !/run !/run/* *
+     calc: $used * 100 / ($avail + $used)
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING ) ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: disk $family space utilization
+       to: sysadmin
+
+ template: disk_inode_usage
+       on: disk.inodes
+    class: Utilization
+     type: System
+component: Disk
+       os: linux freebsd
+    hosts: *
+ families: !/dev !/dev/* !/run !/run/* *
+     calc: $used * 100 / ($avail + $used)
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: up 1m down 15m multiplier 1.5 max 1h
+     info: disk $family inode utilization
+       to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk fill rate
+
+# calculate the rate the disk fills
+# use as base, the available space change
+# during the last hour
+
+# this is just a calculation - it has no alarm
+# we will use it in the next template to find
+# the hours remaining
+
+# template: disk_fill_rate
+#       on: disk.space
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#   lookup: min -10m at -50m unaligned of avail
+#     calc: ($this - $avail) / (($now - $after) / 3600)
+#    every: 1m
+#    units: GB/hour
+#     info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
+
+
+# calculate the hours remaining
+# if the disk continues to fill
+# in this rate
+
+# template: out_of_disk_space_time
+#       on: disk.space
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#     calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+#    units: hours
+#    every: 10s
+#     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+#     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+#    delay: down 15m multiplier 1.2 max 1h
+#     info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+#       to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk inode fill rate
+
+# calculate the rate the disk inodes are allocated
+# use as base, the available inodes change
+# during the last hour
+
+# this is just a calculation - it has no alarm
+# we will use it in the next template to find
+# the hours remaining
+
+# template: disk_inode_rate
+#       on: disk.inodes
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#   lookup: min -10m at -50m unaligned of avail
+#     calc: ($this - $avail) / (($now - $after) / 3600)
+#    every: 1m
+#    units: inodes/hour
+#     info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+
+# calculate the hours remaining
+# if the disk inodes are allocated
+# in this rate
+
+# template: out_of_disk_inodes_time
+#       on: disk.inodes
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#     calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+#    units: hours
+#    every: 10s
+#     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+#     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+#    delay: down 15m multiplier 1.2 max 1h
+#     info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+#       to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# disk congestion
+
+# raise an alarm if the disk is congested
+# by calculating the average disk utilization
+# for the last 10 minutes
+
+ template: 10min_disk_utilization
+       on: disk.util
+    class: Utilization
+     type: System
+component: Disk
+       os: linux freebsd
+    hosts: *
+ families: *
+   lookup: average -10m unaligned
+    units: %
+    every: 1m
+     warn: $this > 98 * (($status >= $WARNING)  ? (0.7) : (1))
+    delay: down 15m multiplier 1.2 max 1h
+     info: average percentage of time $family disk was busy over the last 10 minutes
+       to: silent
+
+
+# raise an alarm if the disk backlog
+# is above 5000ms (5s) per second
+# for 10 minutes
+# (i.e. the disk cannot catch up)
+
+ template: 10min_disk_backlog
+       on: disk.backlog
+    class: Latency
+     type: System
+component: Disk
+       os: linux
+    hosts: *
+ families: *
+   lookup: average -10m unaligned
+    units: ms
+    every: 1m
+     warn: $this > 5000 * (($status >= $WARNING)  ? (0.7) : (1))
+    delay: down 15m multiplier 1.2 max 1h
+     info: average backlog size of the $family disk over the last 10 minutes
+       to: silent
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
new file mode 100644
index 0000000..b9d6c23
--- /dev/null
+++ b/health/health.d/dns_query.conf
@@ -0,0 +1,14 @@
+# detect dns query failure
+
+ template: dns_query_query_status
+       on: dns_query.query_status
+    class: Errors
+     type: DNS
+component: DNS
+     calc: $success
+    units: status
+    every: 10s
+     warn: $this != nan && $this != 1
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: DNS request type $label:record_type to server $label:server is unsuccessful
+       to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
new file mode 100644
index 0000000..010b945
--- /dev/null
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -0,0 +1,15 @@
+# dhcp-range utilization
+
+ template: dnsmasq_dhcp_dhcp_range_utilization
+       on: dnsmasq_dhcp.dhcp_range_utilization
+    class: Utilization
+     type: DHCP
+component: Dnsmasq
+    every: 10s
+    units: %
+     calc: $used
+     warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
+     crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
+    delay: down 5m
+     info: DHCP range utilization
+       to: sysadmin
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
new file mode 100644
index 0000000..220ddd6
--- /dev/null
+++ b/health/health.d/dockerd.conf
@@ -0,0 +1,11 @@
+ template: docker_unhealthy_containers
+       on: docker.unhealthy_containers
+    class: Errors
+     type: Containers
+component: Docker
+    units: unhealthy containers
+    every: 10s
+   lookup: average -10s
+     crit: $this > 0
+     info: average number of unhealthy docker containers over the last 10 seconds
+       to: sysadmin
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
new file mode 100644
index 0000000..13b0fcd
--- /dev/null
+++ b/health/health.d/entropy.conf
@@ -0,0 +1,19 @@
+
+# check if entropy is too low
+# the alarm is checked every 5 minutes
+# and examines the last 5 minutes of data
+
+    alarm: lowest_entropy
+       on: system.entropy
+    class: Utilization
+     type: System
+component: Cryptography
+       os: linux
+    hosts: *
+   lookup: min -5m unaligned
+    units: entries
+    every: 5m
+     warn: $this < (($status >= $WARNING) ? (200) : (100))
+    delay: down 1h multiplier 1.5 max 2h
+     info: minimum number of entries in the random numbers pool in the last 5 minutes
+       to: silent
diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf
new file mode 100644
index 0000000..06f398c
--- /dev/null
+++ b/health/health.d/exporting.conf
@@ -0,0 +1,29 @@
+
+ template: exporting_last_buffering
+ families: *
+       on: exporting_data_size
+    class: Latency
+     type: Netdata
+component: Exporting engine
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful buffering of exporting data
+       to: dba
+
+ template: exporting_metrics_sent
+ families: *
+       on: exporting_data_size
+    class: Workload
+     type: Netdata
+component: Exporting engine
+    units: %
+     calc: abs($sent) * 100 / abs($buffered)
+    every: 10s
+     warn: $this != 100
+    delay: down 5m multiplier 1.5 max 1h
+     info: percentage of metrics sent to the external database server
+       to: dba
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
new file mode 100644
index 0000000..bb22419
--- /dev/null
+++ b/health/health.d/fping.conf
@@ -0,0 +1,64 @@
+
+ template: fping_last_collected_secs
+ families: *
+       on: fping.latency
+    class: Latency
+     type: Other
+component: Network
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sysadmin
+
+ template: fping_host_reachable
+ families: *
+       on: fping.latency
+    class: Errors
+     type: Other
+component: Network
+     calc: $average != nan
+    units: up/down
+    every: 10s
+     crit: $this == 0
+    delay: down 30m multiplier 1.5 max 2h
+     info: reachability status of the network host (0: unreachable, 1: reachable)
+       to: sysadmin
+
+ template: fping_host_latency
+ families: *
+       on: fping.latency
+    class: Latency
+     type: Other
+component: Network
+   lookup: average -10s unaligned of average
+    units: ms
+    every: 10s
+    green: 500
+      red: 1000
+     warn: $this > $green OR $max > $red
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: average latency to the network host over the last 10 seconds
+       to: sysadmin
+
+ template: fping_packet_loss
+ families: *
+       on: fping.quality
+    class: Errors
+     type: System
+component: Network
+   lookup: average -10m unaligned of returned
+     calc: 100 - $this
+    green: 1
+      red: 10
+    units: %
+    every: 10s
+     warn: $this > $green
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: packet loss ratio to the network host over the last 10 minutes
+       to: sysadmin
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
new file mode 100644
index 0000000..14010d4
--- /dev/null
+++ b/health/health.d/gearman.conf
@@ -0,0 +1,14 @@
+
+ template: gearman_workers_queued
+       on: gearman.single_job
+    class: Latency
+     type: Computing
+component: Gearman
+   lookup: average -10m unaligned match-names of Pending
+    units: workers
+    every: 10s
+     warn: $this > 30000
+     crit: $this > 100000
+    delay: down 5m multiplier 1.5 max 1h
+     info: average number of queued jobs over the last 10 minutes
+       to: sysadmin
diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf
new file mode 100644
index 0000000..dd1eb47
--- /dev/null
+++ b/health/health.d/geth.conf
@@ -0,0 +1,12 @@
+# chainhead_header is expected to be momentarily ahead. If it's considerably ahead (e.g. more than 5 blocks), then the node is definitely out of sync.
+ template: geth_chainhead_diff_between_header_block
+       on: geth.chainhead
+    class: Workload
+     type: ethereum_node
+component: geth
+    every: 10s
+     calc: $chain_head_header - $chain_head_block
+    units: blocks
+     warn: $this != 0
+     crit: $this > 5
+    delay: down 1m multiplier 1.5 max 1h
diff --git a/health/health.d/go.d.plugin.conf b/health/health.d/go.d.plugin.conf
new file mode 100644
index 0000000..cd87fe0
--- /dev/null
+++ b/health/health.d/go.d.plugin.conf
@@ -0,0 +1,17 @@
+
+# make sure go.d.plugin data collection job is running
+
+ template: go.d_job_last_collected_secs
+       on: netdata.go_plugin_execution_time
+    class: Errors
+     type: Netdata
+component: go.d.plugin
+   module: !* *
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster
diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf
new file mode 100644
index 0000000..a0ab52b
--- /dev/null
+++ b/health/health.d/haproxy.conf
@@ -0,0 +1,23 @@
+ template: haproxy_backend_server_status
+       on: haproxy_hs.down
+    class: Errors
+     type: Web Proxy
+component: HAProxy
+    units: failed servers
+    every: 10s
+   lookup: average -10s
+     crit: $this > 0
+     info: average number of failed haproxy backend servers over the last 10 seconds
+       to: sysadmin
+
+ template: haproxy_backend_status
+       on: haproxy_hb.down
+    class: Errors
+     type: Web Proxy
+component: HAProxy
+    units: failed backend
+    every: 10s
+   lookup: average -10s
+     crit: $this > 0
+     info: average number of failed haproxy backends over the last 10 seconds
+       to: sysadmin
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
new file mode 100644
index 0000000..ca8df31
--- /dev/null
+++ b/health/health.d/hdfs.conf
@@ -0,0 +1,76 @@
+
+# Common
+
+ template: hdfs_capacity_usage
+       on: hdfs.capacity
+    class: Utilization
+     type: Storage
+component: HDFS
+     calc: ($used) * 100 / ($used + $remaining)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (80) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: summary datanodes space capacity utilization
+       to: sysadmin
+
+
+# NameNode
+
+ template: hdfs_missing_blocks
+       on: hdfs.blocks
+    class: Errors
+     type: Storage
+component: HDFS
+     calc: $missing
+    units: missing blocks
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of missing blocks
+       to: sysadmin
+
+
+ template: hdfs_stale_nodes
+       on: hdfs.data_nodes
+    class: Errors
+     type: Storage
+component: HDFS
+     calc: $stale
+    units: stale nodes
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of datanodes marked stale due to delayed heartbeat
+       to: sysadmin
+
+
+ template: hdfs_dead_nodes
+       on: hdfs.data_nodes
+    class: Errors
+     type: Storage
+component: HDFS
+     calc: $dead
+    units: dead nodes
+    every: 10s
+     crit: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of datanodes which are currently dead
+       to: sysadmin
+
+
+# DataNode
+
+ template: hdfs_num_failed_volumes
+       on: hdfs.num_failed_volumes
+    class: Errors
+     type: Storage
+component: HDFS
+     calc: $fsds_num_failed_volumes
+    units: failed volumes
+    every: 10s
+     warn: $this > 0
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of failed volumes
+       to: sysadmin
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
new file mode 100644
index 0000000..599c47a
--- /dev/null
+++ b/health/health.d/httpcheck.conf
@@ -0,0 +1,112 @@
+
+# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
+ template: httpcheck_web_service_up
+ families: *
+       on: httpcheck.status
+    class: Utilization
+     type: Web Server
+component: HTTP endpoint
+   lookup: average -1m unaligned percentage of success
+     calc: ($this < 75) ? (0) : ($this)
+    every: 5s
+    units: up/down
+     info: average ratio of successful HTTP requests over the last minute (at least 75%)
+       to: silent
+
+ template: httpcheck_web_service_bad_content
+ families: *
+       on: httpcheck.status
+    class: Workload
+     type: Web Server
+component: HTTP endpoint
+   lookup: average -5m unaligned percentage of bad_content
+    every: 10s
+    units: %
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: average ratio of HTTP responses with unexpected content over the last 5 minutes
+  options: no-clear-notification
+       to: webmaster
+
+ template: httpcheck_web_service_bad_status
+ families: *
+       on: httpcheck.status
+    class: Workload
+     type: Web Server
+component: HTTP endpoint
+   lookup: average -5m unaligned percentage of bad_status
+    every: 10s
+    units: %
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: average ratio of HTTP responses with unexpected status over the last 5 minutes
+  options: no-clear-notification
+       to: webmaster
+
+ template: httpcheck_web_service_timeouts
+ families: *
+       on: httpcheck.status
+    class: Latency
+     type: Web Server
+component: HTTP endpoint
+   lookup: average -5m unaligned percentage of timeout
+    every: 10s
+    units: %
+     info: average ratio of HTTP request timeouts over the last 5 minutes
+
+ template: httpcheck_no_web_service_connections
+ families: *
+       on: httpcheck.status
+    class: Errors
+     type: Other
+component: HTTP endpoint
+   lookup: average -5m unaligned percentage of no_connection
+    every: 10s
+    units: %
+     info: average ratio of failed requests during the last 5 minutes
+
+# combined timeout & no connection alarm
+ template: httpcheck_web_service_unreachable
+ families: *
+       on: httpcheck.status
+    class: Errors
+     type: Web Server
+component: HTTP endpoint
+     calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
+    units: %
+    every: 10s
+     warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
+     crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes
+  options: no-clear-notification
+       to: webmaster
+
+ template: httpcheck_1h_web_service_response_time
+ families: *
+       on: httpcheck.responsetime
+    class: Latency
+     type: Other
+component: HTTP endpoint
+   lookup: average -1h unaligned of time
+    every: 30s
+    units: ms
+     info: average HTTP response time over the last hour
+
+ template: httpcheck_web_service_slow
+ families: *
+       on: httpcheck.responsetime
+    class: Latency
+     type: Web Server
+component: HTTP endpoint
+   lookup: average -3m unaligned of time
+    units: ms
+    every: 10s
+     warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
+     crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
+    delay: down 5m multiplier 1.5 max 1h
+     info: average HTTP response time over the last 3 minutes, compared to the average over the last hour
+  options: no-clear-notification
+       to: webmaster
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
new file mode 100644
index 0000000..8b498ad
--- /dev/null
+++ b/health/health.d/ioping.conf
@@ -0,0 +1,16 @@
+ template: ioping_disk_latency
+ families: *
+       on: ioping.latency
+    class: Latency
+     type: System
+component: Disk
+   lookup: average -10s unaligned of latency
+    units: microseconds
+    every: 10s
+    green: 5000
+      red: 10000
+     warn: $this > $green
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: average I/O latency over the last 10 seconds
+       to: sysadmin
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
new file mode 100644
index 0000000..c178a41
--- /dev/null
+++ b/health/health.d/ipc.conf
@@ -0,0 +1,34 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+    alarm: semaphores_used
+       on: system.ipc_semaphores
+    class: Utilization
+     type: System
+component: IPC
+       os: linux
+    hosts: *
+     calc: $semaphores * 100 / $ipc_semaphores_max
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (70) : (90))
+    delay: down 5m multiplier 1.5 max 1h
+     info: IPC semaphore utilization
+       to: sysadmin
+
+    alarm: semaphore_arrays_used
+       on: system.ipc_semaphore_arrays
+    class: Utilization
+     type: System
+component: IPC
+       os: linux
+    hosts: *
+     calc: $arrays * 100 / $ipc_semaphores_arrays_max
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (70) : (90))
+    delay: down 5m multiplier 1.5 max 1h
+     info: IPC semaphore arrays utilization
+       to: sysadmin
diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf
new file mode 100644
index 0000000..a514ddf
--- /dev/null
+++ b/health/health.d/ipfs.conf
@@ -0,0 +1,14 @@
+
+ template: ipfs_datastore_usage
+       on: ipfs.repo_size
+    class: Utilization
+     type: Data Sharing
+component: IPFS
+     calc: $size * 100 / $avail
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: IPFS datastore utilization
+       to: sysadmin
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
new file mode 100644
index 0000000..feadba1
--- /dev/null
+++ b/health/health.d/ipmi.conf
@@ -0,0 +1,26 @@
+    alarm: ipmi_sensors_states
+       on: ipmi.sensors_states
+    class: Errors
+     type: System
+component: IPMI
+     calc: $warning + $critical
+    units: sensors
+    every: 10s
+     warn: $this > 0
+     crit: $critical > 0
+    delay: up 5m down 15m multiplier 1.5 max 1h
+     info: number of IPMI sensors in non-nominal state
+       to: sysadmin
+
+    alarm: ipmi_events
+       on: ipmi.events
+    class: Utilization
+     type: System
+component: IPMI
+     calc: $events
+    units: events
+    every: 10s
+     warn: $this > 0
+    delay: up 5m down 15m multiplier 1.5 max 1h
+     info: number of events in the IPMI System Event Log (SEL)
+       to: sysadmin
diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf
new file mode 100644
index 0000000..d1f9396
--- /dev/null
+++ b/health/health.d/isc_dhcpd.conf
@@ -0,0 +1,10 @@
+# template: isc_dhcpd_leases_size
+#      on: isc_dhcpd.leases_total
+#   units: KB
+#   every: 60
+#    calc: $leases_size
+#    warn: $this > 3072
+#    crit: $this > 6144
+#   delay: up 2m down 5m
+#    info: dhcpd.leases file too big! Module can slow down your server.
+#      to: sysadmin
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf
new file mode 100644
index 0000000..c2778cc
--- /dev/null
+++ b/health/health.d/kubelet.conf
@@ -0,0 +1,145 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+
+# True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
+
+ template: kubelet_node_config_error
+       on: k8s_kubelet.kubelet_node_config_error
+    class: Errors
+     type: Kubernetes
+component: Kubelet
+     calc: $kubelet_node_config_error
+    units: bool
+    every: 10s
+     warn: $this == 1
+    delay: down 1m multiplier 1.5 max 2h
+     info: the node is experiencing a configuration-related error (0: false, 1: true)
+       to: sysadmin
+
+# Failed Token() requests to the alternate token source
+
+ template: kubelet_token_requests
+   lookup: sum -10s of token_fail_count
+       on: k8s_kubelet.kubelet_token_requests
+    class: Errors
+     type: Kubernetes
+component: Kubelet
+    units: failed requests
+    every: 10s
+     warn: $this > 0
+    delay: down 1m multiplier 1.5 max 2h
+     info: number of failed Token() requests to the alternate token source
+       to: sysadmin
+
+# Docker and runtime operation errors
+
+ template: kubelet_operations_error
+   lookup: sum -1m
+       on: k8s_kubelet.kubelet_operations_errors
+    class: Errors
+     type: Kubernetes
+component: Kubelet
+    units: errors
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (0) : (20))
+    delay: up 30s down 1m multiplier 1.5 max 2h
+     info: number of Docker or runtime operation errors
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
+
+# Pod Lifecycle Event Generator Relisting Latency
+
+# 1. calculate the pleg relisting latency for 1m (quantile 0.5, quantile 0.9, quantile 0.99)
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is:
+# - 2x the first for quantile 0.5
+# - 4x the first for quantile 0.9
+# - 8x the first for quantile 0.99
+#
+# we assume the minimum latency is 1000 microseconds
+
+# quantile 0.5
+
+ template: kubelet_1m_pleg_relist_latency_quantile_05
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Latency
+     type: Kubernetes
+component: Kubelet
+   lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
+    units: microseconds
+    every: 10s
+     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
+
+ template: kubelet_10s_pleg_relist_latency_quantile_05
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Latency
+     type: Kubernetes
+component: Kubelet
+   lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
+     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(100):(200))
+     crit: $this > (($status == $CRITICAL)?(200):(400))
+    delay: down 1m multiplier 1.5 max 2h
+     info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+           compared to the last minute (quantile 0.5)
+       to: sysadmin
+
+# quantile 0.9
+
+ template: kubelet_1m_pleg_relist_latency_quantile_09
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Latency
+     type: Kubernetes
+component: Kubelet
+   lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
+    units: microseconds
+    every: 10s
+     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
+
+ template: kubelet_10s_pleg_relist_latency_quantile_09
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Latency
+     type: Kubernetes
+component: Kubelet
+   lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
+     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(200):(400))
+     crit: $this > (($status == $CRITICAL)?(400):(800))
+    delay: down 1m multiplier 1.5 max 2h
+     info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+           compared to the last minute (quantile 0.9)
+       to: sysadmin
+
+# quantile 0.99
+
+ template: kubelet_1m_pleg_relist_latency_quantile_099
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Latency
+     type: Kubernetes
+component: Kubelet
+   lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
+    units: microseconds
+    every: 10s
+     info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
+
+ template: kubelet_10s_pleg_relist_latency_quantile_099
+       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
+    class: Latency
+     type: Kubernetes
+component: Kubelet
+   lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
+     calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(400):(800))
+     crit: $this > (($status == $CRITICAL)?(800):(1200))
+    delay: down 1m multiplier 1.5 max 2h
+     info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+           compared to the last minute (quantile 0.99)
+       to: sysadmin
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
new file mode 100644
index 0000000..c0bc6de
--- /dev/null
+++ b/health/health.d/linux_power_supply.conf
@@ -0,0 +1,15 @@
+# Alert on low battery capacity.
+
+ template: linux_power_supply_capacity
+       on: powersupply.capacity
+    class: Utilization
+     type: Power Supply
+component: Battery
+     calc: $capacity
+    units: %
+    every: 10s
+     warn: $this < 10
+     crit: $this < 5
+    delay: up 30s down 5m multiplier 1.2 max 1h
+     info: percentage of remaining power supply capacity
+       to: sysadmin
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
new file mode 100644
index 0000000..0bd872f
--- /dev/null
+++ b/health/health.d/load.conf
@@ -0,0 +1,66 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# Calculate the base trigger point for the load average alarms.
+# This is the number of active CPUs, re-evaluated every minute; the trigger
+# falls back to 2 when the count is unavailable (nan/inf) or below 2.
+    alarm: load_cpu_number
+       on: system.load
+    class: Utilization
+     type: System
+component: Load
+       os: linux
+    hosts: *
+     calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
+    units: cpus
+    every: 1m
+     info: number of active CPU cores in the system
+
+# Send alarms if the load average is unusually high.
+# These intentionally _do not_ calculate the average over the sampled
+# time period because the values being checked already are averages.
+
+    alarm: load_average_15
+       on: system.load
+    class: Utilization
+     type: System
+component: Load
+       os: linux
+    hosts: *
+   lookup: max -1m unaligned of load15
+    units: load
+    every: 1m
+     warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
+    delay: down 15m multiplier 1.5 max 1h
+     info: system fifteen-minute load average
+       to: sysadmin
+
+    alarm: load_average_5
+       on: system.load
+    class: Utilization
+     type: System
+component: Load
+       os: linux
+    hosts: *
+   lookup: max -1m unaligned of load5
+    units: load
+    every: 1m
+     warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
+    delay: down 15m multiplier 1.5 max 1h
+     info: system five-minute load average
+       to: sysadmin
+
+    alarm: load_average_1
+       on: system.load
+    class: Utilization
+     type: System
+component: Load
+       os: linux
+    hosts: *
+   lookup: max -1m unaligned of load1
+    units: load
+    every: 1m
+     warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
+    delay: down 15m multiplier 1.5 max 1h
+     info: system one-minute load average
+       to: sysadmin
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
new file mode 100644
index 0000000..cedaa00
--- /dev/null
+++ b/health/health.d/mdstat.conf
@@ -0,0 +1,52 @@
+ template: mdstat_last_collected
+       on: md.disks
+    class: Latency
+     type: System
+component: RAID
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+     info: number of seconds since the last successful data collection
+       to: sysadmin
+
+ template: mdstat_disks
+       on: md.disks
+    class: Errors
+     type: System
+component: RAID
+    units: failed devices
+    every: 10s
+     calc: $down
+     crit: $this > 0
+     info: number of devices in the down state for the $family array. \
+           Any number > 0 indicates that the array is degraded.
+       to: sysadmin
+
+ template: mdstat_mismatch_cnt
+       on: md.mismatch_cnt
+    class: Errors
+     type: System
+component: RAID
+ families: !*(raid1) !*(raid10) *
+    units: unsynchronized blocks
+     calc: $count
+    every: 60s
+     warn: $this > 1024
+    delay: up 30m
+     info: number of unsynchronized blocks for the $family array
+       to: sysadmin
+
+ template: mdstat_nonredundant_last_collected
+       on: md.nonredundant
+    class: Latency
+     type: System
+component: RAID
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+     info: number of seconds since the last successful data collection
+       to: sysadmin
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
new file mode 100644
index 0000000..9fbcfdb
--- /dev/null
+++ b/health/health.d/megacli.conf
@@ -0,0 +1,71 @@
+
+## Adapters (controllers)
+
+ template: megacli_adapter_state
+       on: megacli.adapter_degraded
+    class: Errors
+     type: System
+component: RAID
+   lookup: max -10s foreach *
+    units: boolean
+    every: 10s
+     crit: $this > 0
+    delay: down 5m multiplier 2 max 10m
+     info: adapter is in the degraded state (0: false, 1: true)
+       to: sysadmin
+
+## Physical Disks
+
+ template: megacli_pd_predictive_failures
+       on: megacli.pd_predictive_failure
+    class: Errors
+     type: System
+component: RAID
+   lookup: sum -10s foreach *
+    units: predictive failures
+    every: 10s
+     warn: $this > 0
+    delay: up 1m down 5m multiplier 2 max 10m
+     info: number of physical drive predictive failures
+       to: sysadmin
+
+ template: megacli_pd_media_errors
+       on: megacli.pd_media_error
+    class: Errors
+     type: System
+component: RAID
+   lookup: sum -10s foreach *
+    units: media errors
+    every: 10s
+     warn: $this > 0
+    delay: up 1m down 5m multiplier 2 max 10m
+     info: number of physical drive media errors
+       to: sysadmin
+
+## Battery Backup Units (BBU)
+
+ template: megacli_bbu_relative_charge
+       on: megacli.bbu_relative_charge
+    class: Workload
+     type: System
+component: RAID
+   lookup: average -10s
+    units: percent
+    every: 10s
+     warn: $this <= (($status >= $WARNING)  ? (85) : (80))
+     crit: $this <= (($status == $CRITICAL)  ? (50) : (40))
+     info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
+       to: sysadmin
+
+ template: megacli_bbu_cycle_count
+       on: megacli.bbu_cycle_count
+    class: Workload
+     type: System
+component: RAID
+   lookup: average -10s
+    units: cycles
+    every: 10s
+     warn: $this >= 100
+     crit: $this >= 500
+     info: average battery backup unit (BBU) charge cycles count over the last 10 seconds
+       to: sysadmin
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
new file mode 100644
index 0000000..2a2fe4b
--- /dev/null
+++ b/health/health.d/memcached.conf
@@ -0,0 +1,48 @@
+
+# detect if memcached cache is full
+
+ template: memcached_cache_memory_usage
+       on: memcached.cache
+    class: Utilization
+     type: KV Storage
+component: Memcached
+     calc: $used * 100 / ($used + $available)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    delay: up 0 down 15m multiplier 1.5 max 1h
+     info: cache memory utilization
+       to: dba
+
+
+# find the rate memcached cache is filling
+
+ template: memcached_cache_fill_rate
+       on: memcached.cache
+    class: Utilization
+     type: KV Storage
+component: Memcached
+   lookup: min -10m at -50m unaligned of available
+     calc: ($this - $available) / (($now - $after) / 3600)
+    units: KB/hour
+    every: 1m
+     info: average rate the cache fills up (positive), or frees up (negative) space over the last hour
+
+
+# find the hours remaining until memcached cache is full
+
+ template: memcached_out_of_cache_space_time
+       on: memcached.cache
+    class: Utilization
+     type: KV Storage
+component: Memcached
+     calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
+    units: hours
+    every: 10s
+     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+    delay: down 15m multiplier 1.5 max 1h
+     info: estimated time the cache will run out of space \
+           if the system continues to add data at the same rate as the past hour
+       to: dba
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
new file mode 100644
index 0000000..010cbbd
--- /dev/null
+++ b/health/health.d/memory.conf
@@ -0,0 +1,47 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+    alarm: 1hour_ecc_memory_correctable
+       on: mem.ecc_ce
+    class: Errors
+     type: System
+component: Memory
+       os: linux
+    hosts: *
+   lookup: sum -10m unaligned
+    units: errors
+    every: 1m
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: number of ECC correctable errors in the last 10 minutes
+       to: sysadmin
+
+    alarm: 1hour_ecc_memory_uncorrectable
+       on: mem.ecc_ue
+    class: Errors
+     type: System
+component: Memory
+       os: linux
+    hosts: *
+   lookup: sum -10m unaligned
+    units: errors
+    every: 1m
+     crit: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: number of ECC uncorrectable errors in the last 10 minutes
+       to: sysadmin
+
+    alarm: 1hour_memory_hw_corrupted
+       on: mem.hwcorrupt
+    class: Errors
+     type: System
+component: Memory
+       os: linux
+    hosts: *
+     calc: $HardwareCorrupted
+    units: MB
+    every: 10s
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: amount of memory corrupted due to a hardware failure
+       to: sysadmin
diff --git a/health/health.d/ml.conf b/health/health.d/ml.conf
new file mode 100644
index 0000000..6836ce7
--- /dev/null
+++ b/health/health.d/ml.conf
@@ -0,0 +1,53 @@
+# below are some examples of using the `anomaly-bit` option to define alerts based on anomaly 
+# rates as opposed to raw metric values. You can read more about the anomaly-bit and Netdata's 
+# native anomaly detection here: 
+# https://learn.netdata.cloud/docs/agent/ml#anomaly-bit---100--anomalous-0--normal
+
+# examples below are commented, you would need to uncomment and adjust as desired to enable them.
+
+# node level anomaly rate example
+# https://learn.netdata.cloud/docs/agent/ml#node-anomaly-rate
+# if node level anomaly rate is between 1-5% then warning (pick your own threshold that works best via trial and error).
+# if node level anomaly rate is above 5% then critical (pick your own threshold that works best via trial and error).
+# template: ml_1min_node_ar
+#       on: anomaly_detection.anomaly_rate
+#       os: linux
+#    hosts: *
+#   lookup: average -1m foreach anomaly_rate
+#     calc: $this
+#    units: %
+#    every: 30s
+#     warn: $this > (($status >= $WARNING)  ? (1) : (5))
+#     crit: $this > (($status == $CRITICAL) ? (5) : (100))
+#     info: rolling 1min node level anomaly rate
+
+# alert per dimension example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_dims
+#       on: system.cpu
+#       os: linux
+#    hosts: *
+#   lookup: average -5m anomaly-bit foreach *
+#     calc: $this
+#    units: %
+#    every: 30s
+#     warn: $this > (($status >= $WARNING)  ? (5) : (20))
+#     crit: $this > (($status == $CRITICAL) ? (20) : (100))
+#     info: rolling 5min anomaly rate for each system.cpu dimension
+
+# alert per chart example
+# if anomaly rate is between 5-20% then warning (pick your own threshold that works best via trial and error).
+# if anomaly rate is above 20% then critical (pick your own threshold that works best via trial and error).
+# template: ml_5min_cpu_chart
+#       on: system.cpu
+#       os: linux
+#    hosts: *
+#   lookup: average -5m anomaly-bit of *
+#     calc: $this
+#    units: %
+#    every: 30s
+#     warn: $this > (($status >= $WARNING)  ? (5) : (20))
+#     crit: $this > (($status == $CRITICAL) ? (20) : (100))
+#     info: rolling 5min anomaly rate for system.cpu chart
+
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
new file mode 100644
index 0000000..3941c71
--- /dev/null
+++ b/health/health.d/mysql.conf
@@ -0,0 +1,176 @@
+
+# slow queries
+
+ template: mysql_10s_slow_queries
+       on: mysql.queries
+    class: Latency
+     type: Database
+component: MySQL
+   lookup: sum -10s of slow_queries
+    units: slow queries
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+     crit: $this > (($status == $CRITICAL) ? (10) : (20))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of slow queries in the last 10 seconds
+       to: dba
+
+
+# -----------------------------------------------------------------------------
+# lock waits
+
+ template: mysql_10s_table_locks_immediate
+       on: mysql.table_locks
+    class: Utilization
+     type: Database
+component: MySQL
+   lookup: sum -10s absolute of immediate
+    units: immediate locks
+    every: 10s
+     info: number of table immediate locks in the last 10 seconds
+       to: dba
+
+ template: mysql_10s_table_locks_waited
+       on: mysql.table_locks
+    class: Latency
+     type: Database
+component: MySQL
+   lookup: sum -10s absolute of waited
+    units: waited locks
+    every: 10s
+     info: number of table waited locks in the last 10 seconds
+       to: dba
+
+ template: mysql_10s_waited_locks_ratio
+       on: mysql.table_locks
+    class: Latency
+     type: Database
+component: MySQL
+     calc: ( ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate) > 0 ) ? (($mysql_10s_table_locks_waited * 100) / ($mysql_10s_table_locks_waited + $mysql_10s_table_locks_immediate)) : 0
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (10) : (25))
+     crit: $this > (($status == $CRITICAL) ? (25) : (50))
+    delay: down 30m multiplier 1.5 max 1h
+     info: ratio of waited table locks over the last 10 seconds
+       to: dba
+
+
+# -----------------------------------------------------------------------------
+# connections
+
+ template: mysql_connections
+       on: mysql.connections_active
+    class: Utilization
+     type: Database
+component: MySQL
+     calc: $active * 100 / $limit
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (60) : (70))
+     crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    delay: down 15m multiplier 1.5 max 1h
+     info: client connections utilization
+       to: dba
+
+
+# -----------------------------------------------------------------------------
+# replication
+
+ template: mysql_replication
+       on: mysql.slave_status
+    class: Errors
+     type: Database
+component: MySQL
+     calc: ($sql_running <= 0 OR $io_running <= 0)?0:1
+    units: ok/failed
+    every: 10s
+     crit: $this == 0
+    delay: down 5m multiplier 1.5 max 1h
+     info: replication status (0: stopped, 1: working)
+       to: dba
+
+ template: mysql_replication_lag
+       on: mysql.slave_behind
+    class: Latency
+     type: Database
+component: MySQL
+     calc: $seconds
+    units: seconds
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
+     crit: $this > (($status == $CRITICAL) ? (10) : (30))
+    delay: down 15m multiplier 1.5 max 1h
+     info: difference between the timestamp of the latest transaction processed by the SQL thread and \
+           the timestamp of the same transaction when it was processed on the master
+       to: dba
+
+
+# -----------------------------------------------------------------------------
+# galera cluster size
+
+ template: mysql_galera_cluster_size_max_2m
+       on: mysql.galera_cluster_size
+    class: Utilization
+     type: Database
+component: MySQL
+   lookup: max -2m at -1m unaligned
+    units: nodes
+    every: 10s
+     info: maximum galera cluster size in the last 2 minutes starting one minute ago
+       to: dba
+
+ template: mysql_galera_cluster_size
+       on: mysql.galera_cluster_size
+    class: Utilization
+     type: Database
+component: MySQL
+     calc: $nodes
+    units: nodes
+    every: 10s
+     warn: $this > $mysql_galera_cluster_size_max_2m
+     crit: $this < $mysql_galera_cluster_size_max_2m
+    delay: up 20s down 5m multiplier 1.5 max 1h
+     info: current galera cluster size, compared to the maximum size in the last 2 minutes
+       to: dba
+
+# galera node state
+
+ template: mysql_galera_cluster_state_warn
+       on: mysql.galera_cluster_state
+    class: Errors
+     type: Database
+component: MySQL
+     calc: $donor + $joined
+    every: 10s
+     warn: $this != nan AND $this != 0
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: galera node state is either Donor/Desynced or Joined.
+       to: dba
+
+ template: mysql_galera_cluster_state_crit
+       on: mysql.galera_cluster_state
+    class: Errors
+     type: Database
+component: MySQL
+     calc: $undefined + $joining + $error
+    every: 10s
+     crit: $this != nan AND $this != 0
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: galera node state is either Undefined or Joining or Error.
+       to: dba
+
+# galera node status
+
+ template: mysql_galera_cluster_status
+       on: mysql.galera_cluster_status
+    class: Errors
+     type: Database
+component: MySQL
+     calc: $primary
+    every: 10s
+     crit: $this != nan AND $this != 1
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: galera node is part of a nonoperational component. \
+           This occurs in cases of multiple membership changes that result in a loss of Quorum or in cases of split-brain situations.
+       to: dba
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
new file mode 100644
index 0000000..9d5b3b8
--- /dev/null
+++ b/health/health.d/net.conf
@@ -0,0 +1,256 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+# net traffic overflow
+
+ template: interface_speed
+       on: net.net
+    class: Latency
+     type: System
+component: Network
+       os: *
+    hosts: *
+ families: *
+     calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan )
+    units: Mbit
+    every: 10s
+     info: network interface $family current speed
+
+ template: 1m_received_traffic_overflow
+       on: net.net
+    class: Workload
+     type: System
+component: Network
+       os: linux
+    hosts: *
+ families: *
+   lookup: average -1m unaligned absolute of received
+     calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan )
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    delay: up 1m down 1m multiplier 1.5 max 1h
+     info: average inbound utilization for the network interface $family over the last minute
+       to: sysadmin
+
+ template: 1m_sent_traffic_overflow
+       on: net.net
+    class: Workload
+     type: System
+component: Network
+       os: linux
+    hosts: *
+ families: *
+   lookup: average -1m unaligned absolute of sent
+     calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan )
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    delay: up 1m down 1m multiplier 1.5 max 1h
+     info: average outbound utilization for the network interface $family over the last minute
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
+# dropped packets
+
+# check if an interface is dropping packets
+# the alarm is checked every 1 minute
+# and examines the last 10 minutes of data
+#
+# it is possible to have expected packet drops on an interface for some network configurations
+# look at the Monitoring Network Interfaces section in the proc.plugin documentation for more information
+
+ template: inbound_packets_dropped
+       on: net.drops
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute of inbound
+    units: packets
+    every: 1m
+     info: number of inbound dropped packets for the network interface $family in the last 10 minutes
+
+ template: outbound_packets_dropped
+       on: net.drops
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute of outbound
+    units: packets
+    every: 1m
+     info: number of outbound dropped packets for the network interface $family in the last 10 minutes
+
+ template: inbound_packets_dropped_ratio
+       on: net.packets
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+ families: !wl* *
+   lookup: sum -10m unaligned absolute of received
+     calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
+       to: sysadmin
+
+ template: outbound_packets_dropped_ratio
+       on: net.packets
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+ families: !wl* *
+   lookup: sum -10m unaligned absolute of sent
+     calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
+       to: sysadmin
+
+ template: wifi_inbound_packets_dropped_ratio
+       on: net.packets
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+ families: wl*
+   lookup: sum -10m unaligned absolute of received
+     calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 10
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of inbound dropped packets for the network interface $family over the last 10 minutes
+       to: sysadmin
+
+ template: wifi_outbound_packets_dropped_ratio
+       on: net.packets
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+ families: wl*
+   lookup: sum -10m unaligned absolute of sent
+     calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 10
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of outbound dropped packets for the network interface $family over the last 10 minutes
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
+# interface errors
+
+ template: interface_inbound_errors
+       on: net.errors
+    class: Errors
+     type: System
+component: Network
+       os: freebsd
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute of inbound
+    units: errors
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of inbound errors for the network interface $family in the last 10 minutes
+       to: sysadmin
+
+ template: interface_outbound_errors
+       on: net.errors
+    class: Errors
+     type: System
+component: Network
+       os: freebsd
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute of outbound
+    units: errors
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of outbound errors for the network interface $family in the last 10 minutes
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
+# FIFO errors
+
+# check if an interface is having FIFO
+# buffer errors
+# the alarm is checked every 1 minute
+# and examines the last 10 minutes of data
+
+ template: 10min_fifo_errors
+       on: net.fifo
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute
+    units: errors
+    every: 1m
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of FIFO errors for the network interface $family in the last 10 minutes
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
+# check for packet storms
+
+# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+# 2. do the same for the last 10s
+# 3. raise an alarm if the latter is 50x (warning) or 60x (critical) the first
+# the 1m rate is floored at 1000 packets/s, so a packet storm must average
+# at least 50000 packets/s over the last 10 seconds to raise an alarm
+
+ template: 1m_received_packets_rate
+       on: net.packets
+    class: Workload
+     type: System
+component: Network
+       os: linux freebsd
+    hosts: *
+ families: *
+   lookup: average -1m unaligned of received
+    units: packets
+    every: 10s
+     info: average number of packets received by the network interface $family over the last minute
+
+ template: 10s_received_packets_storm
+       on: net.packets
+    class: Workload
+     type: System
+component: Network
+       os: linux freebsd
+    hosts: *
+ families: *
+   lookup: average -10s unaligned of received
+     calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
+    every: 10s
+    units: %
+     warn: $this > (($status >= $WARNING)?(200):(5000))
+     crit: $this > (($status == $CRITICAL)?(5000):(6000))
+  options: no-clear-notification
+     info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \
+           compared to the rate over the last minute
+       to: sysadmin
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
new file mode 100644
index 0000000..7de383f
--- /dev/null
+++ b/health/health.d/netfilter.conf
@@ -0,0 +1,19 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+    alarm: netfilter_conntrack_full
+       on: netfilter.conntrack_sockets
+    class: Workload
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: max -10s unaligned of connections
+     calc: $this * 100 / $netfilter_conntrack_max
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (95))
+    delay: down 5m multiplier 1.5 max 1h
+     info: netfilter connection tracker table size utilization
+       to: sysadmin
diff --git a/health/health.d/nut.conf b/health/health.d/nut.conf
new file mode 100644
index 0000000..6231dd9
--- /dev/null
+++ b/health/health.d/nut.conf
@@ -0,0 +1,47 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: nut_10min_ups_load
+       on: nut.load
+    class: Utilization
+     type: Power Supply
+component: UPS
+       os: *
+    hosts: *
+   lookup: average -10m unaligned of load
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 10m multiplier 1.5 max 1h
+     info: average UPS load over the last 10 minutes
+       to: sitemgr
+
+ template: nut_ups_charge
+       on: nut.charge
+    class: Errors
+     type: Power Supply
+component: UPS
+       os: *
+    hosts: *
+   lookup: average -60s unaligned of battery_charge
+    units: %
+    every: 60s
+     warn: $this < 100
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 10m multiplier 1.5 max 1h
+     info: average UPS charge over the last minute
+       to: sitemgr
+
+ template: nut_last_collected_secs
+       on: nut.load
+    class: Latency
+     type: Power Supply
+component: UPS device
+     calc: $now - $last_collected_t
+    every: 10s
+    units: seconds ago
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: sitemgr
diff --git a/health/health.d/nvme.conf b/health/health.d/nvme.conf
new file mode 100644
index 0000000..5f729d5
--- /dev/null
+++ b/health/health.d/nvme.conf
@@ -0,0 +1,15 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: nvme_device_critical_warnings_state
+ families: *
+       on: nvme.device_critical_warnings_state
+    class: Errors
+     type: System
+component: Disk
+   lookup: max -30s unaligned
+    units: state
+    every: 10s
+     crit: $this != nan AND $this != 0
+    delay: down 5m multiplier 1.5 max 2h
+     info: NVMe device $label:device has critical warnings
+       to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
new file mode 100644
index 0000000..ee6c57c
--- /dev/null
+++ b/health/health.d/pihole.conf
@@ -0,0 +1,32 @@
+
+# Blocklist last update time.
+# Default update interval is a week.
+
+ template: pihole_blocklist_last_update
+       on: pihole.blocklist_last_update
+    class: Errors
+     type: Ad Filtering
+component: Pi-hole
+    every: 10s
+    units: seconds
+     calc: $ago
+     warn: $this > 60 * 60 * 24 * 8
+     crit: $this > 60 * 60 * 24 * 8 * 2
+     info: gravity.list (blocklist) file last update time
+       to: sysadmin
+
+# Pi-hole's ability to block unwanted domains.
+# Should be enabled. The whole point of Pi-hole!
+
+ template: pihole_status
+       on: pihole.unwanted_domains_blocking_status
+    class: Errors
+     type: Ad Filtering
+component: Pi-hole
+    every: 10s
+    units: status
+     calc: $disabled
+     warn: $this != nan AND $this == 1
+    delay: up 2m down 5m
+     info: unwanted domains blocking is disabled
+       to: sysadmin
diff --git a/health/health.d/ping.conf b/health/health.d/ping.conf
new file mode 100644
index 0000000..cbe7c30
--- /dev/null
+++ b/health/health.d/ping.conf
@@ -0,0 +1,50 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: ping_host_reachable
+ families: *
+       on: ping.host_packet_loss
+    class: Errors
+     type: Other
+component: Network
+   lookup: average -30s unaligned of loss
+     calc: $this != nan AND $this < 100
+    units: up/down
+    every: 10s
+     crit: $this == 0
+    delay: down 30m multiplier 1.5 max 2h
+     info: network host $label:host reachability status
+       to: sysadmin
+
+ template: ping_packet_loss
+ families: *
+       on: ping.host_packet_loss
+    class: Errors
+     type: Other
+component: Network
+   lookup: average -10m unaligned of loss
+    green: 5
+      red: 10
+    units: %
+    every: 10s
+     warn: $this > $green
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: packet loss percentage to the network host $label:host over the last 10 minutes
+       to: sysadmin
+
+ template: ping_host_latency
+ families: *
+       on: ping.host_rtt
+    class: Latency
+     type: Other
+component: Network
+   lookup: average -10s unaligned of avg
+    units: ms
+    every: 10s
+    green: 500
+      red: 1000
+     warn: $this > $green OR $max > $red
+     crit: $this > $red
+    delay: down 30m multiplier 1.5 max 2h
+     info: average latency to the network host $label:host over the last 10 seconds
+       to: sysadmin
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
new file mode 100644
index 0000000..8cbd772
--- /dev/null
+++ b/health/health.d/portcheck.conf
@@ -0,0 +1,44 @@
+
+# This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
+ template: portcheck_service_reachable
+ families: *
+       on: portcheck.status
+    class: Workload
+     type: Other
+component: TCP endpoint
+   lookup: average -1m unaligned percentage of success
+     calc: ($this < 75) ? (0) : ($this)
+    every: 5s
+    units: up/down
+     info: average ratio of successful connections over the last minute (at least 75%)
+       to: silent
+
+ template: portcheck_connection_timeouts
+ families: *
+       on: portcheck.status
+    class: Errors
+     type: Other
+component: TCP endpoint
+   lookup: average -5m unaligned percentage of timeout
+    every: 10s
+    units: %
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: average ratio of timeouts over the last 5 minutes
+       to: sysadmin
+
+ template: portcheck_connection_fails
+ families: *
+       on: portcheck.status
+    class: Errors
+     type: Other
+component: TCP endpoint
+   lookup: average -5m unaligned percentage of no_connection,failed
+    every: 10s
+    units: %
+     warn: $this >= 10 AND $this < 40
+     crit: $this >= 40
+    delay: down 5m multiplier 1.5 max 1h
+     info: average ratio of failed connections over the last 5 minutes
+       to: sysadmin
diff --git a/health/health.d/postgres.conf b/health/health.d/postgres.conf
new file mode 100644
index 0000000..66d034c
--- /dev/null
+++ b/health/health.d/postgres.conf
@@ -0,0 +1,214 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: postgres_total_connection_utilization
+       on: postgres.connections_utilization
+    class: Utilization
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of used
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (70) : (80))
+     crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average total connection utilization over the last minute
+       to: dba
+
+ template: postgres_acquired_locks_utilization
+       on: postgres.locks_utilization
+    class: Utilization
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of used
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (15) : (20))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average acquired locks utilization over the last minute
+       to: dba
+
+ template: postgres_txid_exhaustion_perc
+       on: postgres.txid_exhaustion_perc
+    class: Utilization
+     type: Database
+component: PostgreSQL
+    hosts: *
+     calc: $txid_exhaustion
+    units: %
+    every: 1m
+     warn: $this > 90
+    delay: down 15m multiplier 1.5 max 1h
+     info: percent towards TXID wraparound
+       to: dba
+
+# Database alarms
+
+ template: postgres_db_cache_io_ratio
+       on: postgres.db_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average cache hit ratio in db $label:database over the last minute
+       to: dba
+
+ template: postgres_db_transactions_rollback_ratio
+       on: postgres.db_transactions_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -5m unaligned of rollback
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (0) : (2))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average aborted transactions percentage in db $label:database over the last five minutes
+       to: dba
+
+ template: postgres_db_deadlocks_rate
+       on: postgres.db_deadlocks_rate
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: sum -1m unaligned of deadlocks
+    units: deadlocks
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (0) : (10))
+    delay: down 15m multiplier 1.5 max 1h
+     info: number of deadlocks detected in db $label:database in the last minute
+       to: dba
+
+# Table alarms
+
+ template: postgres_table_cache_io_ratio
+       on: postgres.table_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average cache hit ratio in db $label:database table $label:table over the last minute
+       to: dba
+
+ template: postgres_table_index_cache_io_ratio
+       on: postgres.table_index_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average index cache hit ratio in db $label:database table $label:table over the last minute
+       to: dba
+
+ template: postgres_table_toast_cache_io_ratio
+       on: postgres.table_toast_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average TOAST hit ratio in db $label:database table $label:table over the last minute
+       to: dba
+
+ template: postgres_table_toast_index_cache_io_ratio
+       on: postgres.table_toast_index_cache_io_ratio
+    class: Workload
+     type: Database
+component: PostgreSQL
+    hosts: *
+   lookup: average -1m unaligned of miss
+     calc: 100 - $this
+    units: %
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (70) : (60))
+     crit: $this < (($status == $CRITICAL) ? (60) : (50))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average index TOAST hit ratio in db $label:database table $label:table over the last minute
+       to: dba
+
+ template: postgres_table_bloat_size_perc
+       on: postgres.table_bloat_size_perc
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: *
+     calc: $bloat
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (60) : (70))
+     crit: $this > (($status == $CRITICAL) ? (70) : (80))
+    delay: down 15m multiplier 1.5 max 1h
+     info: bloat size percentage in db $label:database table $label:table
+       to: dba
+
+ template: postgres_table_last_autovacuum_time
+       on: postgres.table_autovacuum_since_time
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: !*
+     calc: $time
+    units: seconds
+    every: 1m
+     warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+     info: time elapsed since db $label:database table $label:table was vacuumed by the autovacuum daemon
+       to: dba
+
+ template: postgres_table_last_autoanalyze_time
+       on: postgres.table_autoanalyze_since_time
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: !*
+     calc: $time
+    units: seconds
+    every: 1m
+     warn: $this != nan AND $this > (60 * 60 * 24 * 7)
+     info: time elapsed since db $label:database table $label:table was analyzed by the autovacuum daemon
+       to: dba
+
+# Index alarms
+
+ template: postgres_index_bloat_size_perc
+       on: postgres.index_bloat_size_perc
+    class: Errors
+     type: Database
+component: PostgreSQL
+    hosts: *
+     calc: $bloat
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (60) : (70))
+     crit: $this > (($status == $CRITICAL) ? (70) : (80))
+    delay: down 15m multiplier 1.5 max 1h
+     info: bloat size percentage in db $label:database table $label:table index $label:index
+       to: dba
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
new file mode 100644
index 0000000..2929ee3
--- /dev/null
+++ b/health/health.d/processes.conf
@@ -0,0 +1,16 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+    alarm: active_processes
+       on: system.active_processes
+    class: Workload
+     type: System
+component: Processes
+    hosts: *
+     calc: $active * 100 / $pidmax
+    units: %
+    every: 5s
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (95))
+    delay: down 5m multiplier 1.5 max 1h
+     info: system process IDs (PID) space utilization
+       to: sysadmin
diff --git a/health/health.d/python.d.plugin.conf b/health/health.d/python.d.plugin.conf
new file mode 100644
index 0000000..0e81a48
--- /dev/null
+++ b/health/health.d/python.d.plugin.conf
@@ -0,0 +1,17 @@
+
+# make sure python.d.plugin data collection job is running
+
+ template: python.d_job_last_collected_secs
+       on: netdata.pythond_runtime
+    class: Errors
+     type: Netdata
+component: python.d.plugin
+   module: !* *
+     calc: $now - $last_collected_t
+    units: seconds ago
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
+     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+    delay: down 5m multiplier 1.5 max 1h
+     info: number of seconds since the last successful data collection
+       to: webmaster
diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf
new file mode 100644
index 0000000..7290d15
--- /dev/null
+++ b/health/health.d/qos.conf
@@ -0,0 +1,18 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# check if a QoS class is dropping packets
+# the alarm is checked every 30 seconds
+# and examines the last 10 minutes of data
+
+#template: 10min_qos_packet_drops
+#      on: tc.qos_dropped
+#      os: linux
+#   hosts: *
+#  lookup: sum -10m unaligned absolute
+#   every: 30s
+#    warn: $this > 0
+#   delay: up 0 down 30m multiplier 1.5 max 1h
+#   units: packets
+#    info: dropped packets in the last 10 minutes
+#      to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
new file mode 100644
index 0000000..ab382c4
--- /dev/null
+++ b/health/health.d/ram.conf
@@ -0,0 +1,80 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+    alarm: ram_in_use
+       on: system.ram
+    class: Utilization
+     type: System
+component: Memory
+       os: linux
+    hosts: *
+     calc: $used * 100 / ($used + $cached + $free + $buffers)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: system memory utilization
+       to: sysadmin
+
+    alarm: ram_available
+       on: mem.available
+    class: Utilization
+     type: System
+component: Memory
+       os: linux
+    hosts: *
+     calc: $avail * 100 / ($system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers)
+    units: %
+    every: 10s
+     warn: $this < (($status >= $WARNING)  ? (15) : (10))
+     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+    delay: down 15m multiplier 1.5 max 1h
+     info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
+       to: sysadmin
+
+      alarm: oom_kill
+         on: mem.oom_kill
+         os: linux
+      hosts: *
+     lookup: sum -30m unaligned
+      units: kills
+      every: 5m
+       warn: $this > 0
+      delay: down 10m
+host labels: _is_k8s_node = false
+       info: number of out of memory kills in the last 30 minutes
+         to: sysadmin
+
+## FreeBSD
+    alarm: ram_in_use
+       on: system.ram
+    class: Utilization
+     type: System
+component: Memory
+       os: freebsd
+    hosts: *
+     calc: ($active + $wired + $laundry + $buffers) * 100 / ($active + $wired + $laundry + $buffers + $cache + $free + $inactive)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: system memory utilization
+       to: sysadmin
+
+    alarm: ram_available
+       on: mem.available
+    class: Utilization
+     type: System
+component: Memory
+       os: freebsd
+    hosts: *
+     calc: $avail * 100 / ($system.ram.free + $system.ram.active + $system.ram.inactive + $system.ram.wired + $system.ram.cache + $system.ram.laundry + $system.ram.buffers)
+    units: %
+    every: 10s
+     warn: $this < (($status >= $WARNING)  ? (15) : (10))
+     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
+    delay: down 15m multiplier 1.5 max 1h
+     info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
+       to: sysadmin
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
new file mode 100644
index 0000000..34d00b5
--- /dev/null
+++ b/health/health.d/redis.conf
@@ -0,0 +1,57 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: redis_connections_rejected
+ families: *
+       on: redis.connections
+    class: Errors
+     type: KV Storage
+component: Redis
+   lookup: sum -1m unaligned of rejected
+    every: 10s
+    units: connections
+     warn: $this > 0
+     info: connections rejected because of maxclients limit in the last minute
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba
+
+ template: redis_bgsave_broken
+ families: *
+       on: redis.bgsave_health
+    class: Errors
+     type: KV Storage
+component: Redis
+    every: 10s
+     crit: $last_bgsave != nan AND $last_bgsave != 0
+    units: ok/failed
+     info: status of the last RDB save operation (0: ok, 1: error)
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba
+
+ template: redis_bgsave_slow
+ families: *
+       on: redis.bgsave_now
+    class: Latency
+     type: KV Storage
+component: Redis
+    every: 10s
+     calc: $current_bgsave_time
+     warn: $this > 600
+     crit: $this > 1200
+    units: seconds
+     info: duration of the on-going RDB save operation
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba
+
+ template: redis_master_link_down
+ families: *
+       on: redis.master_link_down_since_time
+    class: Errors
+     type: KV Storage
+component: Redis
+    every: 10s
+     calc: $time
+    units: seconds
+     crit: $this != nan AND $this > 0
+     info: time elapsed since the link between master and slave is down
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba
diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf
new file mode 100644
index 0000000..14aa76b
--- /dev/null
+++ b/health/health.d/retroshare.conf
@@ -0,0 +1,16 @@
+
+# make sure the DHT is fine when active
+
+ template: retroshare_dht_working
+       on: retroshare.dht
+    class: Utilization
+     type: Data Sharing
+component: Retroshare
+     calc: $dht_size_all
+    units: peers
+    every: 1m
+     warn: $this < (($status >= $WARNING)  ? (120) : (100))
+     crit: $this < (($status == $CRITICAL) ? (10)  : (1))
+    delay: up 0 down 15m multiplier 1.5 max 1h
+     info: number of DHT peers
+       to: sysadmin
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
new file mode 100644
index 0000000..261fd48
--- /dev/null
+++ b/health/health.d/riakkv.conf
@@ -0,0 +1,93 @@
+
+# Warn if a list keys operation is running.
+ template: riakkv_list_keys_active
+       on: riak.core.fsm_active
+    class: Utilization
+     type: Database
+component: Riak KV
+     calc: $list_fsm_active
+    units: state machines
+    every: 10s
+     warn: $list_fsm_active > 0
+     info: number of currently running list keys finite state machines
+       to: dba
+
+
+## Timing healthchecks
+# KV GET
+ template: riakkv_1h_kv_get_mean_latency
+       on: riak.kv.latency.get
+    class: Latency
+     type: Database
+component: Riak KV
+     calc: $node_get_fsm_time_mean
+   lookup: average -1h unaligned of time
+    every: 30s
+    units: ms
+     info: average time between reception of client GET request and \
+           subsequent response to client over the last hour
+
+ template: riakkv_kv_get_slow
+       on: riak.kv.latency.get
+    class: Latency
+     type: Database
+component: Riak KV
+     calc: $mean
+   lookup: average -3m unaligned of time
+    units: ms
+    every: 10s
+     warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
+     crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
+     info: average time between reception of client GET request and \
+           subsequent response to the client over the last 3 minutes, \
+           compared to the average over the last hour
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba
+
+# KV PUT
+ template: riakkv_1h_kv_put_mean_latency
+       on: riak.kv.latency.put
+    class: Latency
+     type: Database
+component: Riak KV
+     calc: $node_put_fsm_time_mean
+   lookup: average -1h unaligned of time
+    every: 30s
+    units: ms
+     info: average time between reception of client PUT request and \
+           subsequent response to the client over the last hour
+
+ template: riakkv_kv_put_slow
+       on: riak.kv.latency.put
+    class: Latency
+     type: Database
+component: Riak KV
+     calc: $mean
+   lookup: average -3m unaligned of time
+    units: ms
+    every: 10s
+     warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
+     crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
+     info: average time between reception of client PUT request and \
+           subsequent response to the client over the last 3 minutes, \
+           compared to the average over the last hour
+    delay: down 5m multiplier 1.5 max 1h
+       to: dba
+
+
+## VM healthchecks
+
+# Default Erlang VM process limit: 262144
+# On systems observed, this is < 2000, but may grow depending on load.
+ template: riakkv_vm_high_process_count
+       on: riak.vm
+    class: Utilization
+     type: Database
+component: Riak KV
+     calc: $sys_process_count
+    units: processes
+    every: 10s
+     warn: $this > 10000
+     crit: $this > 100000
+     info: number of processes running in the Erlang VM
+       to: dba
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
new file mode 100644
index 0000000..ab110bf
--- /dev/null
+++ b/health/health.d/scaleio.conf
@@ -0,0 +1,31 @@
+
+# make sure Storage Pool capacity utilization is under limit
+
+ template: scaleio_storage_pool_capacity_utilization
+       on: scaleio.storage_pool_capacity_utilization
+    class: Utilization
+     type: Storage
+component: ScaleIO
+     calc: $used
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: storage pool capacity utilization
+       to: sysadmin
+
+
+# make sure Sdc is connected to MDM
+
+ template: scaleio_sdc_mdm_connection_state
+       on: scaleio.sdc_mdm_connection_state
+    class: Utilization
+     type: Storage
+component: ScaleIO
+     calc: $connected
+    every: 10s
+     warn: $this != 1
+    delay: up 30s down 5m multiplier 1.5 max 1h
+     info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected)
+       to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
new file mode 100644
index 0000000..345f875
--- /dev/null
+++ b/health/health.d/softnet.conf
@@ -0,0 +1,54 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# check for common /proc/net/softnet_stat errors
+
+    alarm: 1min_netdev_backlog_exceeded
+       on: system.softnet_stat
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: average -1m unaligned absolute of dropped
+    units: packets
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+    delay: down 1h multiplier 1.5 max 2h
+     info: average number of dropped packets in the last minute \
+           due to exceeded net.core.netdev_max_backlog
+       to: sysadmin
+
+    alarm: 1min_netdev_budget_ran_outs
+       on: system.softnet_stat
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: average -1m unaligned absolute of squeezed
+    units: events
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+    delay: down 1h multiplier 1.5 max 2h
+     info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
+           net.core.netdev_budget_usecs with work remaining over the last minute \
+           (this can be a cause for dropped packets)
+       to: silent
+
+    alarm: 10min_netisr_backlog_exceeded
+       on: system.softnet_stat
+    class: Errors
+     type: System
+component: Network
+       os: freebsd
+    hosts: *
+   lookup: average -1m unaligned absolute of qdrops
+    units: packets
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+    delay: down 1h multiplier 1.5 max 2h
+     info: average number of drops in the last minute \
+           due to exceeded sysctl net.route.netisr_maxqlen \
+           (this can be a cause for dropped packets)
+       to: sysadmin
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
new file mode 100644
index 0000000..d30c74c
--- /dev/null
+++ b/health/health.d/swap.conf
@@ -0,0 +1,35 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+    alarm: 30min_ram_swapped_out
+       on: system.swapio
+    class: Workload
+     type: System
+component: Memory
+       os: linux freebsd
+    hosts: *
+   lookup: sum -30m unaligned absolute of out
+           # $this (the lookup result) is in KB: divide by 1024 to get MB, then express it as a share of total RAM
+     calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free + $system.ram.buffers )
+    units: % of RAM
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (20) : (30))
+    delay: down 15m multiplier 1.5 max 1h
+     info: percentage of the system RAM swapped in the last 30 minutes
+       to: sysadmin
+
+    alarm: used_swap
+       on: system.swap
+    class: Utilization
+     type: System
+component: Memory
+       os: linux freebsd
+    hosts: *
+     calc: (($used + $free) > 0) ? ($used * 100 / ($used + $free)) : 0
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: up 30s down 15m multiplier 1.5 max 1h
+     info: swap memory utilization
+       to: sysadmin
diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf
new file mode 100644
index 0000000..417624a
--- /dev/null
+++ b/health/health.d/synchronization.conf
@@ -0,0 +1,12 @@
+   alarm: sync_freq
+      on: mem.sync
+  lookup: sum -1m of sync
+   units: calls
+  plugin: ebpf.plugin
+   every: 1m
+    warn: $this > 6
+   delay: up 1m down 10m multiplier 1.5 max 1h
+    info: number of sync() system calls. \
+          Every call causes all pending modifications to filesystem metadata and \
+          cached file data to be written to the underlying filesystems.
+      to: sysadmin
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
new file mode 100644
index 0000000..531d62f
--- /dev/null
+++ b/health/health.d/systemdunits.conf
@@ -0,0 +1,141 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+## Service units
+ template: systemd_service_unit_failed_state
+       on: systemd.service_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd service unit in the failed state
+       to: sysadmin
+
+## Socket units
+ template: systemd_socket_unit_failed_state
+       on: systemd.socket_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd socket unit in the failed state
+       to: sysadmin
+
+## Target units
+ template: systemd_target_unit_failed_state
+       on: systemd.target_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd target unit in the failed state
+       to: sysadmin
+
+## Path units
+ template: systemd_path_unit_failed_state
+       on: systemd.path_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd path unit in the failed state
+       to: sysadmin
+
+## Device units
+ template: systemd_device_unit_failed_state
+       on: systemd.device_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd device unit in the failed state
+       to: sysadmin
+
+## Mount units
+ template: systemd_mount_unit_failed_state
+       on: systemd.mount_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd mount units in the failed state
+       to: sysadmin
+
+## Automount units
+ template: systemd_automount_unit_failed_state
+       on: systemd.automount_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd automount unit in the failed state
+       to: sysadmin
+
+## Swap units
+ template: systemd_swap_unit_failed_state
+       on: systemd.swap_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd swap units in the failed state
+       to: sysadmin
+
+## Scope units
+ template: systemd_scope_unit_failed_state
+       on: systemd.scope_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd scope units in the failed state
+       to: sysadmin
+
+## Slice units
+ template: systemd_slice_unit_failed_state
+       on: systemd.slice_unit_state
+    class: Errors
+     type: Linux
+component: Systemd units
+     calc: $failed
+    units: state
+    every: 10s
+     warn: $this != nan AND $this == 1
+    delay: down 5m multiplier 1.5 max 1h
+     info: systemd slice units in the failed state
+       to: sysadmin
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
new file mode 100644
index 0000000..67b3bee
--- /dev/null
+++ b/health/health.d/tcp_conn.conf
@@ -0,0 +1,22 @@
+
+#
+# ${tcp_max_connections} may be nan or -1 if the system
+# supports a dynamic threshold for TCP connections.
+# In this case, the calculated value is always zero, so the alarm never triggers.
+#
+
+    alarm: tcp_connections
+       on: ipv4.tcpsock
+    class: Workload
+     type: System
+component: Network
+       os: linux
+    hosts: *
+     calc: (${tcp_max_connections} > 0) ? ( ${connections} * 100 / ${tcp_max_connections} ) : 0
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
+     crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: IPv4 TCP connections utilization
+       to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
new file mode 100644
index 0000000..d4bcfa2
--- /dev/null
+++ b/health/health.d/tcp_listen.conf
@@ -0,0 +1,96 @@
+#
+# There are two queues involved when incoming TCP connections are handled
+# (both at the kernel):
+#
+# SYN queue
+# The SYN queue tracks TCP handshakes until connections are fully established.
+# It overflows when too many incoming TCP connection requests hang in the
+# half-open state and the server is not configured to fall back to SYN cookies.
+# Overflows are usually caused by SYN flood DoS attacks (i.e. someone sends
+# lots of SYN packets and never completes the handshakes).
+#
+# Accept queue
+# The accept queue holds fully established TCP connections waiting to be handled
+# by the listening application. It overflows when the server application fails
+# to accept new connections at the rate they are coming in.
+#
+#
+# -----------------------------------------------------------------------------
+# tcp accept queue (at the kernel)
+
+    alarm: 1m_tcp_accept_queue_overflows
+       on: ip.tcp_accept_queue
+    class: Workload
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: average -60s unaligned absolute of ListenOverflows
+    units: overflows
+    every: 10s
+     warn: $this > 1
+     crit: $this > (($status == $CRITICAL) ? (1) : (5))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: average number of overflows in the TCP accept queue over the last minute
+       to: sysadmin
+
+# THIS IS TOO GENERIC
+# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
+    alarm: 1m_tcp_accept_queue_drops
+       on: ip.tcp_accept_queue
+    class: Workload
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: average -60s unaligned absolute of ListenDrops
+    units: drops
+    every: 10s
+     warn: $this > 1
+     crit: $this > (($status == $CRITICAL) ? (1) : (5))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: average number of dropped packets in the TCP accept queue over the last minute
+       to: sysadmin
+
+
+# -----------------------------------------------------------------------------
+# tcp SYN queue (at the kernel)
+
+# When the SYN queue is full, either TcpExtTCPReqQFullDoCookies or
+# TcpExtTCPReqQFullDrop is incremented, depending on whether SYN cookies are
+# enabled or not. In both cases this probably indicates a SYN flood attack,
+# so a notification should be sent.
+
+    alarm: 1m_tcp_syn_queue_drops
+       on: ip.tcp_syn_queue
+    class: Workload
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: average -60s unaligned absolute of TCPReqQFullDrop
+    units: drops
+    every: 10s
+     warn: $this > 1
+     crit: $this > (($status == $CRITICAL) ? (0) : (5))
+    delay: up 10 down 5m multiplier 1.5 max 1h
+     info: average number of SYN requests dropped due to the full TCP SYN queue over the last minute \
+           (SYN cookies were not enabled)
+       to: sysadmin
+
+    alarm: 1m_tcp_syn_queue_cookies
+       on: ip.tcp_syn_queue
+    class: Workload
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: average -60s unaligned absolute of TCPReqQFullDoCookies
+    units: cookies
+    every: 10s
+     warn: $this > 1
+     crit: $this > (($status == $CRITICAL) ? (0) : (5))
+    delay: up 10 down 5m multiplier 1.5 max 1h
+     info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
+       to: sysadmin
+
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
new file mode 100644
index 0000000..318be20
--- /dev/null
+++ b/health/health.d/tcp_mem.conf
@@ -0,0 +1,23 @@
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# We give a warning when TCP is under memory pressure
+# and a critical when TCP is 90% of its upper memory limit
+#
+
+    alarm: tcp_memory
+       on: ipv4.sockstat_tcp_mem
+    class: Utilization
+     type: System
+component: Network
+       os: linux
+    hosts: *
+     calc: ${mem} * 100 / ${tcp_mem_high}
+    units: %
+    every: 10s
+     warn: ${mem} > (($status >= $WARNING  ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure}   ))
+     crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: TCP memory utilization
+       to: sysadmin
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
new file mode 100644
index 0000000..cbd628d
--- /dev/null
+++ b/health/health.d/tcp_orphans.conf
@@ -0,0 +1,24 @@
+
+#
+# check
+# http://blog.tsunanet.net/2011/03/out-of-socket-memory.html
+#
+# The kernel may penalize orphans by 2x or even 4x,
+# so we raise a warning at 25% and a critical alarm at 50%
+#
+
+    alarm: tcp_orphans
+       on: ipv4.sockstat_tcp_sockets
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+     calc: ${orphan} * 100 / ${tcp_max_orphans}
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
+     crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
+    delay: up 0 down 5m multiplier 1.5 max 1h
+     info: orphan IPv4 TCP sockets utilization
+       to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
new file mode 100644
index 0000000..ff116db
--- /dev/null
+++ b/health/health.d/tcp_resets.conf
@@ -0,0 +1,69 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+# tcp resets this host sends
+
+    alarm: 1m_ipv4_tcp_resets_sent
+       on: ipv4.tcphandshake
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: average -1m at -10s unaligned absolute of OutRsts
+    units: tcp resets/s
+    every: 10s
+     info: average number of sent TCP RESETS over the last minute
+
+    alarm: 10s_ipv4_tcp_resets_sent
+       on: ipv4.tcphandshake
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: average -10s unaligned absolute of OutRsts
+    units: tcp resets/s
+    every: 10s
+     warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (10)))
+    delay: up 20s down 60m multiplier 1.2 max 2h
+  options: no-clear-notification
+     info: average number of sent TCP RESETS over the last 10 seconds. \
+           This can indicate a port scan, \
+           or that a service running on this host has crashed. \
+           Netdata will not send a clear notification for this alarm.
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
+# tcp resets this host receives
+
+    alarm: 1m_ipv4_tcp_resets_received
+       on: ipv4.tcphandshake
+    class: Errors
+     type: System
+component: Network
+       os: linux freebsd
+    hosts: *
+   lookup: average -1m at -10s unaligned absolute of AttemptFails
+    units: tcp resets/s
+    every: 10s
+     info: average number of received TCP RESETS over the last minute
+
+    alarm: 10s_ipv4_tcp_resets_received
+       on: ipv4.tcphandshake
+    class: Errors
+     type: System
+component: Network
+       os: linux freebsd
+    hosts: *
+   lookup: average -10s unaligned absolute of AttemptFails
+    units: tcp resets/s
+    every: 10s
+     warn: $netdata.uptime.uptime > (1 * 60) AND $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
+    delay: up 20s down 60m multiplier 1.2 max 2h
+  options: no-clear-notification
+     info: average number of received TCP RESETS over the last 10 seconds. \
+           This can be an indication that a service this host needs has crashed. \
+           Netdata will not send a clear notification for this alarm.
+       to: sysadmin
diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf
new file mode 100644
index 0000000..2e9b1a3
--- /dev/null
+++ b/health/health.d/timex.conf
@@ -0,0 +1,17 @@
+
+# It can take several minutes before ntpd selects a server to synchronize with;
+# the alarm therefore only fires after 17 minutes (about 1024 seconds) of uptime.
+
+    alarm: system_clock_sync_state
+       on: system.clock_sync_state
+       os: linux
+    class: Errors
+     type: System
+component: Clock
+     calc: $state
+    units: synchronization state
+    every: 10s
+     warn: $system.uptime.uptime > 17 * 60 AND $this == 0
+    delay: down 5m
+     info: when set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server
+       to: silent
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
new file mode 100644
index 0000000..64f47df
--- /dev/null
+++ b/health/health.d/udp_errors.conf
@@ -0,0 +1,38 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------------------------------------
+# UDP receive buffer errors
+
+    alarm: 1m_ipv4_udp_receive_buffer_errors
+       on: ipv4.udperrors
+    class: Errors
+     type: System
+component: Network
+       os: linux freebsd
+    hosts: *
+   lookup: average -1m unaligned absolute of RcvbufErrors
+    units: errors
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+     info: average number of UDP receive buffer errors over the last minute
+    delay: up 1m down 60m multiplier 1.2 max 2h
+       to: sysadmin
+
+# -----------------------------------------------------------------------------
+# UDP send buffer errors
+
+    alarm: 1m_ipv4_udp_send_buffer_errors
+       on: ipv4.udperrors
+    class: Errors
+     type: System
+component: Network
+       os: linux
+    hosts: *
+   lookup: average -1m unaligned absolute of SndbufErrors
+    units: errors
+    every: 10s
+     warn: $this > (($status >= $WARNING) ? (0) : (10))
+     info: average number of UDP send buffer errors over the last minute
+    delay: up 1m down 60m multiplier 1.2 max 2h
+       to: sysadmin
diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf
new file mode 100644
index 0000000..4e8d164
--- /dev/null
+++ b/health/health.d/unbound.conf
@@ -0,0 +1,28 @@
+
+# make sure there are no overwritten/dropped queries in the request-list
+
+ template: unbound_request_list_overwritten
+       on: unbound.request_list_jostle_list
+    class: Errors
+     type: DNS
+component: Unbound
+   lookup: average -60s unaligned absolute match-names of overwritten
+    units: queries
+    every: 10s
+     warn: $this > 5
+    delay: up 10 down 5m multiplier 1.5 max 1h
+     info: number of overwritten queries in the request-list
+       to: sysadmin
+
+ template: unbound_request_list_dropped
+       on: unbound.request_list_jostle_list
+    class: Errors
+     type: DNS
+component: Unbound
+   lookup: average -60s unaligned absolute match-names of dropped
+    units: queries
+    every: 10s
+     warn: $this > 0
+    delay: up 10 down 5m multiplier 1.5 max 1h
+     info: number of dropped queries in the request-list
+       to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
new file mode 100644
index 0000000..a9cc7ce
--- /dev/null
+++ b/health/health.d/vcsa.conf
@@ -0,0 +1,141 @@
+
+# Overall system health:
+#  - 0: all components are healthy.
+#  - 1: one or more components might become overloaded soon.
+#  - 2: one or more components in the appliance might be degraded.
+#  - 3: one or more components might be in an unusable status and the appliance might become unresponsive soon.
+#  - 4: no health data is available.
+
+ template: vcsa_system_health
+       on: vcsa.system_health
+    class: Errors
+     type: Virtual Machine
+component: VMware vCenter
+   lookup: max -10s unaligned of system
+    units: status
+    every: 10s
+     warn: ($this == 1) || ($this == 2)
+     crit: $this == 3
+    delay: down 1m multiplier 1.5 max 1h
+     info: overall system health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin
+
+# Components health:
+#  - 0: healthy.
+#  - 1: healthy, but may have some problems.
+#  - 2: degraded, and may have serious problems.
+#  - 3: unavailable, or will stop functioning soon.
+#  - 4: no health data is available.
+
+ template: vcsa_swap_health
+       on: vcsa.components_health
+    class: Errors
+     type: Virtual Machine
+component: VMware vCenter
+   lookup: max -10s unaligned of swap
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: swap health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin
+
+ template: vcsa_storage_health
+       on: vcsa.components_health
+    class: Errors
+     type: Virtual Machine
+component: VMware vCenter
+   lookup: max -10s unaligned of storage
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: storage health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin
+
+ template: vcsa_mem_health
+       on: vcsa.components_health
+    class: Errors
+     type: Virtual Machine
+component: VMware vCenter
+   lookup: max -10s unaligned of mem
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: memory health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin
+
+ template: vcsa_load_health
+       on: vcsa.components_health
+    class: Utilization
+     type: Virtual Machine
+component: VMware vCenter
+   lookup: max -10s unaligned of load
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: load health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin
+
+ template: vcsa_database_storage_health
+       on: vcsa.components_health
+    class: Errors
+     type: Virtual Machine
+component: VMware vCenter
+   lookup: max -10s unaligned of database_storage
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: database storage health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin
+
+ template: vcsa_applmgmt_health
+       on: vcsa.components_health
+    class: Errors
+     type: Virtual Machine
+component: VMware vCenter
+   lookup: max -10s unaligned of applmgmt
+    units: status
+    every: 10s
+     warn: $this == 1
+     crit: ($this == 2) || ($this == 3)
+    delay: down 1m multiplier 1.5 max 1h
+     info: applmgmt health status \
+           (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
+       to: sysadmin
+
+
+# Software updates health:
+#  - 0: no updates available.
+#  - 2: non-security updates are available.
+#  - 3: security updates are available.
+#  - 4: an error retrieving information on software updates.
+
+ template: vcsa_software_updates_health
+       on: vcsa.software_updates_health
+    class: Errors
+     type: Virtual Machine
+component: VMware vCenter
+   lookup: max -10s unaligned of software_packages
+    units: status
+    every: 10s
+     warn: $this == 4
+     crit: $this == 3
+    delay: down 1m multiplier 1.5 max 1h
+     info: software updates availability status \
+           (-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
+       to: sysadmin
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf
new file mode 100644
index 0000000..cfbe2a5
--- /dev/null
+++ b/health/health.d/vernemq.conf
@@ -0,0 +1,365 @@
+
+# Socket errors
+
+ template: vernemq_socket_errors
+       on: vernemq.socket_errors
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: sum -1m unaligned absolute of socket_error
+    units: errors
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of socket errors in the last minute
+       to: sysadmin
+
+# Queues dropped/expired/unhandled PUBLISH messages
+
+ template: vernemq_queue_message_drop
+       on: vernemq.queue_undelivered_messages
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute of queue_message_drop
+    units: dropped messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of dropped messages due to full queues in the last minute
+       to: sysadmin
+
+ template: vernemq_queue_message_expired
+       on: vernemq.queue_undelivered_messages
+    class: Latency
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute of queue_message_expired
+    units: expired messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of messages which expired before delivery in the last minute
+       to: sysadmin
+
+ template: vernemq_queue_message_unhandled
+       on: vernemq.queue_undelivered_messages
+    class: Latency
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute of queue_message_unhandled
+    units: unhandled messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of unhandled messages (connections with clean session=true) in the last minute
+       to: sysadmin
+
+# Erlang VM
+
+ template: vernemq_average_scheduler_utilization
+       on: vernemq.average_scheduler_utilization
+    class: Utilization
+     type: Messaging
+component: VerneMQ
+   lookup: average -10m unaligned
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average scheduler utilization over the last 10 minutes
+       to: sysadmin
+
+# Cluster communication and netsplits
+
+ template: vernemq_cluster_dropped
+       on: vernemq.cluster_dropped
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: sum -1m unaligned
+    units: KiB
+    every: 1m
+     warn: $this > 0
+    delay: up 5m down 5m multiplier 1.5 max 1h
+     info: amount of traffic dropped during communication with the cluster nodes in the last minute
+       to: sysadmin
+
+ template: vernemq_netsplits
+       on: vernemq.netsplits
+    class: Workload
+     type: Messaging
+component: VerneMQ
+   lookup: sum -1m unaligned absolute of netsplit_detected
+    units: netsplits
+    every: 10s
+     warn: $this > 0
+    delay: down 5m multiplier 1.5 max 2h
+     info: number of detected netsplits (split brain situation) in the last minute
+       to: sysadmin
+
+# Unsuccessful CONNACK
+
+ template: vernemq_mqtt_connack_sent_reason_unsuccessful
+       on: vernemq.mqtt_connack_sent_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute
+       to: sysadmin
+
+# Not normal DISCONNECT
+
+ template: vernemq_mqtt_disconnect_received_reason_not_normal
+       on: vernemq.mqtt_disconnect_received_reason
+    class: Workload
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of received not normal v5 DISCONNECT packets in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_disconnect_sent_reason_not_normal
+       on: vernemq.mqtt_disconnect_sent_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !normal_disconnect,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of sent not normal v5 DISCONNECT packets in the last minute
+       to: sysadmin
+
+# SUBSCRIBE errors and unauthorized attempts
+
+ template: vernemq_mqtt_subscribe_error
+       on: vernemq.mqtt_subscribe_error
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute
+    units: failed ops
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of failed v3/v5 SUBSCRIBE operations in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_subscribe_auth_error
+       on: vernemq.mqtt_subscribe_auth_error
+    class: Workload
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute
+    units: attempts
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute
+       to: sysadmin
+
+# UNSUBSCRIBE errors
+
+ template: vernemq_mqtt_unsubscribe_error
+       on: vernemq.mqtt_unsubscribe_error
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute
+    units: failed ops
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute
+       to: sysadmin
+
+# PUBLISH errors and unauthorized attempts
+
+ template: vernemq_mqtt_publish_errors
+       on: vernemq.mqtt_publish_errors
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute
+    units: failed ops
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of failed v3/v5 PUBLISH operations in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_publish_auth_errors
+       on: vernemq.mqtt_publish_auth_errors
+    class: Workload
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute
+    units: attempts
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of unauthorized v3/v5 PUBLISH attempts in the last minute
+       to: sysadmin
+
+# Unsuccessful and unexpected PUBACK
+
+ template: vernemq_mqtt_puback_received_reason_unsuccessful
+       on: vernemq.mqtt_puback_received_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of received unsuccessful v5 PUBACK packets in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_puback_sent_reason_unsuccessful
+       on: vernemq.mqtt_puback_sent_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v5 PUBACK packets in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_puback_unexpected
+       on: vernemq.mqtt_puback_invalid_error
+    class: Workload
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute
+    units: messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of received unexpected v3/v5 PUBACK packets in the last minute
+       to: sysadmin
+
+# Unsuccessful and unexpected PUBREC
+
+ template: vernemq_mqtt_pubrec_received_reason_unsuccessful
+       on: vernemq.mqtt_pubrec_received_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of received unsuccessful v5 PUBREC packets in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
+       on: vernemq.mqtt_pubrec_sent_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v5 PUBREC packets in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_pubrec_invalid_error
+       on: vernemq.mqtt_pubrec_invalid_error
+    class: Workload
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute
+    units: messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of received unexpected v3 PUBREC packets in the last minute
+       to: sysadmin
+
+# Unsuccessful PUBREL
+
+ template: vernemq_mqtt_pubrel_received_reason_unsuccessful
+       on: vernemq.mqtt_pubrel_received_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of received unsuccessful v5 PUBREL packets in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
+       on: vernemq.mqtt_pubrel_sent_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v5 PUBREL packets in the last minute
+       to: sysadmin
+
+# Unsuccessful and unexpected PUBCOMP
+
+ template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
+       on: vernemq.mqtt_pubcomp_received_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of received unsuccessful v5 PUBCOMP packets in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
+       on: vernemq.mqtt_pubcomp_sent_reason
+    class: Errors
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute match-names of !success,*
+    units: packets
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of sent unsuccessful v5 PUBCOMP packets in the last minute
+       to: sysadmin
+
+ template: vernemq_mqtt_pubcomp_unexpected
+       on: vernemq.mqtt_pubcomp_invalid_error
+    class: Workload
+     type: Messaging
+component: VerneMQ
+   lookup: average -1m unaligned absolute
+    units: messages
+    every: 1m
+     warn: $this > (($status >= $WARNING) ? (0) : (5))
+    delay: up 2m down 5m multiplier 1.5 max 2h
+     info: number of received unexpected v3/v5 PUBCOMP packets in the last minute
+       to: sysadmin
diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf
new file mode 100644
index 0000000..d8fc899
--- /dev/null
+++ b/health/health.d/vsphere.conf
@@ -0,0 +1,174 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+# -----------------------------------------------VM Specific------------------------------------------------------------
+# Memory
+
+ template: vsphere_vm_mem_usage
+       on: vsphere.vm_mem_usage_percentage
+    class: Utilization
+     type: Virtual Machine
+component: Memory
+    hosts: *
+     calc: $used
+    units: %
+    every: 20s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: virtual machine memory utilization
+
+# -----------------------------------------------HOST Specific----------------------------------------------------------
+# Memory
+
+ template: vsphere_host_mem_usage
+       on: vsphere.host_mem_usage_percentage
+    class: Utilization
+     type: Virtual Machine
+component: Memory
+    hosts: *
+     calc: $used
+    units: %
+    every: 20s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: host memory utilization
+
+# Network errors
+
+ template: vsphere_inbound_packets_errors
+       on: vsphere.net_errors_total
+    class: Errors
+     type: Virtual Machine
+component: Network
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of rx
+    units: packets
+    every: 1m
+     info: number of inbound errors for the network interface in the last 10 minutes
+
+ template: vsphere_outbound_packets_errors
+       on: vsphere.net_errors_total
+    class: Errors
+     type: Virtual Machine
+component: Network
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of tx
+    units: packets
+    every: 1m
+     info: number of outbound errors for the network interface in the last 10 minutes
+
+# Network errors ratio
+
+ template: vsphere_inbound_packets_errors_ratio
+       on: vsphere.net_packets_total
+    class: Errors
+     type: Virtual Machine
+component: Network
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of rx
+     calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of inbound errors for the network interface over the last 10 minutes
+       to: sysadmin
+
+ template: vsphere_outbound_packets_errors_ratio
+       on: vsphere.net_packets_total
+    class: Errors
+     type: Virtual Machine
+component: Network
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of tx
+     calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of outbound errors for the network interface over the last 10 minutes
+       to: sysadmin
+
+# -----------------------------------------------Common-------------------------------------------------------------------
+# CPU
+
+ template: vsphere_cpu_usage
+       on: vsphere.cpu_usage_total
+    class: Utilization
+     type: Virtual Machine
+component: CPU
+    hosts: *
+   lookup: average -10m unaligned match-names of used
+    units: %
+    every: 20s
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU utilization
+       to: sysadmin
+
+# Network drops
+
+ template: vsphere_inbound_packets_dropped
+       on: vsphere.net_drops_total
+    class: Errors
+     type: Virtual Machine
+component: Network
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of rx
+    units: packets
+    every: 1m
+     info: number of inbound dropped packets for the network interface in the last 10 minutes
+
+ template: vsphere_outbound_packets_dropped
+       on: vsphere.net_drops_total
+    class: Errors
+     type: Virtual Machine
+component: Network
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of tx
+    units: packets
+    every: 1m
+     info: number of outbound dropped packets for the network interface in the last 10 minutes
+
+# Network drops ratio
+
+ template: vsphere_inbound_packets_dropped_ratio
+       on: vsphere.net_packets_total
+    class: Errors
+     type: Virtual Machine
+component: Network
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of rx
+     calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of inbound dropped packets for the network interface over the last 10 minutes
+       to: sysadmin
+
+ template: vsphere_outbound_packets_dropped_ratio
+       on: vsphere.net_packets_total
+    class: Errors
+     type: Virtual Machine
+component: Network
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of tx
+     calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
+    units: %
+    every: 1m
+     warn: $this >= 2
+    delay: up 1m down 1h multiplier 1.5 max 2h
+     info: ratio of outbound dropped packets for the network interface over the last 10 minutes
+       to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
new file mode 100644
index 0000000..c33c466
--- /dev/null
+++ b/health/health.d/web_log.conf
@@ -0,0 +1,210 @@
+
+# unmatched lines
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $web_log_1m_total_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+ template: web_log_1m_total_requests
+       on: web_log.requests
+    class: Workload
+     type: Web Server
+component: Web log
+ families: *
+   lookup: sum -1m unaligned
+     calc: ($this == 0)?(1):($this)
+    units: requests
+    every: 10s
+     info: number of HTTP requests in the last minute
+
+ template: web_log_1m_unmatched
+       on: web_log.excluded_requests
+    class: Errors
+     type: Web Server
+component: Web log
+ families: *
+   lookup: sum -1m unaligned of unmatched
+     calc: $this * 100 / $web_log_1m_total_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
+    delay: up 1m down 5m multiplier 1.5 max 1h
+     info: percentage of unparsed log lines over the last minute
+       to: webmaster
+
+# -----------------------------------------------------------------------------
+# high level response code alarms
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $web_log_1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+ template: web_log_1m_requests
+       on: web_log.type_requests
+    class: Workload
+     type: Web Server
+component: Web log
+ families: *
+   lookup: sum -1m unaligned
+     calc: ($this == 0)?(1):($this)
+    units: requests
+    every: 10s
+     info: number of HTTP requests in the last minute
+
+ template: web_log_1m_successful
+       on: web_log.type_requests
+    class: Workload
+     type: Web Server
+component: Web log
+ families: *
+   lookup: sum -1m unaligned of success
+     calc: $this * 100 / $web_log_1m_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
+     crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
+       to: webmaster
+
+ template: web_log_1m_redirects
+       on: web_log.type_requests
+    class: Workload
+     type: Web Server
+component: Web log
+ families: *
+   lookup: sum -1m unaligned of redirect
+     calc: $this * 100 / $web_log_1m_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of redirection HTTP requests over the last minute (3xx except 304)
+       to: webmaster
+
+ template: web_log_1m_bad_requests
+       on: web_log.type_requests
+    class: Errors
+     type: Web Server
+component: Web log
+ families: *
+   lookup: sum -1m unaligned of bad
+     calc: $this * 100 / $web_log_1m_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of client error HTTP requests over the last minute (4xx except 401)
+       to: webmaster
+
+ template: web_log_1m_internal_errors
+       on: web_log.type_requests
+    class: Errors
+     type: Web Server
+component: Web log
+ families: *
+   lookup: sum -1m unaligned of error
+     calc: $this * 100 / $web_log_1m_requests
+    units: %
+    every: 10s
+     warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
+     crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
+    delay: up 2m down 15m multiplier 1.5 max 1h
+     info: ratio of server error HTTP requests over the last minute (5xx)
+       to: webmaster
+
+# -----------------------------------------------------------------------------
+# web slow
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $web_log_1m_requests > 120
+#
+# i.e. when there are at least 120 requests during the last minute
+
+ template: web_log_10m_response_time
+       on: web_log.request_processing_time
+    class: Latency
+     type: System
+component: Web log
+ families: *
+   lookup: average -10m unaligned of avg
+    units: ms
+    every: 30s
+     info: average HTTP response time over the last 10 minutes
+
+ template: web_log_web_slow
+       on: web_log.request_processing_time
+    class: Latency
+     type: Web Server
+component: Web log
+ families: *
+   lookup: average -1m unaligned of avg
+    units: ms
+    every: 10s
+    green: 500
+      red: 1000
+     warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
+     crit: ($web_log_1m_requests > 120) ? ($this > $red   && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
+    delay: down 15m multiplier 1.5 max 1h
+     info: average HTTP response time over the last 1 minute
+  options: no-clear-notification
+       to: webmaster
+
+# -----------------------------------------------------------------------------
+# web too many or too few requests
+
+# the following alarms trigger only when there are enough data.
+# we assume there are enough data when:
+#
+#  $web_log_5m_successful_old > 120
+#
+# i.e. when successful requests averaged more than 120 requests/s during the
+#      5 minutes starting at -10m and ending at -5m
+
+ template: web_log_5m_successful_old
+       on: web_log.type_requests
+    class: Workload
+     type: Web Server
+component: Web log
+ families: *
+   lookup: average -5m at -5m unaligned of success
+    units: requests/s
+    every: 30s
+     info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
+
+ template: web_log_5m_successful
+       on: web_log.type_requests
+    class: Workload
+     type: Web Server
+component: Web log
+ families: *
+   lookup: average -5m unaligned of success
+    units: requests/s
+    every: 30s
+     info: average number of successful HTTP requests over the last 5 minutes
+
+ template: web_log_5m_requests_ratio
+       on: web_log.type_requests
+    class: Workload
+     type: Web Server
+component: Web log
+ families: *
+     calc: ($web_log_5m_successful_old > 0)?($web_log_5m_successful * 100 / $web_log_5m_successful_old):(100)
+    units: %
+    every: 30s
+     warn: ($web_log_5m_successful_old > 120) ? ($this > 200 OR $this < 50) : (0)
+     crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
+    delay: down 15m multiplier 1.5 max 1h
+  options: no-clear-notification
+     info: ratio of successful HTTP requests over the last 5 minutes, \
+           compared with the previous 5 minutes \
+           (clear notification for this alarm will not be sent)
+       to: webmaster
diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf
new file mode 100644
index 0000000..be5eb58
--- /dev/null
+++ b/health/health.d/whoisquery.conf
@@ -0,0 +1,13 @@
+
+ template: whoisquery_days_until_expiration
+       on: whoisquery.time_until_expiration
+    class: Utilization
+     type: Other
+component: WHOIS
+     calc: $expiry
+    units: seconds
+    every: 60s
+     warn: $this < $days_until_expiration_warning*24*60*60
+     crit: $this < $days_until_expiration_critical*24*60*60
+     info: time until the domain name registration expires
+       to: webmaster
diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf
new file mode 100644
index 0000000..90d39ce
--- /dev/null
+++ b/health/health.d/wmi.conf
@@ -0,0 +1,139 @@
+
+## CPU
+# 10-minute average over the dpc, user, privileged and interrupt dimensions; warn raises above 85 and holds until 75, crit raises above 95 and holds until 85. NOTE(review): `os: linux` presumably refers to the OS of the collecting agent - confirm.
+ template: wmi_10min_cpu_usage
+       on: wmi.cpu_utilization_total
+    class: Utilization
+     type: Windows
+component: CPU
+       os: linux
+    hosts: *
+   lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
+    units: %
+    every: 1m
+     warn: $this > (($status >= $WARNING)  ? (75) : (85))
+     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+    delay: down 15m multiplier 1.5 max 1h
+     info: average CPU utilization over the last 10 minutes
+       to: sysadmin
+
+
+## Memory
+# Used memory as a percentage of used + available; warn raises above 90 and holds until 80, crit raises above 98 and holds until 90.
+ template: wmi_ram_in_use
+       on: wmi.memory_utilization
+    class: Utilization
+     type: Windows
+component: Memory
+       os: linux
+    hosts: *
+     calc: ($used) * 100 / ($used + $available)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: memory utilization
+       to: sysadmin
+# Used swap as a percentage of used + available; warn raises above 90 and holds until 80, crit raises above 98 and holds until 90.
+ template: wmi_swap_in_use
+       on: wmi.memory_swap_utilization
+    class: Utilization
+     type: Windows
+component: Memory
+       os: linux
+    hosts: *
+     calc: ($used) * 100 / ($used + $available)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: swap memory utilization
+       to: sysadmin
+
+
+## Network
+# 10-minute sum (absolute values) of the `inbound` dimension of wmi.net_discarded; warns at 5 or more packets. No crit level is defined.
+ template: wmi_inbound_packets_discarded
+       on: wmi.net_discarded
+    class: Errors
+     type: Windows
+component: Network
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of inbound
+    units: packets
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of inbound discarded packets for the network interface in the last 10 minutes
+       to: sysadmin
+# 10-minute sum (absolute values) of the `outbound` dimension of wmi.net_discarded; warns at 5 or more packets. No crit level is defined.
+ template: wmi_outbound_packets_discarded
+       on: wmi.net_discarded
+    class: Errors
+     type: Windows
+component: Network
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of outbound
+    units: packets
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of outbound discarded packets for the network interface in the last 10 minutes
+       to: sysadmin
+# 10-minute sum (absolute values) of the `inbound` dimension of wmi.net_errors; warns at 5 or more. No crit level is defined.
+ template: wmi_inbound_packets_errors
+       on: wmi.net_errors
+    class: Errors
+     type: Windows
+component: Network
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of inbound
+    units: packets
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of inbound errors for the network interface in the last 10 minutes
+       to: sysadmin
+# 10-minute sum (absolute values) of the `outbound` dimension of wmi.net_errors; warns at 5 or more. No crit level is defined.
+ template: wmi_outbound_packets_errors
+       on: wmi.net_errors
+    class: Errors
+     type: Windows
+component: Network
+       os: linux
+    hosts: *
+ families: *
+   lookup: sum -10m unaligned absolute match-names of outbound
+    units: packets
+    every: 1m
+     warn: $this >= 5
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of outbound errors for the network interface in the last 10 minutes
+       to: sysadmin
+
+
+## Disk
+# Used disk space as a percentage of used + free; warn raises above 90 and holds until 80, crit raises above 98 and holds until 90.
+ template: wmi_disk_in_use
+       on: wmi.logical_disk_utilization
+    class: Utilization
+     type: Windows
+component: Disk
+       os: linux
+    hosts: *
+     calc: ($used) * 100 / ($used + $free)
+    units: %
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (80) : (90))
+     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+    delay: down 15m multiplier 1.5 max 1h
+     info: disk space utilization
+       to: sysadmin
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
new file mode 100644
index 0000000..fc69d02
--- /dev/null
+++ b/health/health.d/x509check.conf
@@ -0,0 +1,24 @@
+# Compares $expiry (seconds) with the day-based warning/critical thresholds converted to seconds; NOTE(review): the $days_until_expiration_* variables are presumably provided by the collector - confirm.
+ template: x509check_days_until_expiration
+       on: x509check.time_until_expiration
+    class: Latency
+     type: Certificates
+component: x509 certificates
+     calc: $expiry
+    units: seconds
+    every: 60s
+     warn: $this < $days_until_expiration_warning*24*60*60
+     crit: $this < $days_until_expiration_critical*24*60*60
+     info: time until x509 certificate expires
+       to: webmaster
+# Goes critical whenever the `revoked` dimension holds a number other than 0; a nan value does not raise the alarm.
+ template: x509check_revocation_status
+       on: x509check.revocation_status
+    class: Errors
+     type: Certificates
+component: x509 certificates
+     calc: $revoked
+    every: 60s
+     crit: $this != nan AND $this != 0
+     info: x509 certificate revocation status (0: valid, 1: revoked)
+       to: webmaster
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
new file mode 100644
index 0000000..785838d
--- /dev/null
+++ b/health/health.d/zfs.conf
@@ -0,0 +1,41 @@
+# Warns when the `throttled` dimension of zfs.memory_ops sums (absolute values) to more than 0 over the last 10 minutes.
+    alarm: zfs_memory_throttle
+       on: zfs.memory_ops
+    class: Utilization
+     type: System
+component: File system
+   lookup: sum -10m unaligned absolute of throttled
+    units: events
+    every: 1m
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 2h
+     info: number of times ZFS had to limit the ARC growth in the last 10 minutes
+       to: sysadmin
+
+# ZFS pool state
+# Warns when the `degraded` dimension of zfspool.state is above 0; checked every 10 seconds.
+ template: zfs_pool_state_warn
+       on: zfspool.state
+    class: Errors
+     type: System
+component: File system
+     calc: $degraded
+    units: boolean
+    every: 10s
+     warn: $this > 0
+    delay: down 1m multiplier 1.5 max 1h
+     info: ZFS pool $family state is degraded
+       to: sysadmin
+# Goes critical when the sum of the `faulted` and `unavail` dimensions of zfspool.state is above 0; checked every 10 seconds.
+ template: zfs_pool_state_crit
+       on: zfspool.state
+    class: Errors
+     type: System
+component: File system
+     calc: $faulted + $unavail
+    units: boolean
+    every: 10s
+     crit: $this > 0
+    delay: down 1m multiplier 1.5 max 1h
+     info: ZFS pool $family state is faulted or unavail
+       to: sysadmin