Merging upstream version 1.30.0.

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2021-03-31 12:59:21 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2021-03-31 12:59:21 +0000
commit: bb8713bbc1c4594366fc735c04910edbf4c61aab (patch)
tree: d7da56c0b89aa371dd8ad986995dd145fdf6670a /health
parent: Releasing debian version 1.29.3-4. (diff)
download: netdata-bb8713bbc1c4594366fc735c04910edbf4c61aab.tar.xz
netdata-bb8713bbc1c4594366fc735c04910edbf4c61aab.zip
77 files changed, 682 insertions, 807 deletions
diff --git a/health/Makefile.am b/health/Makefile.am
index 399d6df5..0802dc75 100644
--- a/health/Makefile.am
+++ b/health/Makefile.am
@@ -29,7 +29,6 @@ dist_healthconfig_DATA = \
     health.d/anomalies.conf \
     health.d/apache.conf \
     health.d/apcupsd.conf \
-    health.d/apps_plugin.conf \
     health.d/backend.conf \
     health.d/bcache.conf \
     health.d/beanstalkd.conf \
diff --git a/health/health.c b/health/health.c
index b81361e8..0793100a 100644
--- a/health/health.c
+++ b/health/health.c
@@ -966,12 +966,14 @@ void *health_main(void *ptr) {
                         } else if(unlikely(rc->status == RRDCALC_STATUS_CLEAR)) {
                             if(!(rc->rrdcalc_flags & RRDCALC_FLAG_RUN_ONCE)) {
                                 if(rc->old_status == RRDCALC_STATUS_CRITICAL) {
-                                    repeat_every = rc->crit_repeat_every;
+                                    repeat_every = 1;
                                 } else if (rc->old_status == RRDCALC_STATUS_WARNING) {
-                                    repeat_every = rc->warn_repeat_every;
+                                    repeat_every = 1;
                                 }
                             }
                         }
+                    } else {
+                        continue;
                     }
 
                     if(unlikely(repeat_every > 0 && (rc->last_repeat + repeat_every) <= now)) {
diff --git a/health/health.d/adaptec_raid.conf b/health/health.d/adaptec_raid.conf
index a1301ce8..0753c6e5 100644
--- a/health/health.d/adaptec_raid.conf
+++ b/health/health.d/adaptec_raid.conf
@@ -1,24 +1,24 @@
 
 # logical device status check
 
-template: adapter_raid_ld_status
-      on: adapter_raid.ld_status
-  lookup: max -5s
+template: adaptec_raid_ld_status
+      on: adaptec_raid.ld_status
+  lookup: max -10s foreach *
    units: bool
    every: 10s
     crit: $this > 0
    delay: down 5m multiplier 1.5 max 1h
-    info: at least 1 logical device is failed or degraded
+    info: logical device status is failed or degraded
       to: sysadmin
 
 # physical device state check
 
-template: adapter_raid_pd_state
-      on: adapter_raid.pd_state
-  lookup: max -5s
+template: adaptec_raid_pd_state
+      on: adaptec_raid.pd_state
+  lookup: max -10s foreach *
    units: bool
    every: 10s
     crit: $this > 0
    delay: down 5m multiplier 1.5 max 1h
-    info: at least 1 physical device is not in online state
+    info: physical device state is not online
       to: sysadmin
diff --git a/health/health.d/anomalies.conf b/health/health.d/anomalies.conf
index a2d248ef..c4c96eaf 100644
--- a/health/health.d/anomalies.conf
+++ b/health/health.d/anomalies.conf
@@ -1,17 +1,17 @@
 # raise a warning alarm if an anomaly probability is consistently above 50%
 
-template: anomaly_probabilities
+template: anomalies_anomaly_probabilities
       on: anomalies.probability
   lookup: average -2m foreach *
    every: 1m
     warn: $this > 50
-    info: average anomaly probability > 50% for last 2 minutes
+    info: average anomaly probability over the last 2 minutes
 
 # raise a warning alarm if an anomaly flag is consistently firing
 
-template: anomaly_flags
+template: anomalies_anomaly_flags
       on: anomalies.anomaly
   lookup: sum -2m foreach *
    every: 1m
     warn: $this > 10
-    info: count of anomalies > 10 for last 2 minutes
+    info: number of anomalies in the last 2 minutes
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 4f86037b..12384fac 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -1,6 +1,6 @@
 # you can disable an alarm notification by setting the 'to' line to: silent
 
-template: 10min_ups_load
+template: apcupsd_10min_ups_load
       on: apcupsd.load
       os: *
    hosts: *
@@ -10,12 +10,12 @@ template: 10min_ups_load
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 10m multiplier 1.5 max 1h
-    info: average UPS load for the last 10 minutes
+    info: average UPS load over the last 10 minutes
       to: sitemgr
 
 # Discussion in https://github.com/netdata/netdata/pull/3928:
 # Fire the alarm as soon as it's going on battery (99% charge) and clear only when full.
-template: ups_charge
+template: apcupsd_ups_charge
       on: apcupsd.charge
       os: *
    hosts: *
@@ -25,7 +25,7 @@ template: ups_charge
     warn: $this < 100
     crit: $this < (($status == $CRITICAL) ? (60) : (50))
    delay: down 10m multiplier 1.5 max 1h
-    info: current UPS charge, averaged over the last 60 seconds to reduce measurement errors
+    info: average UPS charge over the last minute
       to: sitemgr
 
 template: apcupsd_last_collected_secs
diff --git a/health/health.d/apps_plugin.conf b/health/health.d/apps_plugin.conf
deleted file mode 100644
index 9a27bc6b..00000000
--- a/health/health.d/apps_plugin.conf
+++ /dev/null
@@ -1,15 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
-#  disabled due to https://github.com/netdata/netdata/issues/10327
-#
-#   alarm: used_file_descriptors
-#      on: apps.files
-#   hosts: *
-#    calc: $fdperc
-#   units: %
-#   every: 5s
-#    warn: $this > (($status >= $WARNING)  ? (75) : (80))
-#    crit: $this > (($status == $CRITICAL) ? (85) : (90))
-#   delay: down 5m multiplier 1.5 max 1h
-#    info: Peak percentage of file descriptors used
-#      to: sysadmin
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf
index e51b8aa5..8089dc94 100644
--- a/health/health.d/backend.conf
+++ b/health/health.d/backend.conf
@@ -6,7 +6,7 @@
    every: 1m
     warn: $this > 0
    delay: down 5m multiplier 1.5 max 1h
-    info: The backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
+    info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf.
       to: sysadmin
 
 # make sure we are sending data to backend
@@ -31,26 +31,3 @@
    delay: down 5m multiplier 1.5 max 1h
     info: percentage of metrics sent to the backend server
       to: dba
-
-   alarm: backend_metrics_lost
-      on: netdata.backend_metrics
-   units: metrics
-    calc: abs($lost)
-   every: 10s
-    crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of metrics lost due to repeating failures to contact the backend server
-      to: dba
-
-
-# this chart has been removed from netdata
-#   alarm: backend_slow
-#      on: netdata.backend_latency
-#   units: %
-#    calc: $latency * 100 / ($update_every * 1000)
-#   every: 10s
-#    warn: $this > 50
-#    crit: $this > 100
-#   delay: down 5m multiplier 1.5 max 1h
-#    info: the percentage of time between iterations needed by the backend time to process the data sent by netdata
-#      to: dba
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index f0da9ac5..d5fccf4f 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -1,13 +1,14 @@
 
 template: bcache_cache_errors
       on: disk.bcache_cache_read_races
-  lookup: sum -10m unaligned absolute
+  lookup: sum -1m unaligned absolute
    units: errors
    every: 1m
     warn: $this > 0
-    crit: $this > ( ($status >= $CRITICAL) ? (0) : (10) )
-   delay: down 1h multiplier 1.5 max 2h
-    info: the number of times bcache had issues using the cache, during the last 10 mins (this usually means your SSD cache is failing)
+   delay: up 2m down 1h multiplier 1.5 max 2h
+    info: number of times data was read from the cache, \
+          the bucket was reused and invalidated in the last minute \
+          (when this occurs the data is reread from the backing device)
       to: sysadmin
 
 template: bcache_cache_dirty
@@ -16,7 +17,8 @@ template: bcache_cache_dirty
    units: %
    every: 1m
     warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
-    crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+    crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
    delay: up 1m down 1h multiplier 1.5 max 2h
-    info: the percentage of cache space used for dirty and metadata (this usually means your SSD cache is too small)
+    info: percentage of cache space used for dirty data and metadata \
+          (this usually means your SSD cache is too small)
       to: sysadmin
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 30dc2732..0c428ecb 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -1,6 +1,6 @@
 # get the number of buried jobs in all queues
 
-template: server_buried_jobs
+template: beanstalk_server_buried_jobs
       on: beanstalk.current_jobs
     calc: $buried
    units: jobs
@@ -8,12 +8,14 @@ template: server_buried_jobs
     warn: $this > 0
     crit: $this > 10
    delay: up 0 down 5m multiplier 1.2 max 1h
-    info: the number of buried jobs aggregated across all tubes
+    info: number of buried jobs across all tubes. \
+          You need to manually kick them so they can be processed. \
+          Presence of buried jobs in a tube does not affect new jobs.
       to: sysadmin
       
 # get the number of buried jobs per queue
 
-#template: tube_buried_jobs
+#template: beanstalk_tube_buried_jobs
 #      on: beanstalk.jobs
 #    calc: $buried
 #   units: jobs
@@ -26,7 +28,7 @@ template: server_buried_jobs
 
 # get the current number of tubes
 
-#template: number_of_tubes
+#template: beanstalk_number_of_tubes
 #      on: beanstalk.current_tubes
 #    calc: $tubes
 #   every: 10s
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 4145e77c..5cc7a72f 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -1,9 +1,9 @@
- template: bind_rndc_stats_file_size
+template: bind_rndc_stats_file_size
       on: bind_rndc.stats_size
    units: megabytes
    every: 60
     calc: $stats_size
     warn: $this > 512
     crit: $this > 1024
-    info: Bind stats file is very large! Consider to create logrotate conf file for it!
+    info: BIND statistics-file size
       to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 43c588db..25b7f199 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -12,7 +12,7 @@ families: *
     warn: $this > 0
     crit: $this > 1
    delay: up 1m down 5m multiplier 1.5 max 1h
-    info: the total number of compute errors over the past 10 minutes
+    info: average number of compute errors over the last 10 minutes
       to: sysadmin
 
 # Warn on lots of upload errors
@@ -27,7 +27,7 @@ families: *
     warn: $this > 0
     crit: $this > 1
    delay: up 1m down 5m multiplier 1.5 max 1h
-    info: the average number of failed uploads over the past 10 minutes
+    info: average number of failed uploads over the last 10 minutes
       to: sysadmin
 
 # Warn on the task queue being empty
@@ -42,7 +42,7 @@ families: *
     warn: $this < 1
     crit: $this < 0.1
    delay: up 5m down 10m multiplier 1.5 max 1h
-    info: the total number of locally available tasks
+    info: average number of total tasks over the last 10 minutes
       to: sysadmin
 
 # Warn on no active tasks with a non-empty queue
@@ -58,5 +58,5 @@ families: *
     warn: $this < 1
     crit: $this < 0.1
    delay: up 5m down 10m multiplier 1.5 max 1h
-    info: the total number of active tasks
+    info: average number of active tasks over the last 10 minutes
       to: sysadmin
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index b27aa544..93ab8748 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -10,7 +10,7 @@ families: *
     warn: $this > (($status >= $WARNING)  ? (90) : (95))
     crit: $this > (($status == $CRITICAL) ? (95) : (98))
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of allocated BTRFS physical disk space
+    info: percentage of allocated BTRFS physical disk space
       to: sysadmin
 
 template: btrfs_data
@@ -24,7 +24,7 @@ families: *
     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of used BTRFS data space
+    info: utilization of BTRFS data space
       to: sysadmin
 
 template: btrfs_metadata
@@ -38,7 +38,7 @@ families: *
     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of used BTRFS metadata space
+    info: utilization of BTRFS metadata space
       to: sysadmin
 
 template: btrfs_system
@@ -52,6 +52,5 @@ families: *
     warn: $this > (($status >= $WARNING)  ? (90) : (95)) && $btrfs_allocated > 98
     crit: $this > (($status == $CRITICAL) ? (95) : (98)) && $btrfs_allocated > 98
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: the percentage of used BTRFS system space
+    info: utilization of BTRFS system space
       to: sysadmin
-
diff --git a/health/health.d/ceph.conf b/health/health.d/ceph.conf
index de16f7b6..cdbab0f6 100644
--- a/health/health.d/ceph.conf
+++ b/health/health.d/ceph.conf
@@ -1,13 +1,12 @@
 # low ceph disk available
 
-template: cluster_space_usage
+template: ceph_cluster_space_usage
       on: ceph.general_usage
-    calc: $avail * 100 / ($avail + $used)
+    calc: $used * 100 / ($used + $avail)
    units: %
-   every: 10s
-    warn: $this < 10
-    crit: $this < 1
+   every: 1m
+    warn: $this > (($status >= $WARNING ) ? (85) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 5m multiplier 1.2 max 1h
-    info: ceph disk usage is almost full
+    info: cluster disk space utilization
       to: sysadmin
-
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 79ece53f..c0a16f15 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -11,7 +11,7 @@ template: cgroup_10min_cpu_usage
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: cpu utilization for the last 10 minutes
+    info: average cgroup CPU utilization over the last 10 minutes
       to: sysadmin
 
 template: cgroup_ram_in_use
@@ -24,18 +24,5 @@ template: cgroup_ram_in_use
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: RAM used by cgroup
-      to: sysadmin
-
-template: cgroup_ram_and_swap_in_use
-      on: cgroup.mem_usage
-      os: linux
-   hosts: *
-    calc: ($ram + $swap) * 100 / $memory_and_swap_limit
-   units: %
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (80) : (90))
-    crit: $this > (($status == $CRITICAL) ? (90) : (98))
-   delay: down 15m multiplier 1.5 max 1h
-    info: RAM and Swap used by cgroup
+    info: cgroup memory utilization
       to: sysadmin
diff --git a/health/health.d/cockroachdb.conf b/health/health.d/cockroachdb.conf
index 8ab2c9d0..47773d04 100644
--- a/health/health.d/cockroachdb.conf
+++ b/health/health.d/cockroachdb.conf
@@ -22,7 +22,7 @@ template: cockroachdb_used_storage_capacity
     warn: $this > (($status >= $WARNING)  ? (80) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: entire disk usage percentage
+    info: storage capacity utilization
       to: dba
 
 template: cockroachdb_used_usable_storage_capacity
@@ -33,7 +33,7 @@ template: cockroachdb_used_usable_storage_capacity
     warn: $this > (($status >= $WARNING)  ? (80) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: usable space usage percentage
+    info: storage usable space utilization
       to: dba
 
 # Replication
@@ -67,7 +67,7 @@ template: cockroachdb_open_file_descriptors_limit
    every: 10s
     warn: $this > 80
    delay: down 15m multiplier 1.5 max 1h
-    info: open file descriptors usage percentage
+    info: open file descriptors utilization (against the soft limit)
       to: dba
 
 # SQL
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index fa818985..32c69f8f 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -11,7 +11,7 @@ template: 10min_cpu_usage
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: average cpu utilization for the last 10 minutes (excluding iowait, nice and steal)
+    info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
       to: sysadmin
 
 template: 10min_cpu_iowait
@@ -24,7 +24,7 @@ template: 10min_cpu_iowait
     warn: $this > (($status >= $WARNING)  ? (20) : (40))
     crit: $this > (($status == $CRITICAL) ? (40) : (50))
    delay: down 15m multiplier 1.5 max 1h
-    info: average CPU wait I/O for the last 10 minutes
+    info: average CPU iowait time over the last 10 minutes
       to: sysadmin
 
 template: 20min_steal_cpu
@@ -37,7 +37,7 @@ template: 20min_steal_cpu
     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
     crit: $this > (($status == $CRITICAL) ? (20) : (30))
    delay: down 1h multiplier 1.5 max 2h
-    info: average CPU steal time for the last 20 minutes
+    info: average CPU steal time over the last 20 minutes
       to: sysadmin
 
 ## FreeBSD
@@ -51,5 +51,5 @@ template: 10min_cpu_usage
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: average cpu utilization for the last 10 minutes (excluding nice)
+    info: average CPU utilization over the last 10 minutes (excluding nice)
       to: sysadmin
diff --git a/health/health.d/dbengine.conf b/health/health.d/dbengine.conf
index 274673e3..3e51d37e 100644
--- a/health/health.d/dbengine.conf
+++ b/health/health.d/dbengine.conf
@@ -10,7 +10,7 @@ lookup: sum -10m unaligned of fs_errors
  every: 10s
   crit: $this > 0
  delay: down 15m multiplier 1.5 max 1h
-  info: number of File-System errors dbengine came across the last 10 minutes (too many open files, wrong permissions etc)
+  info: number of filesystem errors in the last 10 minutes (too many open files, wrong permissions, etc.)
     to: sysadmin
 
  alarm: 10min_dbengine_global_io_errors
@@ -22,7 +22,7 @@ lookup: sum -10m unaligned of io_errors
  every: 10s
   crit: $this > 0
  delay: down 1h multiplier 1.5 max 3h
-  info: number of IO errors dbengine came across the last 10 minutes (CRC errors, out of space, bad disk etc)
+  info: number of IO errors in the last 10 minutes (CRC errors, out of space, bad disk, etc.)
     to: sysadmin
 
  alarm: 10min_dbengine_global_flushing_warnings
@@ -34,7 +34,8 @@ lookup: sum -10m unaligned of pg_cache_over_half_dirty_events
  every: 10s
   warn: $this > 0
  delay: down 1h multiplier 1.5 max 3h
-  info: number of times in the last 10 minutes that dbengine dirty pages were over 50% of the instance's page cache, metric data at risk of not being stored in the database, please reduce disk load or use faster disks
+  info: number of times when dbengine dirty pages were over 50% of the instance's page cache in the last 10 minutes. \
+        Metric data are at risk of not being stored in the database. To remedy, reduce disk load or use faster disks.
     to: sysadmin
 
  alarm: 10min_dbengine_global_flushing_errors
@@ -46,5 +47,6 @@ lookup: sum -10m unaligned of flushing_pressure_deletions
  every: 10s
   crit: $this != 0
  delay: down 1h multiplier 1.5 max 3h
-  info: number of pages deleted due to failure to flush data to disk in the last 10 minutes, metric data were lost to unblock data collection, please reduce disk load or use faster disks
+  info: number of pages deleted due to failure to flush data to disk in the last 10 minutes. \
+        Metric data were lost to unblock data collection. To fix, reduce disk load or use faster disks.
     to: sysadmin
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 9c194ced..d0cd60cf 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -20,7 +20,7 @@ families: !/dev !/dev/* !/run !/run/* *
     warn: $this > (($status >= $WARNING ) ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: current disk space usage
+    info: disk space utilization
       to: sysadmin
 
 template: disk_inode_usage
@@ -34,7 +34,7 @@ families: !/dev !/dev/* !/run !/run/* *
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: up 1m down 15m multiplier 1.5 max 1h
-    info: current disk inode usage
+    info: disk inode utilization
       to: sysadmin
 
 
@@ -49,35 +49,35 @@ families: !/dev !/dev/* !/run !/run/* *
 # we will use it in the next template to find
 # the hours remaining
 
-template: disk_fill_rate
-      on: disk.space
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: min -10m at -50m unaligned of avail
-    calc: ($this - $avail) / (($now - $after) / 3600)
-   every: 1m
-   units: GB/hour
-    info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
+# template: disk_fill_rate
+#       on: disk.space
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#   lookup: min -10m at -50m unaligned of avail
+#     calc: ($this - $avail) / (($now - $after) / 3600)
+#    every: 1m
+#    units: GB/hour
+#     info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
 
 
 # calculate the hours remaining
 # if the disk continues to fill
 # in this rate
 
-template: out_of_disk_space_time
-      on: disk.space
-      os: linux freebsd
-   hosts: *
-families: *
-    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
-   units: hours
-   every: 10s
-    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-   delay: down 15m multiplier 1.2 max 1h
-    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
-      to: sysadmin
+# template: out_of_disk_space_time
+#       on: disk.space
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#     calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+#    units: hours
+#    every: 10s
+#     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+#     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+#    delay: down 15m multiplier 1.2 max 1h
+#     info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+#       to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -91,34 +91,34 @@ families: *
 # we will use it in the next template to find
 # the hours remaining
 
-template: disk_inode_rate
-      on: disk.inodes
-      os: linux freebsd
-   hosts: *
-families: *
-  lookup: min -10m at -50m unaligned of avail
-    calc: ($this - $avail) / (($now - $after) / 3600)
-   every: 1m
-   units: inodes/hour
-    info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+# template: disk_inode_rate
+#       on: disk.inodes
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#   lookup: min -10m at -50m unaligned of avail
+#     calc: ($this - $avail) / (($now - $after) / 3600)
+#    every: 1m
+#    units: inodes/hour
+#     info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
 
 # calculate the hours remaining
 # if the disk inodes are allocated
 # in this rate
 
-template: out_of_disk_inodes_time
-      on: disk.inodes
-      os: linux freebsd
-   hosts: *
-families: *
-    calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
-   units: hours
-   every: 10s
-    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-   delay: down 15m multiplier 1.2 max 1h
-    info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
-      to: sysadmin
+# template: out_of_disk_inodes_time
+#       on: disk.inodes
+#       os: linux freebsd
+#    hosts: *
+# families: *
+#     calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+#    units: hours
+#    every: 10s
+#     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+#     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+#    delay: down 15m multiplier 1.2 max 1h
+#     info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+#       to: sysadmin
 
 
 # -----------------------------------------------------------------------------
@@ -141,8 +141,8 @@ families: *
     warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
    delay: down 15m multiplier 1.2 max 1h
-    info: the percentage of time the disk was busy, during the last 10 minutes
-      to: sysadmin
+    info: average percentage of time the disk was busy over the last 10 minutes
+      to: silent
 
 
 # raise an alarm if the disk backlog
@@ -163,5 +163,5 @@ families: *
     warn: $this > $green * (($status >= $WARNING)  ? (0.7) : (1))
     crit: $this > $red   * (($status == $CRITICAL) ? (0.7) : (1))
    delay: down 15m multiplier 1.2 max 1h
-    info: average of the kernel estimated disk backlog, for the last 10 minutes
-      to: sysadmin
+    info: average disk backlog size over the last 10 minutes
+      to: silent
diff --git a/health/health.d/dns_query.conf b/health/health.d/dns_query.conf
index 113c950e..64770b98 100644
--- a/health/health.d/dns_query.conf
+++ b/health/health.d/dns_query.conf
@@ -8,5 +8,5 @@ template: dns_query_time_query_time
    every: 10s
     warn: $this == nan
    delay: up 20s down 5m multiplier 1.5 max 1h
-    info: query round trip time
+    info: average DNS query round trip time over the last 10 seconds
       to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index ecf3b84a..dff1f07d 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -6,7 +6,7 @@ template: dnsmasq_dhcp_dhcp_range_utilization
    units: %
     calc: $used
     warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
-    crit: $this > ( ($status >= $CRITICAL) ? ( 90 ) : ( 95 ) )
+    crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
    delay: down 5m
-    info: dhcp-range utilization above threshold!
+    info: DHCP range utilization
       to: sysadmin
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
index 729906cd..122d82b8 100644
--- a/health/health.d/dockerd.conf
+++ b/health/health.d/dockerd.conf
@@ -4,5 +4,5 @@ template: docker_unhealthy_containers
    every: 10s
   lookup: average -10s
     crit: $this > 0
-    info: number of unhealthy containers
+    info: average number of unhealthy docker containers over the last 10 seconds
       to: sysadmin
diff --git a/health/health.d/entropy.conf b/health/health.d/entropy.conf
index 66d44ec1..0be9d45b 100644
--- a/health/health.d/entropy.conf
+++ b/health/health.d/entropy.conf
@@ -7,10 +7,10 @@
       on: system.entropy
       os: linux
    hosts: *
-  lookup: min -10m unaligned
+  lookup: min -5m unaligned
    units: entries
    every: 5m
     warn: $this < (($status >= $WARNING) ? (200) : (100))
    delay: down 1h multiplier 1.5 max 2h
-    info: minimum entries in the random numbers pool in the last 10 minutes
+    info: minimum number of entries in the random numbers pool in the last 5 minutes
       to: silent
diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf
index 506cb0cf..735fb5ae 100644
--- a/health/health.d/exporting.conf
+++ b/health/health.d/exporting.conf
@@ -21,14 +21,3 @@ families: *
    delay: down 5m multiplier 1.5 max 1h
     info: percentage of metrics sent to the external database server
       to: dba
-
-template: exporting_metrics_lost
-families: *
-      on: exporting_data_size
-   units: metrics
-    calc: abs($lost)
-   every: 10s
-    crit: ($this != 0) || ($status == $CRITICAL && abs($sent) == 0)
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of metrics lost due to repeating failures to contact the external database server
-      to: dba
diff --git a/health/health.d/fping.conf b/health/health.d/fping.conf
index 43658fef..92c1525b 100644
--- a/health/health.d/fping.conf
+++ b/health/health.d/fping.conf
@@ -11,18 +11,18 @@ families: *
     info: number of seconds since the last successful data collection
       to: sysadmin
 
-template: host_reachable
+template: fping_host_reachable
 families: *
       on: fping.latency
     calc: $average != nan
    units: up/down
    every: 10s
     crit: $this == 0
-    info: states if the remote host is reachable
    delay: down 30m multiplier 1.5 max 2h
+    info: reachability status of the network host (0: unreachable, 1: reachable)
       to: sysadmin
 
-template: host_latency
+template: fping_host_latency
 families: *
       on: fping.latency
   lookup: average -10s unaligned of average
@@ -32,11 +32,11 @@ families: *
      red: 1000
     warn: $this > $green OR $max > $red
     crit: $this > $red
-    info: average round trip delay during the last 10 seconds
    delay: down 30m multiplier 1.5 max 2h
+    info: average latency to the network host over the last 10 seconds
       to: sysadmin
 
-template: packet_loss
+template: fping_packet_loss
 families: *
       on: fping.quality
   lookup: average -10m unaligned of returned
@@ -47,7 +47,6 @@ families: *
    every: 10s
     warn: $this > $green
     crit: $this > $red
-    info: packet loss percentage
    delay: down 30m multiplier 1.5 max 2h
+    info: packet loss ratio to the network host over the last 10 minutes
       to: sysadmin
-
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index e3863ae5..d148f7b7 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -18,5 +18,5 @@ template: gearman_workers_queued
     warn: $this > 30000
     crit: $this > 100000
    delay: down 5m multiplier 1.5 max 1h
-    info: number of queued jobs
-      to: sysadmin
-\ No newline at end of file
+    info: average number of queued jobs over the last 10 minutes
+      to: sysadmin
diff --git a/health/health.d/haproxy.conf b/health/health.d/haproxy.conf
index e49c70d4..9cd07066 100644
--- a/health/health.d/haproxy.conf
+++ b/health/health.d/haproxy.conf
@@ -4,7 +4,7 @@ template: haproxy_backend_server_status
    every: 10s
   lookup: average -10s
     crit: $this > 0
-    info: number of failed haproxy backend servers
+    info: average number of failed haproxy backend servers over the last 10 seconds
       to: sysadmin
 
 template: haproxy_backend_status
@@ -13,7 +13,7 @@ template: haproxy_backend_status
    every: 10s
   lookup: average -10s
     crit: $this > 0
-    info: number of failed haproxy backends
+    info: average number of failed haproxy backends over the last 10 seconds
       to: sysadmin
 
 template: haproxy_last_collected
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
index 678faab4..7345df4d 100644
--- a/health/health.d/hdfs.conf
+++ b/health/health.d/hdfs.conf
@@ -23,7 +23,7 @@ template: hdfs_capacity_usage
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
     crit: $this > (($status == $CRITICAL) ? (80) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: used capacity
+    info: overall space capacity utilization across all datanodes
       to: sysadmin
 
 
@@ -36,7 +36,7 @@ template: hdfs_missing_blocks
    every: 10s
     warn: $this > 0
    delay: down 15m multiplier 1.5 max 1h
-    info: missing blocks
+    info: number of missing blocks
       to: sysadmin
 
 
@@ -47,7 +47,7 @@ template: hdfs_stale_nodes
    every: 10s
     warn: $this > 0
    delay: down 15m multiplier 1.5 max 1h
-    info: stale data nodes
+    info: number of datanodes marked stale due to delayed heartbeat
       to: sysadmin
 
 
@@ -58,7 +58,7 @@ template: hdfs_dead_nodes
    every: 10s
     crit: $this > 0
    delay: down 15m multiplier 1.5 max 1h
-    info: dead data nodes
+    info: number of datanodes which are currently dead
       to: sysadmin
 
 
@@ -71,5 +71,5 @@ template: hdfs_num_failed_volumes
    every: 10s
     warn: $this > 0
    delay: down 15m multiplier 1.5 max 1h
-    info: failed volumes
+    info: number of failed volumes
       to: sysadmin
diff --git a/health/health.d/httpcheck.conf b/health/health.d/httpcheck.conf
index 0ddf35ea..0158f63e 100644
--- a/health/health.d/httpcheck.conf
+++ b/health/health.d/httpcheck.conf
@@ -11,17 +11,17 @@ families: *
       to: sysadmin
 
 # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: web_service_up
+template: httpcheck_web_service_up
 families: *
       on: httpcheck.status
   lookup: average -1m unaligned percentage of success
     calc: ($this < 75) ? (0) : ($this)
    every: 5s
    units: up/down
-    info: at least 75% verified responses during last 60 seconds, ideal for badges
+    info: average ratio of successful HTTP requests over the last minute (at least 75%)
       to: silent
 
-template: web_service_bad_content
+template: httpcheck_web_service_bad_content
 families: *
       on: httpcheck.status
   lookup: average -5m unaligned percentage of bad_content
@@ -30,11 +30,11 @@ families: *
     warn: $this >= 10 AND $this < 40
     crit: $this >= 40
    delay: down 5m multiplier 1.5 max 1h
-    info: average of unexpected http response content during the last 5 minutes
+    info: average ratio of HTTP responses with unexpected content over the last 5 minutes
  options: no-clear-notification
       to: webmaster
 
-template: web_service_bad_status
+template: httpcheck_web_service_bad_status
 families: *
       on: httpcheck.status
   lookup: average -5m unaligned percentage of bad_status
@@ -43,57 +43,57 @@ families: *
     warn: $this >= 10 AND $this < 40
     crit: $this >= 40
    delay: down 5m multiplier 1.5 max 1h
-    info: average of unexpected http status during the last 5 minutes
+    info: average ratio of HTTP responses with unexpected status over the last 5 minutes
  options: no-clear-notification
       to: webmaster
 
-template: web_service_timeouts
+template: httpcheck_web_service_timeouts
 families: *
       on: httpcheck.status
   lookup: average -5m unaligned percentage of timeout
    every: 10s
    units: %
-    info: average of timeouts during the last 5 minutes
+    info: average ratio of HTTP request timeouts over the last 5 minutes
 
-template: no_web_service_connections
+template: httpcheck_no_web_service_connections
 families: *
       on: httpcheck.status
   lookup: average -5m unaligned percentage of no_connection
    every: 10s
    units: %
-    info: average of failed requests during the last 5 minutes
+    info: average ratio of failed requests over the last 5 minutes
 
 # combined timeout & no connection alarm
-template: web_service_unreachable
+template: httpcheck_web_service_unreachable
 families: *
       on: httpcheck.status
-    calc: ($no_web_service_connections >= $web_service_timeouts) ? ($no_web_service_connections) : ($web_service_timeouts)
+    calc: ($httpcheck_no_web_service_connections >= $httpcheck_web_service_timeouts) ? ($httpcheck_no_web_service_connections) : ($httpcheck_web_service_timeouts)
    units: %
    every: 10s
-    warn: ($no_web_service_connections >= 10 OR $web_service_timeouts >= 10) AND ($no_web_service_connections < 40 OR $web_service_timeouts < 40)
-    crit: $no_web_service_connections >= 40 OR $web_service_timeouts >= 40
+    warn: ($httpcheck_no_web_service_connections >= 10 OR $httpcheck_web_service_timeouts >= 10) AND ($httpcheck_no_web_service_connections < 40 OR $httpcheck_web_service_timeouts < 40)
+    crit: $httpcheck_no_web_service_connections >= 40 OR $httpcheck_web_service_timeouts >= 40
    delay: down 5m multiplier 1.5 max 1h
-    info: average of failed requests either due to timeouts or no connection during the last 5 minutes
+    info: ratio of failed requests either due to timeouts or no connection over the last 5 minutes
  options: no-clear-notification
       to: webmaster
 
-template: 1h_web_service_response_time
+template: httpcheck_1h_web_service_response_time
 families: *
       on: httpcheck.responsetime
   lookup: average -1h unaligned of time
    every: 30s
    units: ms
-    info: average response time over the last hour
+    info: average HTTP response time over the last hour
 
-template: web_service_slow
+template: httpcheck_web_service_slow
 families: *
       on: httpcheck.responsetime
   lookup: average -3m unaligned of time
    units: ms
    every: 10s
-    warn: ($this > ($1h_web_service_response_time * 2) )
-    crit: ($this > ($1h_web_service_response_time * 3) )
-    info: average response time over the last 3 minutes, compared to the average over the last hour
+    warn: ($this > ($httpcheck_1h_web_service_response_time * 2) )
+    crit: ($this > ($httpcheck_1h_web_service_response_time * 3) )
    delay: down 5m multiplier 1.5 max 1h
+    info: average HTTP response time over the last 3 minutes, compared to the average over the last hour
  options: no-clear-notification
       to: webmaster
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 59a5c8ed..fa0196ef 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -1,4 +1,4 @@
-template: disk_latency
+template: ioping_disk_latency
 families: *
       on: ioping.latency
   lookup: average -10s unaligned of average
@@ -8,6 +8,6 @@ families: *
      red: 1000
     warn: $this > $green OR $max > $red
     crit: $this > $red
-    info: average round trip delay during the last 10 seconds
    delay: down 30m multiplier 1.5 max 2h
+    info: average I/O latency over the last 10 seconds
       to: sysadmin
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
index 989d6e91..f4a0f56d 100644
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -11,7 +11,7 @@
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
     crit: $this > (($status == $CRITICAL) ? (70) : (90))
    delay: down 5m multiplier 1.5 max 1h
-    info: the percentage of IPC semaphores used
+    info: IPC semaphore utilization
       to: sysadmin
 
    alarm: semaphore_arrays_used
@@ -24,5 +24,5 @@
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
     crit: $this > (($status == $CRITICAL) ? (70) : (90))
    delay: down 5m multiplier 1.5 max 1h
-    info: the percentage of IPC semaphore arrays used
+    info: IPC semaphore array utilization
       to: sysadmin
diff --git a/health/health.d/ipfs.conf b/health/health.d/ipfs.conf
index 3f77572d..fd53c2c4 100644
--- a/health/health.d/ipfs.conf
+++ b/health/health.d/ipfs.conf
@@ -7,5 +7,5 @@ template: ipfs_datastore_usage
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: ipfs Datastore close to running out of space
+    info: IPFS datastore utilization
       to: sysadmin
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index c2558196..563d7a7e 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -6,7 +6,7 @@
     warn: $this > 0
     crit: $critical > 0
    delay: up 5m down 15m multiplier 1.5 max 1h
-    info: the number IPMI sensors in non-nominal state
+    info: number of IPMI sensors in non-nominal state
       to: sysadmin
 
    alarm: ipmi_events
@@ -16,5 +16,5 @@
    every: 10s
     warn: $this > 0
    delay: up 5m down 15m multiplier 1.5 max 1h
-    info: the number of events in the IPMI System Event Log (SEL)
+    info: number of events in the IPMI System Event Log (SEL)
       to: sysadmin
diff --git a/health/health.d/isc_dhcpd.conf b/health/health.d/isc_dhcpd.conf
index 8054656f..d1f93969 100644
--- a/health/health.d/isc_dhcpd.conf
+++ b/health/health.d/isc_dhcpd.conf
@@ -1,10 +1,10 @@
- template: isc_dhcpd_leases_size
-      on: isc_dhcpd.leases_total
-   units: KB
-   every: 60
-    calc: $leases_size
-    warn: $this > 3072
-    crit: $this > 6144
-   delay: up 2m down 5m
-    info: dhcpd.leases file too big! Module can slow down your server.
-      to: sysadmin
+# template: isc_dhcpd_leases_size
+#      on: isc_dhcpd.leases_total
+#   units: KB
+#   every: 60
+#    calc: $leases_size
+#    warn: $this > 3072
+#    crit: $this > 6144
+#   delay: up 2m down 5m
+#    info: dhcpd.leases file too big! Module can slow down your server.
+#      to: sysadmin
diff --git a/health/health.d/kubelet.conf b/health/health.d/kubelet.conf
index d2ef24b5..5eda59b2 100644
--- a/health/health.d/kubelet.conf
+++ b/health/health.d/kubelet.conf
@@ -4,26 +4,26 @@
 
 # True (1) if the node is experiencing a configuration-related error, false (0) otherwise.
 
-   template: node_config_error
+   template: kubelet_node_config_error
          on: k8s_kubelet.kubelet_node_config_error
        calc: $kubelet_node_config_error
       units: bool
       every: 10s
        warn: $this == 1
       delay: down 1m multiplier 1.5 max 2h
-       info: the node is experiencing a configuration-related error
+       info: the node is experiencing a configuration-related error (0: false, 1: true)
          to: sysadmin
 
 # Failed Token() requests to the alternate token source
 
-   template: token_requests
+   template: kubelet_token_requests
      lookup: sum -10s of token_fail_count
          on: k8s_kubelet.kubelet_token_requests
       units: failed requests
       every: 10s
        warn: $this > 0
       delay: down 1m multiplier 1.5 max 2h
-       info: failed token requests to alternate token source
+       info: number of failed Token() requests to the alternate token source
          to: sysadmin
 
 # Docker and runtime operation errors
@@ -35,7 +35,7 @@
       every: 10s
        warn: $this > (($status >= $WARNING)  ? (0) : (20))
       delay: up 30s down 1m multiplier 1.5 max 2h
-       info: operations error
+       info: number of Docker or runtime operation errors
          to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -53,63 +53,66 @@
 
 # quantile 0.5
 
-template: 1m_kubelet_pleg_relist_latency_quantile_05
+template: kubelet_1m_pleg_relist_latency_quantile_05
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -1m unaligned of kubelet_pleg_relist_latency_05
    units: microseconds
    every: 10s
-    info: the average value of pleg relisting latency during the last minute (quantile 0.5)
+    info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.5)
 
-template: 10s_kubelet_pleg_relist_latency_quantile_05
+template: kubelet_10s_pleg_relist_latency_quantile_05
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -10s unaligned of kubelet_pleg_relist_latency_05
-    calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_05 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_05))
+    calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_05 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_05))
    every: 10s
    units: %
    warn: $this > (($status >= $WARNING)?(100):(200))
    crit: $this > (($status >= $WARNING)?(200):(400))
   delay: down 1m multiplier 1.5 max 2h
-   info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.5)
+   info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+         compared to the last minute (quantile 0.5)
      to: sysadmin
 
 # quantile 0.9
 
-template: 1m_kubelet_pleg_relist_latency_quantile_09
+template: kubelet_1m_pleg_relist_latency_quantile_09
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -1m unaligned of kubelet_pleg_relist_latency_09
    units: microseconds
    every: 10s
-    info: the average value of pleg relisting latency during the last minute (quantile 0.9)
+    info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.9)
 
-template: 10s_kubelet_pleg_relist_latency_quantile_09
+template: kubelet_10s_pleg_relist_latency_quantile_09
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -10s unaligned of kubelet_pleg_relist_latency_09
-    calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_09 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_09))
+    calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_09 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_09))
    every: 10s
    units: %
    warn: $this > (($status >= $WARNING)?(200):(400))
    crit: $this > (($status >= $WARNING)?(400):(800))
   delay: down 1m multiplier 1.5 max 2h
-   info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.9)
+   info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+         compared to the last minute (quantile 0.9)
      to: sysadmin
 
 # quantile 0.99
 
-template: 1m_kubelet_pleg_relist_latency_quantile_099
+template: kubelet_1m_pleg_relist_latency_quantile_099
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -1m unaligned of kubelet_pleg_relist_latency_099
    units: microseconds
    every: 10s
-    info: the average value of pleg relisting latency during the last minute (quantile 0.99)
+    info: average Pod Lifecycle Event Generator relisting latency over the last minute (quantile 0.99)
 
-template: 10s_kubelet_pleg_relist_latency_quantile_099
+template: kubelet_10s_pleg_relist_latency_quantile_099
       on: k8s_kubelet.kubelet_pleg_relist_latency_microseconds
   lookup: average -10s unaligned of kubelet_pleg_relist_latency_099
-    calc: $this * 100 / (($1m_kubelet_pleg_relist_latency_quantile_099 < 1000)?(1000):($1m_kubelet_pleg_relist_latency_quantile_099))
+    calc: $this * 100 / (($kubelet_1m_pleg_relist_latency_quantile_099 < 1000)?(1000):($kubelet_1m_pleg_relist_latency_quantile_099))
    every: 10s
    units: %
    warn: $this > (($status >= $WARNING)?(400):(800))
    crit: $this > (($status >= $WARNING)?(800):(1200))
   delay: down 1m multiplier 1.5 max 2h
-   info: the % of the pleg relisting latency in the last 10 seconds, compared to the last minute (quantile 0.99)
+   info: ratio of average Pod Lifecycle Event Generator relisting latency over the last 10 seconds, \
+         compared to the last minute (quantile 0.99)
      to: sysadmin
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index 38727be2..a27ea072 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -8,5 +8,5 @@ template: linux_power_supply_capacity
     warn: $this < 10
     crit: $this < 5
    delay: up 30s down 5m multiplier 1.2 max 1h
-    info: the percentage remaining capacity of the power supply
+    info: percentage of remaining power supply capacity
       to: sysadmin
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
index ee0c54b8..ffaea172 100644
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@@ -4,18 +4,19 @@
 # Calculate the base trigger point for the load average alarms.
 # This is the maximum number of CPU's in the system over the past 1
 # minute, with a special case for a single CPU of setting the trigger at 2.
-   alarm: load_trigger
+   alarm: load_cpu_number
       on: system.load
       os: linux
    hosts: *
     calc: ($active_processors == nan or $active_processors == inf or $active_processors < 2) ? ( 2 ) : ( $active_processors )
    units: cpus
    every: 1m
-    info: trigger point for load average alarms
+    info: number of active CPU cores in the system
 
 # Send alarms if the load average is unusually high.
 # These intentionally _do not_ calculate the average over the sampled
 # time period because the values being checked already are averages.
+
    alarm: load_average_15
       on: system.load
       os: linux
@@ -23,10 +24,9 @@
   lookup: max -1m unaligned of load15
    units: load
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (1.75 * $load_trigger) : (2 * $load_trigger))
-    crit: $this > (($status == $CRITICAL) ? (3.5 * $load_trigger) : (4 * $load_trigger))
+    warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
    delay: down 15m multiplier 1.5 max 1h
-    info: fifteen-minute load average
+    info: system fifteen-minute load average
       to: sysadmin
 
    alarm: load_average_5
@@ -36,10 +36,9 @@
   lookup: max -1m unaligned of load5
    units: load
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (3.5 * $load_trigger) : (4 * $load_trigger))
-    crit: $this > (($status == $CRITICAL) ? (7 * $load_trigger) : (8 * $load_trigger))
+    warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
    delay: down 15m multiplier 1.5 max 1h
-    info: five-minute load average
+    info: system five-minute load average
       to: sysadmin
 
    alarm: load_average_1
@@ -49,8 +48,7 @@
   lookup: max -1m unaligned of load1
    units: load
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (7 * $load_trigger) : (8 * $load_trigger))
-    crit: $this > (($status == $CRITICAL) ? (14 * $load_trigger) : (16 * $load_trigger))
+    warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
    delay: down 15m multiplier 1.5 max 1h
-    info: one-minute load average
+    info: system one-minute load average
       to: sysadmin
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
index 2f906e18..ca2d0d9f 100644
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -14,7 +14,8 @@ template: mdstat_disks
    every: 10s
     calc: $down
     crit: $this > 0
-    info: Array is degraded!
+    info: number of devices in the down state. \
+          Any number > 0 indicates that the array is degraded.
       to: sysadmin
 
 template: mdstat_mismatch_cnt
@@ -24,7 +25,7 @@ template: mdstat_mismatch_cnt
    every: 60s
     warn: $this > 1024
    delay: up 30m
-    info: Mismatch count!
+    info: number of unsynchronized blocks
       to: sysadmin
 
 template: mdstat_nonredundant_last_collected
@@ -35,4 +36,4 @@ template: mdstat_nonredundant_last_collected
     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
     info: number of seconds since the last successful data collection
-      to: sysadmin
-\ No newline at end of file
+      to: sysadmin
diff --git a/health/health.d/megacli.conf b/health/health.d/megacli.conf
index 6e81a2a0..f861765d 100644
--- a/health/health.d/megacli.conf
+++ b/health/health.d/megacli.conf
@@ -1,48 +1,56 @@
-template: adapter_state
+
+## Adapters (controllers)
+
+template: megacli_adapter_state
       on: megacli.adapter_degraded
-   units: is degraded
-  lookup: sum -10s
+  lookup: max -10s foreach *
+   units: boolean
    every: 10s
     crit: $this > 0
-    info: adapter state
+   delay: down 5m multiplier 2 max 10m
+    info: adapter is in the degraded state (0: false, 1: true)
+      to: sysadmin
+
+## Physical Disks
+
+template: megacli_pd_predictive_failures
+      on: megacli.pd_predictive_failure
+  lookup: sum -10s foreach *
+   units: predictive failures
+   every: 10s
+    warn: $this > 0
+   delay: up 1m down 5m multiplier 2 max 10m
+    info: number of physical drive predictive failures
+      to: sysadmin
+
+template: megacli_pd_media_errors
+      on: megacli.pd_media_error
+  lookup: sum -10s foreach *
+   units: media errors
+   every: 10s
+    warn: $this > 0
+   delay: up 1m down 5m multiplier 2 max 10m
+    info: number of physical drive media errors
       to: sysadmin
 
-template: bbu_relative_charge
+## Battery Backup Units (BBU)
+
+template: megacli_bbu_relative_charge
       on: megacli.bbu_relative_charge
-   units: percent
   lookup: average -10s
+   units: percent
    every: 10s
     warn: $this <= (($status >= $WARNING)  ? (85) : (80))
     crit: $this <= (($status == $CRITICAL)  ? (50) : (40))
-    info: BBU relative state of charge
+    info: average battery backup unit (BBU) relative state of charge over the last 10 seconds
       to: sysadmin
 
-template: bbu_cycle_count
+template: megacli_bbu_cycle_count
       on: megacli.bbu_cycle_count
-   units: cycle count
   lookup: average -10s
+   units: cycles
    every: 10s
     warn: $this >= 100
     crit: $this >= 500
-    info: BBU cycle count
-      to: sysadmin
-
-template: pd_media_errors
-      on: megacli.pd_media_error
-   units: media errors
-  lookup: sum -10s
-   every: 10s
-    warn: $this > 0
-   delay: down 1m multiplier 2 max 10m
-    info: physical drive media errors
-      to: sysadmin
-
-template: pd_predictive_failures
-      on: megacli.pd_predictive_failure
-   units: predictive failures
-  lookup: sum -10s
-   every: 10s
-    warn: $this > 0
-   delay: down 1m multiplier 2 max 10m
-    info: physical drive predictive failures
+    info: average battery backup unit (BBU) charge cycle count over the last 10 seconds
       to: sysadmin
diff --git a/health/health.d/memcached.conf b/health/health.d/memcached.conf
index d248ef57..e610f181 100644
--- a/health/health.d/memcached.conf
+++ b/health/health.d/memcached.conf
@@ -23,30 +23,31 @@ template: memcached_cache_memory_usage
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
     crit: $this > (($status == $CRITICAL) ? (80) : (90))
    delay: up 0 down 15m multiplier 1.5 max 1h
-    info: current cache memory usage
+    info: cache memory utilization
       to: dba
 
 
 # find the rate memcached cache is filling
 
-template: cache_fill_rate
+template: memcached_cache_fill_rate
       on: memcached.cache
   lookup: min -10m at -50m unaligned of available
     calc: ($this - $available) / (($now - $after) / 3600)
    units: KB/hour
    every: 1m
-    info: average rate the cache fills up (positive), or frees up (negative) space, for the last hour
+    info: average rate the cache fills up (positive), or frees up (negative) space over the last hour
 
 
 # find the hours remaining until memcached cache is full
 
-template: out_of_cache_space_time
+template: memcached_out_of_cache_space_time
       on: memcached.cache
-    calc: ($cache_fill_rate > 0) ? ($available / $cache_fill_rate) : (inf)
+    calc: ($memcached_cache_fill_rate > 0) ? ($available / $memcached_cache_fill_rate) : (inf)
    units: hours
    every: 10s
     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
    delay: down 15m multiplier 1.5 max 1h
-    info: estimated time the cache will run out of space, if the system continues to add data with the rate of the last hour
+    info: estimated time until the cache runs out of space \
+          if the system continues to add data at the same rate as the past hour
       to: dba
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
index 4a0e6e52..e95c0aad 100644
--- a/health/health.d/memory.conf
+++ b/health/health.d/memory.conf
@@ -10,7 +10,7 @@
    every: 1m
     warn: $this > 0
    delay: down 1h multiplier 1.5 max 1h
-    info: number of ECC correctable errors during the last hour
+    info: number of ECC correctable errors in the last 10 minutes
       to: sysadmin
 
    alarm: 1hour_ecc_memory_uncorrectable
@@ -22,7 +22,7 @@
    every: 1m
     crit: $this > 0
    delay: down 1h multiplier 1.5 max 1h
-    info: number of ECC uncorrectable errors during the last hour
+    info: number of ECC uncorrectable errors in the last 10 minutes
       to: sysadmin
 
    alarm: 1hour_memory_hw_corrupted
diff --git a/health/health.d/mysql.conf b/health/health.d/mysql.conf
index 62cef5a2..7451b3f4 100644
--- a/health/health.d/mysql.conf
+++ b/health/health.d/mysql.conf
@@ -24,7 +24,7 @@ template: mysql_10s_slow_queries
     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
     crit: $this > (($status == $CRITICAL) ? (10) : (20))
    delay: down 5m multiplier 1.5 max 1h
-    info: number of mysql slow queries over the last 10 seconds
+    info: number of slow queries in the last 10 seconds
       to: dba
 
 
@@ -36,7 +36,7 @@ template: mysql_10s_table_locks_immediate
   lookup: sum -10s absolute of immediate
    units: immediate locks
    every: 10s
-    info: number of table immediate locks over the last 10 seconds
+    info: number of table immediate locks in the last 10 seconds
       to: dba
 
 template: mysql_10s_table_locks_waited
@@ -44,7 +44,7 @@ template: mysql_10s_table_locks_waited
   lookup: sum -10s absolute of waited
    units: waited locks
    every: 10s
-    info: number of table waited locks over the last 10 seconds
+    info: number of table waited locks in the last 10 seconds
       to: dba
 
 template: mysql_10s_waited_locks_ratio
@@ -55,7 +55,7 @@ template: mysql_10s_waited_locks_ratio
     warn: $this > (($status >= $WARNING)  ? (10) : (25))
     crit: $this > (($status == $CRITICAL) ? (25) : (50))
    delay: down 30m multiplier 1.5 max 1h
-    info: the ratio of mysql waited table locks, for the last 10 seconds
+    info: ratio of waited table locks over the last 10 seconds
       to: dba
 
 
@@ -70,7 +70,7 @@ template: mysql_connections
     warn: $this > (($status >= $WARNING)  ? (60) : (70))
     crit: $this > (($status == $CRITICAL) ? (80) : (90))
    delay: down 15m multiplier 1.5 max 1h
-    info: the ratio of current active connections vs the maximum possible number of connections
+    info: client connections utilization
       to: dba
 
 
@@ -84,7 +84,7 @@ template: mysql_replication
    every: 10s
     crit: $this == 0
    delay: down 5m multiplier 1.5 max 1h
-    info: checks if mysql replication has stopped
+    info: replication status (0: stopped, 1: working)
       to: dba
 
 template: mysql_replication_lag
@@ -95,7 +95,8 @@ template: mysql_replication_lag
     warn: $this > (($status >= $WARNING)  ? (5)  : (10))
     crit: $this > (($status == $CRITICAL) ? (10) : (30))
    delay: down 15m multiplier 1.5 max 1h
-    info: the number of seconds mysql replication is behind this master
+    info: difference between the timestamp of the latest transaction processed by the SQL thread and \
+          the timestamp of the same transaction when it was processed on the master
       to: dba
 
 
@@ -107,7 +108,7 @@ template: mysql_galera_cluster_size_max_2m
   lookup: max -2m absolute
    units: nodes
    every: 10s
-    info: max cluster size 2 minute
+    info: maximum galera cluster size in the last 2 minutes
       to: dba
 
 template: mysql_galera_cluster_size
@@ -118,7 +119,7 @@ template: mysql_galera_cluster_size
     warn: $this > $mysql_galera_cluster_size_max_2m
     crit: $this < $mysql_galera_cluster_size_max_2m
    delay: up 20s down 5m multiplier 1.5 max 1h
-    info: cluster size has changed
+    info: current galera cluster size, compared to the maximum size in the last 2 minutes
       to: dba
 
 # galera node state
@@ -130,7 +131,8 @@ template: mysql_galera_cluster_state
     warn: $this < 4
     crit: $this < 2
    delay: up 30s down 5m multiplier 1.5 max 1h
-    info: node state (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced)
+    info: galera node state \
+          (0: undefined, 1: joining, 2: donor/desynced, 3: joined, 4: synced)
       to: dba
 
 
@@ -142,5 +144,7 @@ template: mysql_galera_cluster_status
    every: 10s
     crit: $mysql_galera_cluster_state != nan AND $this != 0
    delay: up 30s down 5m multiplier 1.5 max 1h
-    info: node and cluster status (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected)
+    info: galera node cluster component status \
+          (-1: unknown, 0: primary/quorum present, 1: non-primary/quorum lost, 2: disconnected). \
+          Any value other than 0 (primary) indicates that the node is part of a nonoperational component.
       to: dba
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 261290e5..33202421 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -12,7 +12,7 @@
      calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan )
     units: Mbit
     every: 10s
-     info: The current speed of the physical network interface
+     info: network interface current speed
 
  template: 1m_received_traffic_overflow
        on: net.net
@@ -20,13 +20,12 @@
     hosts: *
  families: *
    lookup: average -1m unaligned absolute of received
-     calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
+     calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan )
     units: %
     every: 10s
-     warn: $this > (($status >= $WARNING)  ? (80) : (85))
-     crit: $this > (($status == $CRITICAL) ? (85) : (90))
-    delay: down 1m multiplier 1.5 max 1h
-     info: interface received bandwidth usage over net device speed max
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    delay: up 1m down 1m multiplier 1.5 max 1h
+     info: average inbound utilization for the network interface over the last minute
        to: sysadmin
 
  template: 1m_sent_traffic_overflow
@@ -35,13 +34,12 @@
     hosts: *
  families: *
    lookup: average -1m unaligned absolute of sent
-     calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
+     calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan )
     units: %
     every: 10s
-     warn: $this > (($status >= $WARNING)  ? (80) : (85))
-     crit: $this > (($status == $CRITICAL) ? (85) : (90))
-    delay: down 1m multiplier 1.5 max 1h
-     info: interface sent bandwidth usage over net device speed max
+     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    delay: up 1m down 1m multiplier 1.5 max 1h
+     info: average outbound utilization for the network interface over the last minute
        to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -58,56 +56,76 @@ template: inbound_packets_dropped
       on: net.drops
       os: linux
    hosts: *
-families: *
+families: !net* *
   lookup: sum -10m unaligned absolute of inbound
    units: packets
    every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: interface inbound dropped packets in the last 10 minutes
-      to: sysadmin
+    info: number of inbound dropped packets for the network interface in the last 10 minutes
 
 template: outbound_packets_dropped
       on: net.drops
       os: linux
    hosts: *
-families: *
+families: !net* *
   lookup: sum -10m unaligned absolute of outbound
    units: packets
    every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: interface outbound dropped packets in the last 10 minutes
-      to: sysadmin
+    info: number of outbound dropped packets for the network interface in the last 10 minutes
 
 template: inbound_packets_dropped_ratio
       on: net.packets
       os: linux
    hosts: *
-families: *
+families: !net* !wl* *
   lookup: sum -10m unaligned absolute of received
-    calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
+    calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
    units: %
    every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
-    info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
+    warn: $this >= 2
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: ratio of inbound dropped packets for the network interface over the last 10 minutes
       to: sysadmin
 
 template: outbound_packets_dropped_ratio
       on: net.packets
       os: linux
    hosts: *
-families: *
+families: !net* !wl* *
   lookup: sum -10m unaligned absolute of sent
-    calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
+    calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
    units: %
    every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
-    info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
+    warn: $this >= 2
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: ratio of outbound dropped packets for the network interface over the last 10 minutes
+      to: sysadmin
+
+template: wifi_inbound_packets_dropped_ratio
+      on: net.packets
+      os: linux
+   hosts: *
+families: wl*
+  lookup: sum -10m unaligned absolute of received
+    calc: (($inbound_packets_dropped != nan AND $this > 1000) ? ($inbound_packets_dropped * 100 / $this) : (0))
+   units: %
+   every: 1m
+    warn: $this >= 10
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: ratio of inbound dropped packets for the network interface over the last 10 minutes
+      to: sysadmin
+
+template: wifi_outbound_packets_dropped_ratio
+      on: net.packets
+      os: linux
+   hosts: *
+families: wl*
+  lookup: sum -10m unaligned absolute of sent
+    calc: (($outbound_packets_dropped != nan AND $this > 1000) ? ($outbound_packets_dropped * 100 / $this) : (0))
+   units: %
+   every: 1m
+    warn: $this >= 10
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: ratio of outbound dropped packets for the network interface over the last 10 minutes
       to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -123,7 +141,7 @@ families: *
    every: 1m
     warn: $this >= 5
    delay: down 1h multiplier 1.5 max 2h
-    info: interface inbound errors in the last 10 minutes
+    info: number of inbound errors for the network interface in the last 10 minutes
       to: sysadmin
 
 template: interface_outbound_errors
@@ -136,7 +154,7 @@ families: *
    every: 1m
     warn: $this >= 5
    delay: down 1h multiplier 1.5 max 2h
-    info: interface outbound errors in the last 10 minutes
+    info: number of outbound errors for the network interface in the last 10 minutes
       to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -157,7 +175,7 @@ families: *
    every: 1m
     warn: $this > 0
    delay: down 1h multiplier 1.5 max 2h
-    info: interface fifo errors in the last 10 minutes
+    info: number of FIFO errors for the network interface in the last 10 minutes
       to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -177,7 +195,7 @@ families: *
   lookup: average -1m unaligned of received
    units: packets
    every: 10s
-    info: the average number of packets received during the last minute
+    info: average number of packets received by the network interface over the last minute
 
 template: 10s_received_packets_storm
       on: net.packets
@@ -189,7 +207,8 @@ families: *
    every: 10s
    units: %
     warn: $this > (($status >= $WARNING)?(200):(5000))
-    crit: $this > (($status >= $WARNING)?(5000):(6000))
+    crit: $this > (($status == $CRITICAL)?(5000):(6000))
  options: no-clear-notification
-    info: the % of the rate of received packets in the last 10 seconds, compared to the rate of the last minute (clear notification for this alarm will not be sent)
+    info: ratio of average number of received packets for the network interface over the last 10 seconds, \
+          compared to the rate over the last minute
       to: sysadmin
diff --git a/health/health.d/netfilter.conf b/health/health.d/netfilter.conf
index 1d07752c..f827d8e4 100644
--- a/health/health.d/netfilter.conf
+++ b/health/health.d/netfilter.conf
@@ -1,19 +1,6 @@
 
 # you can disable an alarm notification by setting the 'to' line to: silent
 
-   alarm: netfilter_last_collected_secs
-      on: netfilter.conntrack_sockets
-      os: linux
-   hosts: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
    alarm: netfilter_conntrack_full
       on: netfilter.conntrack_sockets
       os: linux
@@ -22,8 +9,8 @@
     calc: $this * 100 / $netfilter_conntrack_max
    units: %
    every: 10s
-    warn: $this > (($status >= $WARNING)  ? (70) : (80))
-    crit: $this > (($status == $CRITICAL) ? (80) : (90))
+    warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (95))
    delay: down 5m multiplier 1.5 max 1h
-    info: the number of connections tracked by the netfilter connection tracker, as a percentage of the connection tracker table size
+    info: netfilter connection tracker table size utilization
       to: sysadmin
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index b255d35f..f450b712 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -20,9 +20,9 @@ template: pihole_blocked_queries
    units: %
     calc: $blocked
     warn: $this > ( ($status >= $WARNING ) ? ( 45 ) : ( 55 ) )
-    crit: $this > ( ($status >= $CRITICAL) ? ( 55 ) : ( 75 ) )
+    crit: $this > ( ($status == $CRITICAL) ? ( 55 ) : ( 75 ) )
    delay: up 2m down 5m
-    info: percentage of blocked dns queries for the last 24 hour
+    info: percentage of blocked dns queries over the last 24 hours
       to: sysadmin
 
 
@@ -36,7 +36,7 @@ template: pihole_blocklist_last_update
     calc: $ago
     warn: $this > 60 * 60 * 24 * 8
     crit: $this > 60 * 60 * 24 * 8 * 2
-    info: blocklist last update time
+    info: gravity.list (blocklist) file last update time
       to: sysadmin
 
 # Gravity file check (gravity.list).
@@ -48,7 +48,7 @@ template: pihole_blocklist_gravity_file
     calc: $file_exists
     crit: $this != 1
    delay: up 2m down 5m
-    info: gravity file existence
+    info: gravity.list (blocklist) file existence state (0: not-exists, 1: exists)
       to: sysadmin
 
 # Pi-hole's ability to block unwanted domains.
@@ -61,5 +61,5 @@ template: pihole_status
     calc: $enabled
     warn: $this != 1
    delay: up 2m down 5m
-    info: unwanted domains blocking status
+    info: unwanted domains blocking status (0: disabled, 1: enabled)
       to: sysadmin
diff --git a/health/health.d/portcheck.conf b/health/health.d/portcheck.conf
index 696333fd..29dcebbc 100644
--- a/health/health.d/portcheck.conf
+++ b/health/health.d/portcheck.conf
@@ -11,17 +11,17 @@ families: *
       to: sysadmin
 
 # This is a fast-reacting no-notification alarm ideal for custom dashboards or badges
-template: service_reachable
+template: portcheck_service_reachable
 families: *
       on: portcheck.status
   lookup: average -1m unaligned percentage of success
     calc: ($this < 75) ? (0) : ($this)
    every: 5s
    units: up/down
-    info: at least 75% successful connections during last 60 seconds, ideal for badges
+    info: average ratio of successful connections over the last minute (at least 75%)
       to: silent
 
-template: connection_timeouts
+template: portcheck_connection_timeouts
 families: *
       on: portcheck.status
   lookup: average -5m unaligned percentage of timeout
@@ -30,10 +30,10 @@ families: *
     warn: $this >= 10 AND $this < 40
     crit: $this >= 40
    delay: down 5m multiplier 1.5 max 1h
-    info: average of timeouts during the last 5 minutes
+    info: average ratio of timeouts over the last 5 minutes
       to: sysadmin
 
-template: connection_fails
+template: portcheck_connection_fails
 families: *
       on: portcheck.status
   lookup: average -5m unaligned percentage of no_connection,failed
@@ -42,5 +42,5 @@ families: *
     warn: $this >= 10 AND $this < 40
     crit: $this >= 40
    delay: down 5m multiplier 1.5 max 1h
-    info: average of failed connections during the last 5 minutes
+    info: average ratio of failed connections over the last 5 minutes
       to: sysadmin
diff --git a/health/health.d/processes.conf b/health/health.d/processes.conf
index 293f1aa0..b464d8f6 100644
--- a/health/health.d/processes.conf
+++ b/health/health.d/processes.conf
@@ -6,8 +6,8 @@
     calc: $active * 100 / $pidmax
    units: %
    every: 5s
-    warn: $this > (($status >= $WARNING)  ? (75) : (80))
-    crit: $this > (($status == $CRITICAL) ? (85) : (90))
+    warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    crit: $this > (($status == $CRITICAL) ? (90) : (95))
    delay: down 5m multiplier 1.5 max 1h
-    info: the percentage of active processes
+    info: system process IDs (PID) space utilization
       to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 0a71dac8..2daecc48 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -7,7 +7,8 @@
    hosts: *
     calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz - $zfs.arc_size.min)
    every: 10s
-    info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
+    info: amount of memory reported as used, \
+          but it is actually capable of resizing itself based on the system needs (e.g. ZFS ARC)
 
    alarm: ram_in_use
       on: system.ram
@@ -20,7 +21,7 @@
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: system RAM used
+    info: system memory utilization
       to: sysadmin
 
    alarm: ram_available
@@ -33,7 +34,7 @@
     warn: $this < (($status >= $WARNING)  ? (15) : (10))
     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
    delay: down 15m multiplier 1.5 max 1h
-    info: estimated amount of RAM available for userspace processes, without causing swapping
+    info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
       to: sysadmin
 
 ## FreeBSD
@@ -47,7 +48,7 @@
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: system RAM usage
+    info: system memory utilization
       to: sysadmin
 
    alarm: ram_available
@@ -60,5 +61,5 @@
     warn: $this < (($status >= $WARNING)  ? (15) : (10))
     crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
    delay: down 15m multiplier 1.5 max 1h
-    info: estimated amount of RAM available for userspace processes, without causing swapping
+    info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
       to: sysadmin
diff --git a/health/health.d/redis.conf b/health/health.d/redis.conf
index c08a884a..43f98a1d 100644
--- a/health/health.d/redis.conf
+++ b/health/health.d/redis.conf
@@ -18,7 +18,7 @@ families: *
    every: 10s
     crit: $rdb_last_bgsave_status != 0
    units: ok/failed
-    info: states if redis bgsave is working
+    info: status of the last RDB save operation (0: ok, 1: error)
    delay: down 5m multiplier 1.5 max 1h
       to: dba
 
@@ -29,6 +29,6 @@ families: *
     warn: $rdb_bgsave_in_progress > 600
     crit: $rdb_bgsave_in_progress > 1200
    units: seconds
-    info: the time redis needs to save its database
+    info: duration of the on-going RDB save operation
    delay: down 5m multiplier 1.5 max 1h
       to: dba
diff --git a/health/health.d/retroshare.conf b/health/health.d/retroshare.conf
index 2344b60e..51b1deb4 100644
--- a/health/health.d/retroshare.conf
+++ b/health/health.d/retroshare.conf
@@ -21,5 +21,5 @@ template: retroshare_dht_working
     warn: $this < (($status >= $WARNING)  ? (120) : (100))
     crit: $this < (($status == $CRITICAL) ? (10)  : (1))
    delay: up 0 down 15m multiplier 1.5 max 1h
-    info: Checks if the DHT has enough peers to operate
+    info: number of DHT peers
       to: sysadmin
diff --git a/health/health.d/riakkv.conf b/health/health.d/riakkv.conf
index 74530277..d6346026 100644
--- a/health/health.d/riakkv.conf
+++ b/health/health.d/riakkv.conf
@@ -1,5 +1,5 @@
 # Ensure that Riak is running.  template: riak_last_collected_secs
-template: riak_last_collected_secs
+template: riakkv_last_collected_secs
       on: riak.kv.throughput
     calc: $now - $last_collected_t
    units: seconds ago
@@ -11,7 +11,7 @@ template: riak_last_collected_secs
       to: dba
 
 # Warn if a list keys operation is running.
-template: riak_list_keys_active
+template: riakkv_list_keys_active
       on: riak.core.fsm_active
     calc: $list_fsm_active
    units: state machines
@@ -23,44 +23,50 @@ template: riak_list_keys_active
 
 ## Timing healthchecks
 # KV GET
-template: 1h_kv_get_mean_latency
+template: riakkv_1h_kv_get_mean_latency
       on: riak.kv.latency.get
     calc: $node_get_fsm_time_mean
   lookup: average -1h unaligned of time
    every: 30s
    units: ms
-    info: mean average KV GET latency over the last hour
+    info: average time between reception of client GET request and \
+          subsequent response to the client over the last hour
 
-template: riak_kv_get_slow
+template: riakkv_kv_get_slow
       on: riak.kv.latency.get
     calc: $mean
   lookup: average -3m unaligned of time
    units: ms
    every: 10s
-    warn: ($this > ($1h_kv_get_mean_latency * 2) )
-    crit: ($this > ($1h_kv_get_mean_latency * 3) )
-    info: average KV GET time over the last 3 minutes, compared to the average over the last hour
+    warn: ($this > ($riakkv_1h_kv_get_mean_latency * 2) )
+    crit: ($this > ($riakkv_1h_kv_get_mean_latency * 3) )
+    info: average time between reception of client GET request and \
+          subsequent response to the client over the last 3 minutes, \
+          compared to the average over the last hour
    delay: down 5m multiplier 1.5 max 1h
       to: dba
 
 # KV PUT
-template: 1h_kv_put_mean_latency
+template: riakkv_1h_kv_put_mean_latency
       on: riak.kv.latency.put
     calc: $node_put_fsm_time_mean
   lookup: average -1h unaligned of time
    every: 30s
    units: ms
-    info: mean average KV PUT latency over the last hour
+    info: average time between reception of client PUT request and \
+          subsequent response to the client over the last hour
 
-template: riak_kv_put_slow
+template: riakkv_kv_put_slow
       on: riak.kv.latency.put
     calc: $mean
   lookup: average -3m unaligned of time
    units: ms
    every: 10s
-    warn: ($this > ($1h_kv_put_mean_latency * 2) )
-    crit: ($this > ($1h_kv_put_mean_latency * 3) )
-    info: average KV PUT time over the last 3 minutes, compared to the average over the last hour
+    warn: ($this > ($riakkv_1h_kv_put_mean_latency * 2) )
+    crit: ($this > ($riakkv_1h_kv_put_mean_latency * 3) )
+    info: average time between reception of client PUT request and \
+          subsequent response to the client over the last 3 minutes, \
+          compared to the average over the last hour
    delay: down 5m multiplier 1.5 max 1h
       to: dba
 
@@ -69,12 +75,12 @@ template: riak_kv_put_slow
 
 # Default Erlang VM process limit: 262144
 # On systems observed, this is < 2000, but may grow depending on load.
-template: riak_vm_high_process_count
+template: riakkv_vm_high_process_count
       on: riak.vm
     calc: $sys_process_count
    units: processes
    every: 10s
     warn: $this > 10000
     crit: $this > 100000
-    info: number of processes running in the Erlang VM (the default limit on ERTS 10.2.4 is 262144)
+    info: number of processes running in the Erlang VM
       to: dba
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
index 1a3088a2..ab9771bb 100644
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@@ -22,7 +22,7 @@ template: scaleio_storage_pool_capacity_utilization
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: Storage Pool capacity utilization
+    info: storage pool capacity utilization
       to: sysadmin
 
 
@@ -34,5 +34,5 @@ template: scaleio_sdc_mdm_connection_state
    every: 10s
     warn: $this != 1
    delay: up 30s down 5m multiplier 1.5 max 1h
-    info: Sdc connection to MDM state
+    info: Data Client (SDC) to Metadata Manager (MDM) connection state (0: disconnected, 1: connected)
       to: sysadmin
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index f835f2ae..f761e4a0 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -12,7 +12,8 @@
    every: 10s
     warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: average number of packets dropped in the last 1min, because sysctl net.core.netdev_max_backlog was exceeded (this can be a cause for dropped packets)
+    info: average number of dropped packets in the last minute \
+          due to exceeded net.core.netdev_max_backlog
       to: sysadmin
 
    alarm: 1min_netdev_budget_ran_outs
@@ -24,7 +25,9 @@
    every: 10s
     warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: average number of times, during the last 1min, ksoftirq ran out of sysctl net.core.netdev_budget or net.core.netdev_budget_usecs, with work remaining (this can be a cause for dropped packets)
+    info: average number of times ksoftirq ran out of sysctl net.core.netdev_budget or \
+          net.core.netdev_budget_usecs with work remaining over the last minute \
+          (this can be a cause for dropped packets)
       to: silent
 
    alarm: 10min_netisr_backlog_exceeded
@@ -34,7 +37,9 @@
   lookup: average -1m unaligned absolute of qdrops
    units: packets
    every: 10s
-    warn: $this > (($status >+ $WARNING) ? (0) : (10))
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
    delay: down 1h multiplier 1.5 max 2h
-    info: average number of drops in the last 1min, because sysctl net.route.netisr_maxqlen was exceeded (this can be a cause for dropped packets)
+    info: average number of drops in the last minute \
+          due to exceeded sysctl net.route.netisr_maxqlen \
+          (this can be a cause for dropped packets)
       to: sysadmin
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
index f920b080..66c36c13 100644
--- a/health/health.d/swap.conf
+++ b/health/health.d/swap.conf
@@ -10,23 +10,9 @@
     calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
    units: % of RAM
    every: 1m
-    warn: $this > (($status >= $WARNING)  ? (10) : (20))
-    crit: $this > (($status == $CRITICAL) ? (20) : (30))
-   delay: up 0 down 15m multiplier 1.5 max 1h
-    info: the amount of memory swapped in the last 30 minutes, as a percentage of the system RAM
-      to: sysadmin
-
-   alarm: ram_in_swap
-      on: system.swap
-      os: linux
-   hosts: *
-    calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
-   units: % of RAM
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? (15) : (20))
-    crit: $this > (($status == $CRITICAL) ? (40) : (50))
-   delay: up 30s down 15m multiplier 1.5 max 1h
-    info: the swap memory used, as a percentage of the system RAM
+    warn: $this > (($status >= $WARNING)  ? (20) : (30))
+   delay: down 15m multiplier 1.5 max 1h
+    info: percentage of the system RAM swapped in the last 30 minutes
       to: sysadmin
 
    alarm: used_swap
@@ -39,5 +25,5 @@
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: up 30s down 15m multiplier 1.5 max 1h
-    info: the percentage of swap memory used
+    info: swap memory utilization
       to: sysadmin
diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf
new file mode 100644
index 00000000..417624ad
--- /dev/null
+++ b/health/health.d/synchronization.conf
@@ -0,0 +1,12 @@
+   alarm: sync_freq
+      on: mem.sync
+  lookup: sum -1m of sync
+   units: calls
+  plugin: ebpf.plugin
+   every: 1m
+    warn: $this > 6
+   delay: up 1m down 10m multiplier 1.5 max 1h
+    info: number of sync() system calls. \
+          Every call causes all pending modifications to filesystem metadata and \
+          cached file data to be written to the underlying filesystems.
+      to: sysadmin
diff --git a/health/health.d/tcp_conn.conf b/health/health.d/tcp_conn.conf
index 7aa9a980..38b1062d 100644
--- a/health/health.d/tcp_conn.conf
+++ b/health/health.d/tcp_conn.conf
@@ -13,7 +13,7 @@
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING ) ? ( 60 ) : ( 80 ))
-    crit: $this > (($status >= $CRITICAL) ? ( 80 ) : ( 90 ))
+    crit: $this > (($status == $CRITICAL) ? ( 80 ) : ( 90 ))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the percentage of IPv4 TCP connections over the max allowed
+    info: IPv4 TCP connections utilization
       to: sysadmin
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index 3b307257..dad462eb 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -28,7 +28,7 @@
     warn: $this > 1
     crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the average number of times the TCP accept queue of the kernel overflown, during the last minute
+    info: average number of overflows in the TCP accept queue over the last minute
       to: sysadmin
 
 # THIS IS TOO GENERIC
@@ -43,7 +43,7 @@
     warn: $this > 1
     crit: $this > (($status == $CRITICAL) ? (1) : (5))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the average number of times the TCP accept queue of the kernel dropped packets, during the last minute (includes bogus packets received)
+    info: average number of dropped packets in the TCP accept queue over the last minute
       to: sysadmin
 
 
@@ -65,7 +65,8 @@
     warn: $this > 1
     crit: $this > (($status == $CRITICAL) ? (0) : (5))
    delay: up 10 down 5m multiplier 1.5 max 1h
-    info: the number of times the TCP SYN queue of the kernel was full and dropped packets, during the last minute
+    info: average number of SYN requests dropped due to the full TCP SYN queue over the last minute \
+          (SYN cookies were not enabled)
       to: sysadmin
 
    alarm: 1m_tcp_syn_queue_cookies
@@ -78,6 +79,6 @@
     warn: $this > 1
     crit: $this > (($status == $CRITICAL) ? (0) : (5))
    delay: up 10 down 5m multiplier 1.5 max 1h
-    info: the number of times the TCP SYN queue of the kernel was full and sent SYN cookies, during the last minute
+    info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
       to: sysadmin
 
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 6927d576..29d4ad68 100644
--- a/health/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
@@ -14,7 +14,7 @@
    units: %
    every: 10s
     warn: ${mem} > (($status >= $WARNING  ) ? ( ${tcp_mem_pressure} * 0.8 ) : ( ${tcp_mem_pressure}   ))
-    crit: ${mem} > (($status >= $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
+    crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the amount of TCP memory as a percentage of its max memory limit
+    info: TCP memory utilization
       to: sysadmin
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index 280d6590..17ff7a95 100644
--- a/health/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
@@ -15,7 +15,7 @@
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING ) ? ( 20 ) : ( 25 ))
-    crit: $this > (($status >= $CRITICAL) ? ( 25 ) : ( 50 ))
+    crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
    delay: up 0 down 5m multiplier 1.5 max 1h
-    info: the percentage of orphan IPv4 TCP sockets over the max allowed (this may lead to too-many-orphans errors)
+    info: orphan IPv4 TCP sockets utilization
       to: sysadmin
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index 36a550a5..af2a7525 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -2,21 +2,6 @@
 # you can disable an alarm notification by setting the 'to' line to: silent
 
 # -----------------------------------------------------------------------------
-
-   alarm: ipv4_tcphandshake_last_collected_secs
-      on: ipv4.tcphandshake
-      os: linux freebsd
-   hosts: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-# -----------------------------------------------------------------------------
 # tcp resets this host sends
 
    alarm: 1m_ipv4_tcp_resets_sent
@@ -26,7 +11,7 @@
   lookup: average -1m at -10s unaligned absolute of OutRsts
    units: tcp resets/s
    every: 10s
-    info: average TCP RESETS this host is sending, over the last minute
+    info: average number of sent TCP RESETS over the last minute
 
    alarm: 10s_ipv4_tcp_resets_sent
       on: ipv4.tcphandshake
@@ -38,7 +23,10 @@
     warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING)  ? (1) : (20)))
    delay: up 20s down 60m multiplier 1.2 max 2h
  options: no-clear-notification
-    info: average TCP RESETS this host is sending, over the last 10 seconds (this can be an indication that a port scan is made, or that a service running on this host has crashed; clear notification for this alarm will not be sent)
+    info: average number of sent TCP RESETS over the last 10 seconds. \
+          This can indicate a port scan, \
+          or that a service running on this host has crashed. \
+          Netdata will not send a clear notification for this alarm.
       to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -51,7 +39,7 @@
   lookup: average -1m at -10s unaligned absolute of AttemptFails
    units: tcp resets/s
    every: 10s
-    info: average TCP RESETS this host is sending, over the last minute
+    info: average number of received TCP RESETS over the last minute
 
    alarm: 10s_ipv4_tcp_resets_received
       on: ipv4.tcphandshake
@@ -63,5 +51,7 @@
     warn: $this > ((($1m_ipv4_tcp_resets_received < 5)?(5):($1m_ipv4_tcp_resets_received)) * (($status >= $WARNING)  ? (1) : (10)))
    delay: up 20s down 60m multiplier 1.2 max 2h
  options: no-clear-notification
-    info: average TCP RESETS this host is receiving, over the last 10 seconds (this can be an indication that a service this host needs, has crashed; clear notification for this alarm will not be sent)
+    info: average number of received TCP RESETS over the last 10 seconds. \
+          This can be an indication that a service this host needs has crashed. \
+          Netdata will not send a clear notification for this alarm.
       to: sysadmin
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 1e47b5c8..4836d631 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -2,21 +2,6 @@
 # you can disable an alarm notification by setting the 'to' line to: silent
 
 # -----------------------------------------------------------------------------
-
-   alarm: ipv4_udperrors_last_collected_secs
-      on: ipv4.udperrors
-      os: linux freebsd
-   hosts: *
-    calc: $now - $last_collected_t
-   units: seconds ago
-   every: 10s
-    warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-    crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-   delay: up 0 down 5m multiplier 1.5 max 1h
-    info: number of seconds since the last successful data collection
-      to: sysadmin
-
-# -----------------------------------------------------------------------------
 # UDP receive buffer errors
 
    alarm: 1m_ipv4_udp_receive_buffer_errors
@@ -26,10 +11,9 @@
   lookup: average -1m unaligned absolute of RcvbufErrors
    units: errors
    every: 10s
-    warn: $this > 1
-    crit: $this > (($status == $CRITICAL) ? (0) : (10))
-    info: average number of UDP receive buffer errors during the last minute
-   delay: up 0 down 60m multiplier 1.2 max 2h
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
+    info: average number of UDP receive buffer errors over the last minute
+   delay: up 1m down 60m multiplier 1.2 max 2h
       to: sysadmin
 
 # -----------------------------------------------------------------------------
@@ -42,8 +26,7 @@
   lookup: average -1m unaligned absolute of SndbufErrors
    units: errors
    every: 10s
-    warn: $this > 1
-    crit: $this > (($status == $CRITICAL) ? (0) : (10))
-    info: number of UDP send buffer errors during the last minute
-   delay: up 0 down 60m multiplier 1.2 max 2h
+    warn: $this > (($status >= $WARNING) ? (0) : (10))
+    info: average number of UDP send buffer errors over the last minute
+   delay: up 1m down 60m multiplier 1.2 max 2h
       to: sysadmin
diff --git a/health/health.d/unbound.conf b/health/health.d/unbound.conf
index bdedc11a..567baf18 100644
--- a/health/health.d/unbound.conf
+++ b/health/health.d/unbound.conf
@@ -21,7 +21,7 @@ template: unbound_request_list_overwritten
    every: 10s
     warn: $this > 5
    delay: up 10 down 5m multiplier 1.5 max 1h
-    info: the number of overwritten queries in the request-list
+    info: number of overwritten queries in the request-list
       to: sysadmin
 
 template: unbound_request_list_dropped
@@ -31,5 +31,5 @@ template: unbound_request_list_dropped
    every: 10s
     warn: $this > 0
    delay: up 10 down 5m multiplier 1.5 max 1h
-    info: the number of dropped queries in the request-list
+    info: number of dropped queries in the request-list
       to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
index 7bb98a9b..f4b03d4c 100644
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@@ -27,7 +27,8 @@ template: vcsa_system_health
     warn: ($this == 1) || ($this == 2)
     crit: $this == 3
    delay: down 1m multiplier 1.5 max 1h
-    info: overall system health status
+    info: overall system health status \
+          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
       to: sysadmin
 
 # Components health:
@@ -45,7 +46,8 @@ template: vcsa_swap_health
     warn: $this == 1
     crit: ($this == 2) || ($this == 3)
    delay: down 1m multiplier 1.5 max 1h
-    info: swap health status
+    info: swap health status \
+          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
       to: sysadmin
 
 template: vcsa_storage_health
@@ -56,7 +58,8 @@ template: vcsa_storage_health
     warn: $this == 1
     crit: ($this == 2) || ($this == 3)
    delay: down 1m multiplier 1.5 max 1h
-    info: storage health status
+    info: storage health status \
+          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
       to: sysadmin
 
 template: vcsa_mem_health
@@ -67,7 +70,8 @@ template: vcsa_mem_health
     warn: $this == 1
     crit: ($this == 2) || ($this == 3)
    delay: down 1m multiplier 1.5 max 1h
-    info: mem health status
+    info: memory health status \
+          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
       to: sysadmin
 
 template: vcsa_load_health
@@ -78,7 +82,8 @@ template: vcsa_load_health
     warn: $this == 1
     crit: ($this == 2) || ($this == 3)
    delay: down 1m multiplier 1.5 max 1h
-    info: load health status
+    info: load health status \
+          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
       to: sysadmin
 
 template: vcsa_database_storage_health
@@ -89,7 +94,8 @@ template: vcsa_database_storage_health
     warn: $this == 1
     crit: ($this == 2) || ($this == 3)
    delay: down 1m multiplier 1.5 max 1h
-    info: database storage health status
+    info: database storage health status \
+          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
       to: sysadmin
 
 template: vcsa_applmgmt_health
@@ -100,7 +106,8 @@ template: vcsa_applmgmt_health
     warn: $this == 1
     crit: ($this == 2) || ($this == 3)
    delay: down 1m multiplier 1.5 max 1h
-    info: appl mgmt health status
+    info: applmgmt health status \
+          (-1: unknown, 0: green, 1: yellow, 2: orange, 3: red, 4: grey)
       to: sysadmin
 
 
@@ -118,5 +125,6 @@ template: vcsa_software_updates_health
     warn: $this == 4
     crit: $this == 3
    delay: down 1m multiplier 1.5 max 1h
-    info: software packages health status
+    info: software updates availability status \
+          (-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
       to: sysadmin
diff --git a/health/health.d/vernemq.conf b/health/health.d/vernemq.conf
index 36bbaf82..9598dd39 100644
--- a/health/health.d/vernemq.conf
+++ b/health/health.d/vernemq.conf
@@ -18,10 +18,10 @@ template: vernemq_socket_errors
       on: vernemq.socket_errors
   lookup: sum -1m unaligned absolute of socket_error
    units: errors
-   every: 10s
-    warn: $this > (($status == $WARNING) ? (0) : (5))
-   delay: down 5m multiplier 1.5 max 2h
-    info: socket errors in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 2m down 5m multiplier 1.5 max 2h
+    info: number of socket errors in the last minute
       to: sysadmin
 
 # Queues dropped/expired/unhandled PUBLISH messages
@@ -30,30 +30,30 @@ template: vernemq_queue_message_drop
       on: vernemq.queue_undelivered_messages
   lookup: sum -1m unaligned absolute of queue_message_drop
    units: dropped messages
-   every: 10s
-    warn: $this > (($status == $WARNING) ? (0) : (5))
-   delay: down 5m multiplier 1.5 max 2h
-    info: dropped messaged due to full queues in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of dropped messages due to full queues in the last minute
       to: sysadmin
 
 template: vernemq_queue_message_expired
       on: vernemq.queue_undelivered_messages
   lookup: sum -1m unaligned absolute of queue_message_expired
    units: expired messages
-   every: 10s
-    warn: $this > (($status == $WARNING) ? (0) : (15))
-   delay: down 5m multiplier 1.5 max 2h
-    info: messages which expired before delivery in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (15))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of messages which expired before delivery in the last minute
       to: sysadmin
 
 template: vernemq_queue_message_unhandled
       on: vernemq.queue_undelivered_messages
   lookup: sum -1m unaligned absolute of queue_message_unhandled
    units: unhandled messages
-   every: 10s
-    warn: $this > (($status == $WARNING) ? (0) : (5))
-   delay: down 5m multiplier 1.5 max 2h
-    info: unhandled messages (connections with clean session=true) in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of unhandled messages (connections with clean session=true) in the last minute
       to: sysadmin
 
 # Erlang VM
@@ -66,19 +66,19 @@ template: vernemq_average_scheduler_utilization
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: average scheduler utilization for the last 10 minutes
+    info: average scheduler utilization over the last 10 minutes
       to: sysadmin
 
 # Cluster communication and netsplits
 
 template: vernemq_cluster_dropped
       on: vernemq.cluster_dropped
-  lookup: average -1m unaligned
-   units: KiB/s
-   every: 10s
+  lookup: sum -1m unaligned
+   units: KiB
+   every: 1m
     warn: $this > 0
-   delay: down 5m multiplier 1.5 max 1h
-    info: the amount of traffic dropped during communication with the cluster nodes in the last minute
+   delay: up 5m down 5m multiplier 1.5 max 1h
+    info: amount of traffic dropped during communication with the cluster nodes in the last minute
       to: sysadmin
 
 template: vernemq_netsplits
@@ -88,68 +88,41 @@ template: vernemq_netsplits
    every: 10s
     warn: $this > 0
    delay: down 5m multiplier 1.5 max 2h
-    info: detected netsplits in the last minute
+    info: number of detected netsplits (split brain situation) in the last minute
       to: sysadmin
 
 # Unsuccessful CONNACK
 
-template: vernemq_mqtt_connack_sent_reason_success
-      on: vernemq.mqtt_connack_sent_reason
-  lookup: sum -1m unaligned absolute match-names of success
-   units: packets
-   every: 10s
-    info: successful v3/v5 CONNACK sent in the last minute
-      to: sysadmin
-
 template: vernemq_mqtt_connack_sent_reason_unsuccessful
       on: vernemq.mqtt_connack_sent_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_connack_sent_reason_success
+  lookup: sum -1m unaligned absolute match-names of !success,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unsuccessful v3/v5 CONNACK sent in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of sent unsuccessful v3/v5 CONNACK packets in the last minute
       to: sysadmin
 
 # Not normal DISCONNECT
 
-template: vernemq_mqtt_disconnect_received_reason_normal_disconnect
-      on: vernemq.mqtt_disconnect_received_reason
-  lookup: sum -1m unaligned absolute match-names of normal_disconnect
-   units: packets
-   every: 10s
-    info: normal v5 DISCONNECT received in the last minute
-      to: sysadmin
-
-template: vernemq_mqtt_disconnect_sent_reason_normal_disconnect
-      on: vernemq.mqtt_disconnect_sent_reason
-  lookup: sum -1m unaligned absolute match-names of normal_disconnect
-   units: packets
-   every: 10s
-    info: normal v5 DISCONNECT sent in the last minute
-      to: sysadmin
-
 template: vernemq_mqtt_disconnect_received_reason_not_normal
       on: vernemq.mqtt_disconnect_received_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_disconnect_received_reason_normal_disconnect
+  lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: not normal v5 DISCONNECT received in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of received not normal v5 DISCONNECT packets in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_disconnect_sent_reason_not_normal
       on: vernemq.mqtt_disconnect_sent_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_disconnect_sent_reason_normal_disconnect
+  lookup: sum -1m unaligned absolute match-names of !normal_disconnect,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: not normal v5 DISCONNECT sent in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of sent not normal v5 DISCONNECT packets in the last minute
       to: sysadmin
 
 # SUBSCRIBE errors and unauthorized attempts
@@ -158,20 +131,20 @@ template: vernemq_mqtt_subscribe_error
       on: vernemq.mqtt_subscribe_error
   lookup: sum -1m unaligned absolute
    units: failed ops
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: failed v3/v5 SUBSCRIBE operations in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of failed v3/v5 SUBSCRIBE operations in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_subscribe_auth_error
       on: vernemq.mqtt_subscribe_auth_error
   lookup: sum -1m unaligned absolute
    units: attempts
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unauthorized v3/v5 SUBSCRIBE attempts in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of unauthorized v3/v5 SUBSCRIBE attempts in the last minute
       to: sysadmin
 
 # UNSUBSCRIBE errors
@@ -180,10 +153,10 @@ template: vernemq_mqtt_unsubscribe_error
       on: vernemq.mqtt_unsubscribe_error
   lookup: sum -1m unaligned absolute
    units: failed ops
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: failed v3/v5 UNSUBSCRIBE operations in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of failed v3/v5 UNSUBSCRIBE operations in the last minute
       to: sysadmin
 
 # PUBLISH errors and unauthorized attempts
@@ -192,208 +165,136 @@ template: vernemq_mqtt_publish_errors
       on: vernemq.mqtt_publish_errors
   lookup: sum -1m unaligned absolute
    units: failed ops
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: failed v3/v5 PUBLISH operations in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of failed v3/v5 PUBLISH operations in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_publish_auth_errors
       on: vernemq.mqtt_publish_auth_errors
   lookup: sum -1m unaligned absolute
    units: attempts
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unauthorized v3/v5 PUBLISH attempts in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of unauthorized v3/v5 PUBLISH attempts in the last minute
       to: sysadmin
 
 # Unsuccessful and unexpected PUBACK
 
-template: vernemq_mqtt_puback_received_reason_success
-      on: vernemq.mqtt_puback_received_reason
-  lookup: sum -1m unaligned absolute match-names of success
-   units: packets
-   every: 10s
-    info: successful v5 PUBACK received in the last minute
-      to: sysadmin
-
-template: vernemq_mqtt_puback_sent_reason_success
-      on: vernemq.mqtt_puback_sent_reason
-  lookup: sum -1m unaligned absolute match-names of success
-   units: packets
-   every: 10s
-    info: successful v5 PUBACK sent in the last minute
-      to: sysadmin
-
 template: vernemq_mqtt_puback_received_reason_unsuccessful
       on: vernemq.mqtt_puback_received_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_puback_received_reason_success
+  lookup: sum -1m unaligned absolute match-names of !success,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unsuccessful v5 PUBACK received in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of received unsuccessful v5 PUBACK packets in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_puback_sent_reason_unsuccessful
       on: vernemq.mqtt_puback_sent_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_puback_sent_reason_success
+  lookup: sum -1m unaligned absolute match-names of !success,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unsuccessful v5 PUBACK sent in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of sent unsuccessful v5 PUBACK packets in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_puback_unexpected
       on: vernemq.mqtt_puback_invalid_error
   lookup: sum -1m unaligned absolute
    units: messages
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unexpected v3/v5 PUBACK received in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of received unexpected v3/v5 PUBACK packets in the last minute
       to: sysadmin
 
 # Unsuccessful and unexpected PUBREC
 
-template: vernemq_mqtt_pubrec_received_reason_success
-      on: vernemq.mqtt_pubrec_received_reason
-  lookup: sum -1m unaligned absolute match-names of success
-   units: packets
-   every: 10s
-    info: successful v5 PUBREC received in the last minute
-      to: sysadmin
-
-template: vernemq_mqtt_pubrec_sent_reason_success
-      on: vernemq.mqtt_pubrec_sent_reason
-  lookup: sum -1m unaligned absolute match-names of success
-   units: packets
-   every: 10s
-    info: successful v5 PUBREC sent in the last minute
-      to: sysadmin
-
 template: vernemq_mqtt_pubrec_received_reason_unsuccessful
       on: vernemq.mqtt_pubrec_received_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_pubrec_received_reason_success
+  lookup: sum -1m unaligned absolute match-names of !success,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unsuccessful v5 PUBREC received in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of received unsuccessful v5 PUBREC packets in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_pubrec_sent_reason_unsuccessful
       on: vernemq.mqtt_pubrec_sent_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_pubrec_sent_reason_success
+  lookup: sum -1m unaligned absolute match-names of !success,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unsuccessful v5 PUBREC sent in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of sent unsuccessful v5 PUBREC packets in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_pubrec_invalid_error
       on: vernemq.mqtt_pubrec_invalid_error
   lookup: sum -1m unaligned absolute
    units: messages
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unexpected v3 PUBREC received in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of received unexpected v3 PUBREC packets in the last minute
       to: sysadmin
 
 # Unsuccessful PUBREL
 
-template: vernemq_mqtt_pubrel_received_reason_success
-      on: vernemq.mqtt_pubrel_received_reason
-  lookup: sum -1m unaligned absolute match-names of success
-   units: packets
-   every: 10s
-    info: successful v5 PUBREL received in the last minute
-      to: sysadmin
-
-template: vernemq_mqtt_pubrel_sent_reason_success
-      on: vernemq.mqtt_pubrel_sent_reason
-  lookup: sum -1m unaligned absolute match-names of success
-   units: packets
-   every: 10s
-    info: successful v5 PUBREL sent in the last minute
-      to: sysadmin
-
 template: vernemq_mqtt_pubrel_received_reason_unsuccessful
       on: vernemq.mqtt_pubrel_received_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_pubrel_received_reason_success
+  lookup: sum -1m unaligned absolute match-names of !success,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unsuccessful v5 PUBREL received in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of received unsuccessful v5 PUBREL packets in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_pubrel_sent_reason_unsuccessful
       on: vernemq.mqtt_pubrel_sent_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_pubrel_sent_reason_success
+  lookup: sum -1m unaligned absolute match-names of !success,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unsuccessful v5 PUBREL sent in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of sent unsuccessful v5 PUBREL packets in the last minute
       to: sysadmin
 
 # Unsuccessful and unexpected PUBCOMP
 
-template: vernemq_mqtt_pubcomp_received_reason_success
-      on: vernemq.mqtt_pubcomp_received_reason
-  lookup: sum -1m unaligned absolute match-names of success
-   units: packets
-   every: 10s
-    info: successful v5 PUBCOMP received in the last minute
-      to: sysadmin
-
-template: vernemq_mqtt_pubcomp_sent_reason_success
-      on: vernemq.mqtt_pubcomp_sent_reason
-  lookup: sum -1m unaligned absolute match-names of success
-   units: packets
-   every: 10s
-    info: successful v5 PUBCOMP sent in the last minute
-      to: sysadmin
-
 template: vernemq_mqtt_pubcomp_received_reason_unsuccessful
       on: vernemq.mqtt_pubcomp_received_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_pubcomp_received_reason_success
+  lookup: sum -1m unaligned absolute match-names of !success,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unsuccessful v5 PUBCOMP received in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of received unsuccessful v5 PUBCOMP packets in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_pubcomp_sent_reason_unsuccessful
       on: vernemq.mqtt_pubcomp_sent_reason
-  lookup: sum -1m unaligned absolute
-    calc: $this - $vernemq_mqtt_pubcomp_sent_reason_success
+  lookup: sum -1m unaligned absolute match-names of !success,*
    units: packets
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unsuccessful v5 PUBCOMP sent in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of sent unsuccessful v5 PUBCOMP packets in the last minute
       to: sysadmin
 
 template: vernemq_mqtt_pubcomp_unexpected
       on: vernemq.mqtt_pubcomp_invalid_error
   lookup: sum -1m unaligned absolute
    units: messages
-   every: 10s
-    warn: $this > 0
-   delay: down 5m multiplier 1.5 max 2h
-    info: unexpected v3/v5 PUBCOMP received in the last minute
+   every: 1m
+    warn: $this > (($status >= $WARNING) ? (0) : (5))
+   delay: up 5m down 5m multiplier 1.5 max 2h
+    info: number of received unexpected v3/v5 PUBCOMP packets in the last minute
       to: sysadmin
diff --git a/health/health.d/vsphere.conf b/health/health.d/vsphere.conf
index d8b2be19..3e1414c1 100644
--- a/health/health.d/vsphere.conf
+++ b/health/health.d/vsphere.conf
@@ -13,7 +13,7 @@ template: vsphere_vm_mem_usage
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: used RAM
+    info: virtual machine memory utilization
 
 # -----------------------------------------------HOST Specific----------------------------------------------------------
 # Memory
@@ -27,7 +27,7 @@ template: vsphere_host_mem_usage
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: used RAM
+    info: host memory utilization
 
 # Network errors
 
@@ -38,10 +38,7 @@ families: *
   lookup: sum -10m unaligned absolute match-names of rx
    units: packets
    every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: interface inbound dropped packets in the last 10 minutes
-      to: sysadmin
+    info: number of inbound errors for the network interface in the last 10 minutes
 
 template: vsphere_outbound_packets_errors
       on: vsphere.net_errors_total
@@ -50,10 +47,7 @@ families: *
   lookup: sum -10m unaligned absolute match-names of tx
    units: packets
    every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: interface outbound dropped packets in the last 10 minutes
-      to: sysadmin
+    info: number of outbound errors for the network interface in the last 10 minutes
 
 # Network errors ratio
 
@@ -62,13 +56,12 @@ template: vsphere_inbound_packets_errors_ratio
    hosts: *
 families: *
   lookup: sum -10m unaligned absolute match-names of rx
-    calc: (($vsphere_inbound_packets_errors != nan AND $this > 0) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
+    calc: (($vsphere_inbound_packets_errors != nan AND $this > 1000) ? ($vsphere_inbound_packets_errors * 100 / $this) : (0))
    units: %
    every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
-    info: the ratio of inbound errors vs the total number of received packets of the network interface, during the last 10 minutes
+    warn: $this >= 2
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: ratio of inbound errors for the network interface over the last 10 minutes
       to: sysadmin
 
 template: vsphere_outbound_packets_errors_ratio
@@ -76,13 +69,12 @@ template: vsphere_outbound_packets_errors_ratio
    hosts: *
 families: *
   lookup: sum -10m unaligned absolute match-names of tx
-    calc: (($vsphere_outbound_packets_errors != nan AND $this > 0) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
+    calc: (($vsphere_outbound_packets_errors != nan AND $this > 1000) ? ($vsphere_outbound_packets_errors * 100 / $this) : (0))
    units: %
    every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
-    info: the ratio of outbound errors vs the total number of sent packets of the network interface, during the last 10 minutes
+    warn: $this >= 2
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: ratio of outbound errors for the network interface over the last 10 minutes
       to: sysadmin
 
 # -----------------------------------------------Common-------------------------------------------------------------------
@@ -97,7 +89,7 @@ template: vsphere_cpu_usage
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: cpu utilization for the last 10 minutes
+    info: average CPU utilization
       to: sysadmin
 
 # Network drops
@@ -109,10 +101,7 @@ families: *
   lookup: sum -10m unaligned absolute match-names of rx
    units: packets
    every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: interface inbound dropped packets in the last 10 minutes
-      to: sysadmin
+    info: number of inbound dropped packets for the network interface in the last 10 minutes
 
 template: vsphere_outbound_packets_dropped
       on: vsphere.net_drops_total
@@ -121,10 +110,7 @@ families: *
   lookup: sum -10m unaligned absolute match-names of tx
    units: packets
    every: 1m
-    warn: $this >= 5
-   delay: down 1h multiplier 1.5 max 2h
-    info: interface outbound dropped packets in the last 10 minutes
-      to: sysadmin
+    info: number of outbound dropped packets for the network interface in the last 10 minutes
 
 # Network drops ratio
 
@@ -133,13 +119,12 @@ template: vsphere_inbound_packets_dropped_ratio
    hosts: *
 families: *
   lookup: sum -10m unaligned absolute match-names of rx
-    calc: (($vsphere_inbound_packets_dropped != nan AND $this > 0) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
+    calc: (($vsphere_inbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_inbound_packets_dropped * 100 / $this) : (0))
    units: %
    every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
-    info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
+    warn: $this >= 2
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: ratio of inbound dropped packets for the network interface over the last 10 minutes
       to: sysadmin
 
 template: vsphere_outbound_packets_dropped_ratio
@@ -147,11 +132,10 @@ template: vsphere_outbound_packets_dropped_ratio
    hosts: *
 families: *
   lookup: sum -10m unaligned absolute match-names of tx
-    calc: (($vsphere_outbound_packets_dropped != nan AND $this > 0) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
+    calc: (($vsphere_outbound_packets_dropped != nan AND $this > 1000) ? ($vsphere_outbound_packets_dropped * 100 / $this) : (0))
    units: %
    every: 1m
-    warn: $this >= 0.1
-    crit: $this >= 2
-   delay: down 1h multiplier 1.5 max 2h
-    info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
+    warn: $this >= 2
+   delay: up 1m down 1h multiplier 1.5 max 2h
+    info: ratio of outbound dropped packets for the network interface over the last 10 minutes
       to: sysadmin
diff --git a/health/health.d/web_log.conf b/health/health.d/web_log.conf
index 44de38a4..0b01990c 100644
--- a/health/health.d/web_log.conf
+++ b/health/health.d/web_log.conf
@@ -31,7 +31,7 @@ families: *
     calc: ($this == 0)?(1):($this)
    units: requests
    every: 10s
-    info: the sum of all HTTP requests over the last minute
+    info: number of HTTP requests in the last minute
 
 template: 1m_successful
       on: web_log.response_statuses
@@ -43,7 +43,7 @@ families: *
     warn: ($1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
     crit: ($1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
+    info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
       to: webmaster
 
 template: 1m_redirects
@@ -56,7 +56,7 @@ families: *
     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
     crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP redirects (3xx except 304) over the last minute
+    info: ratio of redirection HTTP requests over the last minute (3xx except 304)
       to: webmaster
 
 template: 1m_bad_requests
@@ -69,7 +69,7 @@ families: *
     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
     crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP bad requests (4xx except 401) over the last minute
+    info: ratio of client error HTTP requests over the last minute (4xx except 401)
       to: webmaster
 
 template: 1m_internal_errors
@@ -82,7 +82,7 @@ families: *
     warn: ($1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
     crit: ($1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP internal server errors (5xx), over the last minute
+    info: ratio of server error HTTP requests over the last minute (5xx)
       to: webmaster
 
 # unmatched lines
@@ -101,10 +101,10 @@ families: *
     calc: ($this == 0)?(1):($this)
    units: requests
    every: 10s
-    info: the sum of all HTTP requests over the last minute
+    info: number of HTTP requests over the last minute
 
 template: 1m_unmatched
-on: web_log.response_codes
+      on: web_log.response_codes
 families: *
   lookup: sum -1m unaligned of unmatched
     calc: $this * 100 / $1m_total_requests
@@ -112,7 +112,7 @@ families: *
    every: 10s
     warn: ($1m_total_requests > 120) ? ($this > 1) : ( 0 )
    delay: up 1m down 5m multiplier 1.5 max 1h
-    info: the ratio of unmatched lines, over the last minute
+    info: percentage of unparsed log lines over the last minute
       to: webmaster
 
 # -----------------------------------------------------------------------------
@@ -131,7 +131,7 @@ families: *
   lookup: average -10m unaligned of avg
    units: ms
    every: 30s
-    info: the average time to respond to HTTP requests, over the last 10 minutes
+    info: average HTTP response time over the last 10 minutes
 
 template: web_slow
       on: web_log.response_time
@@ -144,7 +144,7 @@ families: *
     warn: ($1m_requests > 120) ? ($this > $green && $this > ($10m_response_time * 2) ) : ( 0 )
     crit: ($1m_requests > 120) ? ($this > $red   && $this > ($10m_response_time * 4) ) : ( 0 )
    delay: down 15m multiplier 1.5 max 1h
-    info: the average time to respond to HTTP requests, over the last 1 minute
+    info: average HTTP response time over the last minute
  options: no-clear-notification
       to: webmaster
 
@@ -165,7 +165,7 @@ families: *
   lookup: average -5m at -5m unaligned of successful_requests
    units: requests/s
    every: 30s
-    info: average rate of successful HTTP requests over the last 5 minutes
+    info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
 
 template: 5m_successful
       on: web_log.response_statuses
@@ -173,7 +173,7 @@ families: *
   lookup: average -5m unaligned of successful_requests
    units: requests/s
    every: 30s
-    info: average successful HTTP requests over the last 5 minutes
+    info: average number of successful HTTP requests over the last 5 minutes
 
 template: 5m_requests_ratio
       on: web_log.response_codes
@@ -185,7 +185,7 @@ families: *
     crit: ($5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
    delay: down 15m multiplier 1.5 max 1h
 options: no-clear-notification
-    info: the percentage of successful web requests over the last 5 minutes, \
+    info: ratio of successful HTTP requests over the last 5 minutes, \
           compared with the previous 5 minutes \
           (clear notification for this alarm will not be sent)
       to: webmaster
@@ -224,7 +224,7 @@ families: *
     calc: ($this == 0)?(1):($this)
    units: requests
    every: 10s
-    info: the sum of all HTTP requests over the last minute
+    info: number of HTTP requests in the last minute
 
 template: web_log_1m_unmatched
       on: web_log.excluded_requests
@@ -235,7 +235,7 @@ families: *
    every: 10s
     warn: ($web_log_1m_total_requests > 120) ? ($this > 1) : ( 0 )
    delay: up 1m down 5m multiplier 1.5 max 1h
-    info: the ratio of unmatched lines, over the last minute
+    info: percentage of unparsed log lines over the last minute
       to: webmaster
 
 # -----------------------------------------------------------------------------
@@ -255,7 +255,7 @@ families: *
     calc: ($this == 0)?(1):($this)
    units: requests
    every: 10s
-    info: the sum of all HTTP requests over the last minute
+    info: number of HTTP requests in the last minute
 
 template: web_log_1m_successful
       on: web_log.type_requests
@@ -267,7 +267,7 @@ families: *
     warn: ($web_log_1m_requests > 120) ? ($this < (($status >= $WARNING ) ? ( 95 ) : ( 85 )) ) : ( 0 )
     crit: ($web_log_1m_requests > 120) ? ($this < (($status == $CRITICAL) ? ( 85 ) : ( 75 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of successful HTTP responses (1xx, 2xx, 304, 401) over the last minute
+    info: ratio of successful HTTP requests over the last minute (1xx, 2xx, 304, 401)
       to: webmaster
 
 template: web_log_1m_redirects
@@ -280,7 +280,7 @@ families: *
     warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING ) ? (  1 ) : ( 20 )) ) : ( 0 )
     crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 20 ) : ( 30 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP redirects (3xx except 304) over the last minute
+    info: ratio of redirection HTTP requests over the last minute (3xx except 304)
       to: webmaster
 
 template: web_log_1m_bad_requests
@@ -293,7 +293,7 @@ families: *
     warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 10 ) : ( 30 )) ) : ( 0 )
     crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 30 ) : ( 50 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP bad requests (4xx except 401) over the last minute
+    info: ratio of client error HTTP requests over the last minute (4xx except 401)
       to: webmaster
 
 template: web_log_1m_internal_errors
@@ -306,7 +306,7 @@ families: *
     warn: ($web_log_1m_requests > 120) ? ($this > (($status >= $WARNING)  ? ( 1 ) : ( 2 )) ) : ( 0 )
     crit: ($web_log_1m_requests > 120) ? ($this > (($status == $CRITICAL) ? ( 2 ) : ( 5 )) ) : ( 0 )
    delay: up 2m down 15m multiplier 1.5 max 1h
-    info: the ratio of HTTP internal server errors (5xx), over the last minute
+    info: ratio of server error HTTP requests over the last minute (5xx)
       to: webmaster
 
 # -----------------------------------------------------------------------------
@@ -325,7 +325,7 @@ families: *
   lookup: average -10m unaligned of avg
    units: ms
    every: 30s
-    info: the average time to respond to HTTP requests, over the last 10 minutes
+    info: average HTTP response time over the last 10 minutes
 
 template: web_log_web_slow
       on: web_log.request_processing_time
@@ -338,7 +338,7 @@ families: *
     warn: ($web_log_1m_requests > 120) ? ($this > $green && $this > ($web_log_10m_response_time * 2) ) : ( 0 )
     crit: ($web_log_1m_requests > 120) ? ($this > $red   && $this > ($web_log_10m_response_time * 4) ) : ( 0 )
    delay: down 15m multiplier 1.5 max 1h
-    info: the average time to respond to HTTP requests, over the last 1 minute
+    info: average HTTP response time over the last 1 minute
  options: no-clear-notification
       to: webmaster
 
@@ -359,7 +359,7 @@ families: *
   lookup: average -5m at -5m unaligned of success
    units: requests/s
    every: 30s
-    info: average rate of successful HTTP requests over the last 5 minutes
+    info: average number of successful HTTP requests for the 5 minutes starting 10 minutes ago
 
 template: web_log_5m_successful
       on: web_log.type_requests
@@ -367,7 +367,7 @@ families: *
   lookup: average -5m unaligned of success
    units: requests/s
    every: 30s
-    info: average successful HTTP requests over the last 5 minutes
+    info: average number of successful HTTP requests over the last 5 minutes
 
 template: web_log_5m_requests_ratio
       on: web_log.type_requests
@@ -379,7 +379,7 @@ families: *
     crit: ($web_log_5m_successful_old > 120) ? ($this > 400 OR $this < 25) : (0)
    delay: down 15m multiplier 1.5 max 1h
 options: no-clear-notification
-    info: the percentage of successful web requests over the last 5 minutes, \
+    info: ratio of successful HTTP requests over the last 5 minutes, \
           compared with the previous 5 minutes \
           (clear notification for this alarm will not be sent)
       to: webmaster
diff --git a/health/health.d/whoisquery.conf b/health/health.d/whoisquery.conf
index 275e11dd..36ae02fa 100644
--- a/health/health.d/whoisquery.conf
+++ b/health/health.d/whoisquery.conf
@@ -20,5 +20,5 @@ template: whoisquery_days_until_expiration
    every: 60s
     warn: $this < $days_until_expiration_warning*24*60*60
     crit: $this < $days_until_expiration_critical*24*60*60
-    info: domain time until expiration
+    info: time until the domain name registration expires
       to: webmaster
diff --git a/health/health.d/wmi.conf b/health/health.d/wmi.conf
index 0441fc1f..f1f71a60 100644
--- a/health/health.d/wmi.conf
+++ b/health/health.d/wmi.conf
@@ -26,7 +26,7 @@ template: wmi_10min_cpu_usage
     warn: $this > (($status >= $WARNING)  ? (75) : (85))
     crit: $this > (($status == $CRITICAL) ? (85) : (95))
    delay: down 15m multiplier 1.5 max 1h
-    info: cpu utilization for the last 10 minutes
+    info: average CPU utilization over the last 10 minutes
       to: sysadmin
 
 
@@ -42,7 +42,7 @@ template: wmi_ram_in_use
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: used RAM
+    info: memory utilization
       to: sysadmin
 
 template: wmi_swap_in_use
@@ -55,13 +55,13 @@ template: wmi_swap_in_use
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: used Swap
+    info: swap memory utilization
       to: sysadmin
 
 
 ## Network
 
-template: inbound_packets_discarded
+template: wmi_inbound_packets_discarded
       on: wmi.net_discarded
       os: linux
    hosts: *
@@ -71,10 +71,10 @@ families: *
    every: 1m
     warn: $this >= 5
    delay: down 1h multiplier 1.5 max 2h
-    info: interface inbound discarded packets in the last 10 minutes
+    info: number of inbound discarded packets for the network interface in the last 10 minutes
       to: sysadmin
 
-template: outbound_packets_discarded
+template: wmi_outbound_packets_discarded
       on: wmi.net_discarded
       os: linux
    hosts: *
@@ -84,10 +84,10 @@ families: *
    every: 1m
     warn: $this >= 5
    delay: down 1h multiplier 1.5 max 2h
-    info: interface outbound discarded packets in the last 10 minutes
+    info: number of outbound discarded packets for the network interface in the last 10 minutes
       to: sysadmin
 
-template: inbound_packets_errors
+template: wmi_inbound_packets_errors
       on: wmi.net_errors
       os: linux
    hosts: *
@@ -97,10 +97,10 @@ families: *
    every: 1m
     warn: $this >= 5
    delay: down 1h multiplier 1.5 max 2h
-    info: interface inbound errors in the last 10 minutes
+    info: number of inbound errors for the network interface in the last 10 minutes
       to: sysadmin
 
-template: outbound_packets_errors
+template: wmi_outbound_packets_errors
       on: wmi.net_errors
       os: linux
    hosts: *
@@ -110,7 +110,7 @@ families: *
    every: 1m
     warn: $this >= 5
    delay: down 1h multiplier 1.5 max 2h
-    info: interface outbound errors in the last 10 minutes
+    info: number of outbound errors for the network interface in the last 10 minutes
       to: sysadmin
 
 
@@ -126,5 +126,5 @@ template: wmi_disk_in_use
     warn: $this > (($status >= $WARNING)  ? (80) : (90))
     crit: $this > (($status == $CRITICAL) ? (90) : (98))
    delay: down 15m multiplier 1.5 max 1h
-    info: used disk space
+    info: disk space utilization
       to: sysadmin
diff --git a/health/health.d/x509check.conf b/health/health.d/x509check.conf
index dfca3770..f2e4a050 100644
--- a/health/health.d/x509check.conf
+++ b/health/health.d/x509check.conf
@@ -20,7 +20,7 @@ template: x509check_days_until_expiration
    every: 60s
     warn: $this < $days_until_expiration_warning*24*60*60
     crit: $this < $days_until_expiration_critical*24*60*60
-    info: certificate time until expiration
+    info: time until x509 certificate expires
       to: webmaster
       
 template: x509check_revocation_status
@@ -28,5 +28,5 @@ template: x509check_revocation_status
     calc: $revoked
    every: 60s
     crit: $this != nan AND $this != 0
-    info: certificate revocation status
+    info: x509 certificate revocation status (0: valid, 1: revoked)
       to: webmaster
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
index af73824e..74f96dd3 100644
--- a/health/health.d/zfs.conf
+++ b/health/health.d/zfs.conf
@@ -6,5 +6,5 @@
    every: 1m
     warn: $this > 0
    delay: down 1h multiplier 1.5 max 2h
-    info: the number of times ZFS had to limit the ARC growth in the last 10 minutes
+    info: number of times ZFS had to limit the ARC growth in the last 10 minutes
       to: sysadmin
diff --git a/health/health.h b/health/health.h
index 5281e16e..07ce1311 100644
--- a/health/health.h
+++ b/health/health.h
@@ -64,7 +64,7 @@ extern int health_variable_lookup(const char *variable, uint32_t hash, RRDCALC *
 extern void health_aggregate_alarms(RRDHOST *host, BUFFER *wb, BUFFER* context, RRDCALC_STATUS status);
 extern void health_alarms2json(RRDHOST *host, BUFFER *wb, int all);
 extern void health_alarms_values2json(RRDHOST *host, BUFFER *wb, int all);
-extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after);
+extern void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart);
 
 void health_api_v1_chart_variables2json(RRDSET *st, BUFFER *buf);
 void health_api_v1_chart_custom_variables2json(RRDSET *st, BUFFER *buf);
diff --git a/health/health_config.c b/health/health_config.c
index 1acf3693..e24acf77 100644
--- a/health/health_config.c
+++ b/health/health_config.c
@@ -384,7 +384,7 @@ static inline int health_parse_db_lookup(
     }
 
     // sane defaults
-    *every = abs(*after);
+    *every = ABS(*after);
 
     // now we may have optional parameters
     while(*s) {
diff --git a/health/health_json.c b/health/health_json.c
index 7b5a1e3c..2a81d1c0 100644
--- a/health/health_json.c
+++ b/health/health_json.c
@@ -2,7 +2,7 @@
 
 #include "health.h"
 
-static inline void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
+void health_string2json(BUFFER *wb, const char *prefix, const char *label, const char *value, const char *suffix) {
     if(value && *value) {
         buffer_sprintf(wb, "%s\"%s\":\"", prefix, label);
         buffer_strcat_htmlescape(wb, value);
@@ -13,7 +13,7 @@ static inline void health_string2json(BUFFER *wb, const char *prefix, const char
         buffer_sprintf(wb, "%s\"%s\":null%s", prefix, label, suffix);
 }
 
-inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
+void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST *host) {
     buffer_sprintf(wb,
             "\n\t{\n"
                     "\t\t\"hostname\": \"%s\",\n"
@@ -93,18 +93,22 @@ inline void health_alarm_entry2json_nolock(BUFFER *wb, ALARM_ENTRY *ae, RRDHOST
     buffer_strcat(wb, "\t}");
 }
 
-void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after) {
+void health_alarm_log2json(RRDHOST *host, BUFFER *wb, uint32_t after, char *chart) {
     netdata_rwlock_rdlock(&host->health_log.alarm_log_rwlock);
 
     buffer_strcat(wb, "[");
 
     unsigned int max = host->health_log.max;
     unsigned int count = 0;
+    uint32_t hash_chart = 0;
+    if (chart) hash_chart = simple_hash(chart);
     ALARM_ENTRY *ae;
-    for(ae = host->health_log.alarms; ae && count < max ; count++, ae = ae->next) {
-        if(ae->unique_id > after) {
-            if(likely(count)) buffer_strcat(wb, ",");
+    for (ae = host->health_log.alarms; ae && count < max; ae = ae->next) {
+        if ((ae->unique_id > after) && (!chart || (ae->hash_chart == hash_chart && !strcmp(ae->chart, chart)))) {
+            if (likely(count))
+                buffer_strcat(wb, ",");
             health_alarm_entry2json_nolock(wb, ae, host);
+            count++;
         }
     }
 
@@ -298,6 +302,9 @@ static void health_alarms2json_fill_alarms(RRDHOST *host, BUFFER *wb, int all, v
         if(unlikely(!rc->rrdset || !rc->rrdset->last_collected_time.tv_sec))
             continue;
 
+        if (unlikely(!rrdset_is_available_for_exporting_and_alarms(rc->rrdset)))
+            continue;
+
         if(likely(!all && !(rc->status == RRDCALC_STATUS_WARNING || rc->status == RRDCALC_STATUS_CRITICAL)))
             continue;
 
diff --git a/health/health_log.c b/health/health_log.c
index 8c0bc5c3..3205f592 100644
--- a/health/health_log.c
+++ b/health/health_log.c
@@ -213,8 +213,8 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
         if(likely(*pointers[0] == 'U' || *pointers[0] == 'A')) {
             ALARM_ENTRY *ae = NULL;
 
-            if(entries < 26) {
-                error("HEALTH [%s]: line %zu of file '%s' should have at least 26 entries, but it has %d. Ignoring it.", host->hostname, line, filename, entries);
+            if(entries < 27) {
+                error("HEALTH [%s]: line %zu of file '%s' should have at least 27 entries, but it has %d. Ignoring it.", host->hostname, line, filename, entries);
                 errored++;
                 continue;
             }
@@ -243,7 +243,7 @@ static inline ssize_t health_alarm_log_read(RRDHOST *host, FILE *fp, const char
                 RRDCALC *rc = alarm_max_last_repeat(host, alarm_name,simple_hash(alarm_name));
                 if (!rc) {
                     for(rc = host->alarms; rc ; rc = rc->next) {
-                        RRDCALC *rdcmp  = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl *)rc);
+                        RRDCALC *rdcmp  = (RRDCALC *) avl_insert_lock(&(host)->alarms_idx_name, (avl_t *)rc);
                         if(rdcmp != rc) {
                             error("Cannot insert the alarm index ID using log %s", rc->name);
                         }
diff --git a/health/notifications/alarm-notify.sh.in b/health/notifications/alarm-notify.sh.in
index 3bf8db5f..bf6c0281 100755
--- a/health/notifications/alarm-notify.sh.in
+++ b/health/notifications/alarm-notify.sh.in
@@ -209,6 +209,9 @@ if [[ ${1} = "unittest" ]]; then
   cfgfile="${3}"    # the location of the config file to use for unit testing
   status="${4}"     # the current status : REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
   old_status="${5}" # the previous status: REMOVED, UNINITIALIZED, UNDEFINED, CLEAR, WARNING, CRITICAL
+elif [[ ${1} = "dump_methods" ]]; then
+    dump_methods=1
+    status="WARNING"
 else
   roles="${1}"               # the roles that should be notified for this event
   args_host="${2}"           # the host generated this event
@@ -372,6 +375,7 @@ EMAIL_PLAINTEXT_ONLY=
 IRC_NICKNAME=
 IRC_REALNAME=
 IRC_NETWORK=
+IRC_PORT=6667
 
 # hangouts configs
 declare -A HANGOUTS_WEBHOOK_URI
@@ -549,6 +553,15 @@ filter_recipient_by_criticality() {
 # check stackpulse
 [ -z "${STACKPULSE_WEBHOOK}" ] && SEND_STACKPULSE="NO"
 
+# check msteam
+[ -z "${MSTEAM_WEBHOOK_URL}" ] && SEND_MSTEAM="NO"
+
+# check pd
+[ -z "${DEFAULT_RECIPIENT_PD}" ] && SEND_PD="NO"
+
+# check prowl
+[ -z "${DEFAULT_RECIPIENT_PROWL}" ] && SEND_PROWL="NO"
+
 if [ "${SEND_PUSHOVER}" = "YES" ] ||
   [ "${SEND_SLACK}" = "YES" ] ||
   [ "${SEND_ROCKETCHAT}" = "YES" ] ||
@@ -639,6 +652,15 @@ if [ "${SEND_AWSSNS}" = "YES" ] && [ -z "${aws}" ]; then
   fi
 fi
 
+if [ ${dump_methods} ]; then
+    for name in "${!SEND_@}"; do
+        if [ "${!name}" = "YES" ]; then
+            echo "$name"
+        fi
+    done
+    exit
+fi
+
 # -----------------------------------------------------------------------------
 # find the recipients' addresses per method
 
@@ -864,14 +886,15 @@ send_email() {
       echo >&2 "--- END sendmail command ---"
     fi
 
-    "${sendmail}" -t "${opts[@]}"
+    local cmd_output
+    cmd_output=$("${sendmail}" -t "${opts[@]}" 2>&1)
     ret=$?
 
     if [ ${ret} -eq 0 ]; then
       info "sent email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}'"
       return 0
     else
-      error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret}."
+      error "failed to send email notification for: ${host} ${chart}.${name} is ${status} to '${to_email}' with error code ${ret} (${cmd_output})."
       return 1
     fi
   fi
@@ -1722,9 +1745,9 @@ send_prowl() {
 # irc sender
 
 send_irc() {
-  local NICKNAME="${1}" REALNAME="${2}" CHANNELS="${3}" NETWORK="${4}" SERVERNAME="${5}" MESSAGE="${6}" sent=0 channel color send_alarm reply_codes error
+  local NICKNAME="${1}" REALNAME="${2}" CHANNELS="${3}" NETWORK="${4}" PORT="${5}" SERVERNAME="${6}" MESSAGE="${7}" sent=0 channel color send_alarm reply_codes error
 
-  if [ "${SEND_IRC}" = "YES" ] && [ -n "${NICKNAME}" ] && [ -n "${REALNAME}" ] && [ -n "${CHANNELS}" ] && [ -n "${NETWORK}" ] && [ -n "${SERVERNAME}" ]; then
+  if [ "${SEND_IRC}" = "YES" ] && [ -n "${NICKNAME}" ] && [ -n "${REALNAME}" ] && [ -n "${CHANNELS}" ] && [ -n "${NETWORK}" ] && [ -n "${SERVERNAME}" ] && [ -n "${PORT}" ]; then
     case "${status}" in
     WARNING) color="warning" ;;
     CRITICAL) color="danger" ;;
@@ -1735,7 +1758,7 @@ send_irc() {
     SNDMESSAGE="${MESSAGE//$'\n'/", "}"
     for CHANNEL in ${CHANNELS}; do
       error=0
-      send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \  | nc "${NETWORK}" 6667)
+      send_alarm=$(echo -e "USER ${NICKNAME} guest ${REALNAME} ${SERVERNAME}\\nNICK ${NICKNAME}\\nJOIN ${CHANNEL}\\nPRIVMSG ${CHANNEL} :${SNDMESSAGE}\\nQUIT\\n" \  | nc "${NETWORK}" "${PORT}")
       reply_codes=$(echo "${send_alarm}" | cut -d ' ' -f 2 | grep -o '[0-9]*')
       for code in ${reply_codes}; do
         if [ "${code}" -ge 400 ] && [ "${code}" -le 599 ]; then
@@ -2465,7 +2488,7 @@ SENT_PROWL=$?
 # -----------------------------------------------------------------------------
 # send the irc message
 
-send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm}
+send_irc "${IRC_NICKNAME}" "${IRC_REALNAME}" "${to_irc}" "${IRC_NETWORK}" "${IRC_PORT}" "${host}" "${host} ${status_message} - ${name//_/ } - ${chart} ----- ${alarm}
 Severity: ${severity}
 Chart: ${chart}
 Family: ${family}
diff --git a/health/notifications/email/README.md b/health/notifications/email/README.md
index 827a9c0b..ebd7f4b8 100644
--- a/health/notifications/email/README.md
+++ b/health/notifications/email/README.md
@@ -43,7 +43,7 @@ You can always find the location of the alarm-notify.sh script in `netdata.conf`
 If you want an alternative to `sendmail` in order to have a simple MTA configuration for sending emails and auth to an existing SMTP server, you can do the following:
 
 - Install `msmtp`.
-- Modify the `sendmail` path in `health_alarm_notify.conf` to point to the location of `mstmp`:
+- Modify the `sendmail` path in `health_alarm_notify.conf` to point to the location of `msmtp`:
 ```
 # The full path to the sendmail command.
 # If empty, the system $PATH will be searched for it.
diff --git a/health/notifications/health_alarm_notify.conf b/health/notifications/health_alarm_notify.conf
index be669e13..2dab1d48 100755
--- a/health/notifications/health_alarm_notify.conf
+++ b/health/notifications/health_alarm_notify.conf
@@ -676,6 +676,10 @@ DEFAULT_RECIPIENT_IRC=""
 # e.g. "irc.freenode.net"
 IRC_NETWORK=""
 
+# The irc port to which a connection will occur.
+# e.g. 6667 (the default one), 6697 (a TLS/SSL one)
+IRC_PORT=6667
+
 # The irc nickname which is required to send the notification. It must not be
 # an already registered name as the connection's MODE is defined as a 'guest'.
 IRC_NICKNAME=""
diff --git a/health/notifications/stackpulse/README.md b/health/notifications/stackpulse/README.md
index 13d2f723..4c44954a 100644
--- a/health/notifications/stackpulse/README.md
+++ b/health/notifications/stackpulse/README.md
@@ -39,8 +39,9 @@ SEND_STACKPULSE="YES"
 STACKPULSE_WEBHOOK="https://hooks.stackpulse.io/v1/webhooks/YOUR_UNIQUE_ID"
 ```
 
-4.  Now [restart Netdata](/docs/getting-started.md#start-stop-and-restart-netdata). When your node creates an alarm, you
-    can see the associated notification on your StackPulse Administration Portal 
+4.  Now restart Netdata using `sudo systemctl restart netdata`, or the [appropriate
+    method](/docs/configure/start-stop-restart.md) for your system. When your node creates an alarm, you can see the
+    associated notification on your StackPulse Administration Portal.
 
 ## React to alarms with playbooks
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2021-03-31 12:59:21 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2021-03-31 12:59:21 +0000
commit	bb8713bbc1c4594366fc735c04910edbf4c61aab (patch)
tree	d7da56c0b89aa371dd8ad986995dd145fdf6670a /health
parent	Releasing debian version 1.29.3-4. (diff)
download	netdata-bb8713bbc1c4594366fc735c04910edbf4c61aab.tar.xz netdata-bb8713bbc1c4594366fc735c04910edbf4c61aab.zip