26 files changed, 249 insertions, 219 deletions
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index 3f92e80df..8492bb6c7 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -12,7 +12,7 @@ component: Disk
      info: number of times data was read from the cache, \
            the bucket was reused and invalidated in the last 10 minutes \
            (when this occurs the data is reread from the backing device)
-       to: sysadmin
+       to: silent
 
  template: bcache_cache_dirty
        on: disk.bcache_cache_alloc
@@ -26,4 +26,4 @@ component: Disk
     delay: up 1m down 1h multiplier 1.5 max 2h
      info: percentage of cache space used for dirty data and metadata \
            (this usually means your SSD cache is too small)
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index 97b7a3a94..b2a50682b 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -9,11 +9,10 @@ component: File system
      calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
     units: %
     every: 10s
-     warn: $this > (($status >= $WARNING)  ? (90) : (95))
-     crit: $this > (($status == $CRITICAL) ? (95) : (98))
+     warn: $this > (($status >= $WARNING)  ? (95) : (98))
     delay: up 1m down 15m multiplier 1.5 max 1h
      info: percentage of allocated BTRFS physical disk space
-       to: sysadmin
+       to: silent
 
  template: btrfs_data
        on: btrfs.data
@@ -86,7 +85,7 @@ component: File system
     hosts: *
     units: errors
    lookup: max -10m every 1m of write_errs
-     warn: $this > 0
+     crit: $this > 0
     delay: up 1m down 15m multiplier 1.5 max 1h
      info: number of encountered BTRFS write errors
        to: sysadmin
@@ -100,7 +99,7 @@ component: File system
     hosts: *
     units: errors
    lookup: max -10m every 1m of flush_errs
-     warn: $this > 0
+     crit: $this > 0
     delay: up 1m down 15m multiplier 1.5 max 1h
      info: number of encountered BTRFS flush errors
        to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index f625e5455..53a6ea00f 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -11,11 +11,10 @@ component: CPU
    lookup: average -10m unaligned
     units: %
     every: 1m
-     warn: $this > (($status >= $WARNING)  ? (75) : (85))
-     crit: $this > (($status == $CRITICAL) ? (85) : (95))
+     warn: $this > (($status >= $WARNING)  ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
      info: average cgroup CPU utilization over the last 10 minutes
-       to: sysadmin
+       to: silent
 
  template: cgroup_ram_in_use
        on: cgroup.mem_usage
@@ -31,44 +30,45 @@ component: Memory
      crit: $this > (($status == $CRITICAL) ? (90) : (98))
     delay: down 15m multiplier 1.5 max 1h
      info: cgroup memory utilization
-       to: sysadmin
-
-# -----------------------------------------------------------------------------
-# check for packet storms
-
-# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
-# 2. do the same for the last 10s
-# 3. raise an alarm if the later is 10x or 20x the first
-# we assume the minimum packet storm should at least have
-# 10000 packets/s, average of the last 10 seconds
-
- template: cgroup_1m_received_packets_rate
-       on: cgroup.net_packets
-    class: Workload
-     type: Cgroups
-component: Network
-    hosts: *
-   lookup: average -1m unaligned of received
-    units: packets
-    every: 10s
-     info: average number of packets received by the network interface ${label:device} over the last minute
-
- template: cgroup_10s_received_packets_storm
-       on: cgroup.net_packets
-    class: Workload
-     type: Cgroups
-component: Network
-    hosts: *
-   lookup: average -10s unaligned of received
-     calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
-    every: 10s
-    units: %
-     warn: $this > (($status >= $WARNING)?(200):(5000))
-  options: no-clear-notification
-     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
-           compared to the rate over the last minute
-       to: sysadmin
+       to: silent
 
+# FIXME: the packet storm alarms below are commented out due to a bug in Netdata
+## -----------------------------------------------------------------------------
+## check for packet storms
+#
+## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+## 2. do the same for the last 10s
+## 3. raise an alarm if the latter is 10x or 20x the first
+## we assume the minimum packet storm should at least have
+## 10000 packets/s, average of the last 10 seconds
+#
+# template: cgroup_1m_received_packets_rate
+#       on: cgroup.net_packets
+#    class: Workload
+#     type: Cgroups
+#component: Network
+#    hosts: *
+#   lookup: average -1m unaligned of received
+#    units: packets
+#    every: 10s
+#     info: average number of packets received by the network interface ${label:device} over the last minute
+#
+# template: cgroup_10s_received_packets_storm
+#       on: cgroup.net_packets
+#    class: Workload
+#     type: Cgroups
+#component: Network
+#    hosts: *
+#   lookup: average -10s unaligned of received
+#     calc: $this * 100 / (($cgroup_1m_received_packets_rate < 1000)?(1000):($cgroup_1m_received_packets_rate))
+#    every: 10s
+#    units: %
+#     warn: $this > (($status >= $WARNING)?(200):(5000))
+#  options: no-clear-notification
+#     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
+#           compared to the rate over the last minute
+#       to: sysadmin
+#
 # ---------------------------------K8s containers--------------------------------------------
 
  template: k8s_cgroup_10min_cpu_usage
@@ -83,8 +83,9 @@ component: CPU
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (75) : (85))
     delay: down 15m multiplier 1.5 max 1h
-     info: average cgroup CPU utilization over the last 10 minutes
-       to: sysadmin
+     info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
+           average CPU utilization over the last 10 minutes
+       to: silent
 
  template: k8s_cgroup_ram_in_use
        on: k8s.cgroup.mem_usage
@@ -99,40 +100,42 @@ component: Memory
      warn: $this > (($status >= $WARNING)  ? (80) : (90))
      crit: $this > (($status == $CRITICAL) ? (90) : (98))
     delay: down 15m multiplier 1.5 max 1h
-     info: cgroup memory utilization
-       to: sysadmin
+     info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
+           memory utilization
+       to: silent
 
 # check for packet storms
 
-# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
-# 2. do the same for the last 10s
-# 3. raise an alarm if the later is 10x or 20x the first
-# we assume the minimum packet storm should at least have
-# 10000 packets/s, average of the last 10 seconds
-
- template: k8s_cgroup_1m_received_packets_rate
-       on: k8s.cgroup.net_packets
-    class: Workload
-     type: Cgroups
-component: Network
-    hosts: *
-   lookup: average -1m unaligned of received
-    units: packets
-    every: 10s
-     info: average number of packets received by the network interface ${label:device} over the last minute
-
- template: k8s_cgroup_10s_received_packets_storm
-       on: k8s.cgroup.net_packets
-    class: Workload
-     type: Cgroups
-component: Network
-    hosts: *
-   lookup: average -10s unaligned of received
-     calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm))
-    every: 10s
-    units: %
-     warn: $this > (($status >= $WARNING)?(200):(5000))
-  options: no-clear-notification
-     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
-           compared to the rate over the last minute
-       to: sysadmin
+# FIXME: the packet storm alarms below are commented out due to a bug in Netdata
+## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+## 2. do the same for the last 10s
+## 3. raise an alarm if the latter is 10x or 20x the first
+## we assume the minimum packet storm should at least have
+## 10000 packets/s, average of the last 10 seconds
+#
+# template: k8s_cgroup_1m_received_packets_rate
+#       on: k8s.cgroup.net_packets
+#    class: Workload
+#     type: Cgroups
+#component: Network
+#    hosts: *
+#   lookup: average -1m unaligned of received
+#    units: packets
+#    every: 10s
+#     info: average number of packets received by the network interface ${label:device} over the last minute
+#
+# template: k8s_cgroup_10s_received_packets_storm
+#       on: k8s.cgroup.net_packets
+#    class: Workload
+#     type: Cgroups
+#component: Network
+#    hosts: *
+#   lookup: average -10s unaligned of received
+#     calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
+#    every: 10s
+#    units: %
+#     warn: $this > (($status >= $WARNING)?(200):(5000))
+#  options: no-clear-notification
+#     info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
+#           compared to the rate over the last minute
+#       to: sysadmin
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index 907d6ff8a..4de5edd75 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -15,7 +15,7 @@ component: CPU
      crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
      info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
-       to: sysadmin
+       to: silent
 
  template: 10min_cpu_iowait
        on: system.cpu
@@ -28,9 +28,9 @@ component: CPU
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING)  ? (20) : (40))
-    delay: down 15m multiplier 1.5 max 1h
+    delay: up 30m down 30m multiplier 1.5 max 2h
      info: average CPU iowait time over the last 10 minutes
-       to: sysadmin
+       to: silent
 
  template: 20min_steal_cpu
        on: system.cpu
@@ -45,7 +45,7 @@ component: CPU
      warn: $this > (($status >= $WARNING)  ? (5)  : (10))
     delay: down 1h multiplier 1.5 max 2h
      info: average CPU steal time over the last 20 minutes
-       to: sysadmin
+       to: silent
 
 ## FreeBSD
  template: 10min_cpu_usage
@@ -62,4 +62,4 @@ component: CPU
      crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
      info: average CPU utilization over the last 10 minutes (excluding nice)
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 7bd4f120c..27f5d6691 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -21,7 +21,7 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
     units: %
     every: 1m
      warn: $this > (($status >= $WARNING ) ? (80) : (90))
-     crit: $this > (($status == $CRITICAL) ? (90) : (98))
+     crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5
     delay: up 1m down 15m multiplier 1.5 max 1h
      info: disk ${label:mount_point} space utilization
        to: sysadmin
@@ -55,33 +55,32 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
 # we will use it in the next template to find
 # the hours remaining
 
-# template: disk_fill_rate
-#       on: disk.space
-#       os: linux freebsd
-#    hosts: *
-#   lookup: min -10m at -50m unaligned of avail
-#     calc: ($this - $avail) / (($now - $after) / 3600)
-#    every: 1m
-#    units: GB/hour
-#     info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
-
+template: disk_fill_rate
+      on: disk.space
+      os: linux freebsd
+   hosts: *
+  lookup: min -10m at -50m unaligned of avail
+    calc: ($this - $avail) / (($now - $after) / 3600)
+   every: 1m
+   units: GB/hour
+    info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
 
 # calculate the hours remaining
 # if the disk continues to fill
 # in this rate
 
-# template: out_of_disk_space_time
-#       on: disk.space
-#       os: linux freebsd
-#    hosts: *
-#     calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
-#    units: hours
-#    every: 10s
-#     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-#     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-#    delay: down 15m multiplier 1.2 max 1h
-#     info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
-#       to: sysadmin
+template: out_of_disk_space_time
+      on: disk.space
+      os: linux freebsd
+   hosts: *
+    calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+   units: hours
+   every: 10s
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.2 max 1h
+    info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+      to: silent
 
 
 # -----------------------------------------------------------------------------
@@ -95,32 +94,32 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
 # we will use it in the next template to find
 # the hours remaining
 
-# template: disk_inode_rate
-#       on: disk.inodes
-#       os: linux freebsd
-#    hosts: *
-#   lookup: min -10m at -50m unaligned of avail
-#     calc: ($this - $avail) / (($now - $after) / 3600)
-#    every: 1m
-#    units: inodes/hour
-#     info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+template: disk_inode_rate
+      on: disk.inodes
+      os: linux freebsd
+   hosts: *
+  lookup: min -10m at -50m unaligned of avail
+    calc: ($this - $avail) / (($now - $after) / 3600)
+   every: 1m
+   units: inodes/hour
+    info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
 
 # calculate the hours remaining
 # if the disk inodes are allocated
 # in this rate
 
-# template: out_of_disk_inodes_time
-#       on: disk.inodes
-#       os: linux freebsd
-#    hosts: *
-#     calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
-#    units: hours
-#    every: 10s
-#     warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
-#     crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-#    delay: down 15m multiplier 1.2 max 1h
-#     info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
-#       to: sysadmin
+template: out_of_disk_inodes_time
+      on: disk.inodes
+      os: linux freebsd
+   hosts: *
+    calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+   units: hours
+   every: 10s
+    warn: $this > 0 and $this < (($status >= $WARNING)  ? (48) : (8))
+    crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+   delay: down 15m multiplier 1.2 max 1h
+    info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+      to: silent
 
 
 # -----------------------------------------------------------------------------
diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf
index d136ea517..60bb8d384 100644
--- a/health/health.d/file_descriptors.conf
+++ b/health/health.d/file_descriptors.conf
@@ -20,12 +20,12 @@
      type: System
 component: Process
        os: linux
-   module: !* *
+   module: *
     hosts: *
-   lookup: max -1m unaligned foreach *
+   lookup: max -10s unaligned foreach *
     units: %
-    every: 1m
-     warn: $this > (($status >= $WARNING)  ? (85) : (90))
+    every: 10s
+     warn: $this > (($status >= $WARNING)  ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
-     info: maximum utilization of open files among all application group PIDs
+     info: open files percentage against the processes limits, among all PIDs in application group
        to: sysadmin
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 47ac4453c..5fd785b84 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -10,4 +10,4 @@ component: Disk
      warn: $this > $green
     delay: down 30m multiplier 1.5 max 2h
      info: average I/O latency over the last 10 seconds
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index 4d6478cca..1775783df 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -23,4 +23,4 @@ component: IPMI
      warn: $this > 0
     delay: up 5m down 15m multiplier 1.5 max 1h
      info: number of events in the IPMI System Event Log (SEL)
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index 4562122ca..71a5be284 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -11,4 +11,4 @@ component: Battery
      warn: $this < 10
     delay: up 30s down 5m multiplier 1.2 max 1h
      info: percentage of remaining power supply capacity
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
index 75989c57f..20f6781c8 100644
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@@ -34,7 +34,7 @@ component: Load
      warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
     delay: down 15m multiplier 1.5 max 1h
      info: system fifteen-minute load average
-       to: sysadmin
+       to: silent
 
     alarm: load_average_5
        on: system.load
@@ -50,7 +50,7 @@ component: Load
      warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
     delay: down 15m multiplier 1.5 max 1h
      info: system five-minute load average
-       to: sysadmin
+       to: silent
 
     alarm: load_average_1
        on: system.load
@@ -66,4 +66,4 @@ component: Load
      warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
     delay: down 15m multiplier 1.5 max 1h
      info: system one-minute load average
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
index b90455a58..4dc0bf207 100644
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -1,15 +1,3 @@
- template: mdstat_last_collected
-       on: md.disks
-    class: Latency
-     type: System
-component: RAID
-     calc: $now - $last_collected_t
-    units: seconds ago
-    every: 10s
-     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
-     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
-     info: number of seconds since the last successful data collection
-       to: sysadmin
 
  template: mdstat_disks
        on: md.disks
@@ -19,7 +7,7 @@ component: RAID
     units: failed devices
     every: 10s
      calc: $down
-     crit: $this > 0
+     warn: $this > 0
      info: number of devices in the down state for the ${label:device} ${label:raid_level} array. \
            Any number > 0 indicates that the array is degraded.
        to: sysadmin
@@ -36,7 +24,7 @@ chart labels: raid_level=!raid1 !raid10 *
      warn: $this > 1024
     delay: up 30m
      info: number of unsynchronized blocks for the ${label:device} ${label:raid_level} array
-       to: sysadmin
+       to: silent
 
  template: mdstat_nonredundant_last_collected
        on: md.nonredundant
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
index 010cbbd7b..8badf09c4 100644
--- a/health/health.d/memory.conf
+++ b/health/health.d/memory.conf
@@ -1,47 +1,80 @@
-
 # you can disable an alarm notification by setting the 'to' line to: silent
 
-    alarm: 1hour_ecc_memory_correctable
-       on: mem.ecc_ce
+    alarm: 1hour_memory_hw_corrupted
+       on: mem.hwcorrupt
     class: Errors
      type: System
 component: Memory
        os: linux
     hosts: *
-   lookup: sum -10m unaligned
+     calc: $HardwareCorrupted
+    units: MB
+    every: 10s
+     warn: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: amount of memory corrupted due to a hardware failure
+       to: sysadmin
+
+## ECC Controller
+
+ template: ecc_memory_mc_correctable
+       on: mem.edac_mc
+    class: Errors
+     type: System
+component: Memory
+       os: linux
+    hosts: *
+   lookup: sum -10m unaligned of correctable,correctable_noinfo
     units: errors
     every: 1m
      warn: $this > 0
     delay: down 1h multiplier 1.5 max 1h
-     info: number of ECC correctable errors in the last 10 minutes
+     info: memory controller ${label:controller} ECC correctable errors in the last 10 minutes
        to: sysadmin
 
-    alarm: 1hour_ecc_memory_uncorrectable
-       on: mem.ecc_ue
+ template: ecc_memory_mc_uncorrectable
+       on: mem.edac_mc
     class: Errors
      type: System
 component: Memory
        os: linux
     hosts: *
-   lookup: sum -10m unaligned
+   lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo
     units: errors
     every: 1m
      crit: $this > 0
     delay: down 1h multiplier 1.5 max 1h
-     info: number of ECC uncorrectable errors in the last 10 minutes
+     info: memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
        to: sysadmin
 
-    alarm: 1hour_memory_hw_corrupted
-       on: mem.hwcorrupt
+## ECC DIMM
+
+ template: ecc_memory_dimm_correctable
+       on: mem.edac_mc_dimm
     class: Errors
      type: System
 component: Memory
        os: linux
     hosts: *
-     calc: $HardwareCorrupted
-    units: MB
-    every: 10s
+   lookup: sum -10m unaligned of correctable
+    units: errors
+    every: 1m
      warn: $this > 0
     delay: down 1h multiplier 1.5 max 1h
-     info: amount of memory corrupted due to a hardware failure
+     info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
+       to: sysadmin
+
+ template: ecc_memory_dimm_uncorrectable
+       on: mem.edac_mc_dimm
+    class: Errors
+     type: System
+component: Memory
+       os: linux
+    hosts: *
+   lookup: sum -10m unaligned of uncorrectable
+    units: errors
+    every: 1m
+     crit: $this > 0
+    delay: down 1h multiplier 1.5 max 1h
+     info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
        to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 08a4eecb4..095d488da 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -30,7 +30,7 @@ component: Network
      warn: $this > (($status >= $WARNING)  ? (85) : (90))
     delay: up 1m down 1m multiplier 1.5 max 1h
      info: average inbound utilization for the network interface ${label:device} over the last minute
-       to: sysadmin
+       to: silent
 
  template: 1m_sent_traffic_overflow
        on: net.net
@@ -46,7 +46,7 @@ component: Network
      warn: $this > (($status >= $WARNING)  ? (85) : (90))
     delay: up 1m down 1m multiplier 1.5 max 1h
      info: average outbound utilization for the network interface ${label:device} over the last minute
-       to: sysadmin
+       to: silent
 
 # -----------------------------------------------------------------------------
 # dropped packets
@@ -97,7 +97,7 @@ chart labels: device=!wl* *
      warn: $this >= 2
     delay: up 1m down 1h multiplier 1.5 max 2h
      info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
-       to: sysadmin
+       to: silent
 
  template: outbound_packets_dropped_ratio
        on: net.packets
@@ -114,7 +114,7 @@ chart labels: device=!wl* *
      warn: $this >= 2
     delay: up 1m down 1h multiplier 1.5 max 2h
      info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
-       to: sysadmin
+       to: silent
 
  template: wifi_inbound_packets_dropped_ratio
        on: net.packets
@@ -131,7 +131,7 @@ chart labels: device=wl*
      warn: $this >= 10
     delay: up 1m down 1h multiplier 1.5 max 2h
      info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
-       to: sysadmin
+       to: silent
 
  template: wifi_outbound_packets_dropped_ratio
        on: net.packets
@@ -148,7 +148,7 @@ chart labels: device=wl*
      warn: $this >= 10
     delay: up 1m down 1h multiplier 1.5 max 2h
      info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
-       to: sysadmin
+       to: silent
 
 # -----------------------------------------------------------------------------
 # interface errors
@@ -166,7 +166,7 @@ component: Network
      warn: $this >= 5
     delay: down 1h multiplier 1.5 max 2h
      info: number of inbound errors for the network interface ${label:device} in the last 10 minutes
-       to: sysadmin
+       to: silent
 
  template: interface_outbound_errors
        on: net.errors
@@ -181,7 +181,7 @@ component: Network
      warn: $this >= 5
     delay: down 1h multiplier 1.5 max 2h
      info: number of outbound errors for the network interface ${label:device} in the last 10 minutes
-       to: sysadmin
+       to: silent
 
 # -----------------------------------------------------------------------------
 # FIFO errors
@@ -204,7 +204,7 @@ component: Network
      warn: $this > 0
     delay: down 1h multiplier 1.5 max 2h
      info: number of FIFO errors for the network interface ${label:device} in the last 10 minutes
-       to: sysadmin
+       to: silent
 
 # -----------------------------------------------------------------------------
 # check for packet storms
@@ -243,4 +243,4 @@ component: Network
   options: no-clear-notification
      info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
            compared to the rate over the last minute
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf
index 7290d15ff..4b0a5cb96 100644
--- a/health/health.d/qos.conf
+++ b/health/health.d/qos.conf
@@ -5,14 +5,13 @@
 # the alarm is checked every 10 seconds
 # and examines the last minute of data
 
-#template: 10min_qos_packet_drops
-#      on: tc.qos_dropped
-#      os: linux
-#   hosts: *
-#  lookup: sum -10m unaligned absolute
-#   every: 30s
-#    warn: $this > 0
-#   delay: up 0 down 30m multiplier 1.5 max 1h
-#   units: packets
-#    info: dropped packets in the last 30 minutes
-#      to: sysadmin
+template: 10min_qos_packet_drops
+      on: tc.qos_dropped
+      os: linux
+   hosts: *
+  lookup: sum -5m unaligned absolute
+   every: 30s
+    warn: $this > 0
+   units: packets
+    info: dropped packets in the last 5 minutes
+      to: silent
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 34e5431a8..c121264f7 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -30,7 +30,7 @@ component: Memory
      warn: $this < (($status >= $WARNING)  ? (15) : (10))
     delay: down 15m multiplier 1.5 max 1h
      info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
-       to: sysadmin
+       to: silent
 
       alarm: oom_kill
          on: mem.oom_kill
@@ -41,9 +41,8 @@ component: Memory
       every: 5m
        warn: $this > 0
       delay: down 10m
-host labels: _is_k8s_node = false
        info: number of out of memory kills in the last 30 minutes
-         to: sysadmin
+         to: silent
 
 ## FreeBSD
     alarm: ram_in_use
@@ -75,4 +74,4 @@ component: Memory
      warn: $this < (($status >= $WARNING)  ? (15) : (10))
     delay: down 15m multiplier 1.5 max 1h
      info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index 345f87505..b621d969d 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -17,7 +17,7 @@ component: Network
     delay: down 1h multiplier 1.5 max 2h
      info: average number of dropped packets in the last minute \
            due to exceeded net.core.netdev_max_backlog
-       to: sysadmin
+       to: silent
 
     alarm: 1min_netdev_budget_ran_outs
        on: system.softnet_stat
@@ -51,4 +51,4 @@ component: Network
      info: average number of drops in the last minute \
            due to exceeded sysctl net.route.netisr_maxqlen \
            (this can be a cause for dropped packets)
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
index d30c74cee..3adcae9db 100644
--- a/health/health.d/swap.conf
+++ b/health/health.d/swap.conf
@@ -2,7 +2,7 @@
 # you can disable an alarm notification by setting the 'to' line to: silent
 
     alarm: 30min_ram_swapped_out
-       on: system.swapio
+       on: mem.swapio
     class: Workload
      type: System
 component: Memory
@@ -16,10 +16,10 @@ component: Memory
      warn: $this > (($status >= $WARNING)  ? (20) : (30))
     delay: down 15m multiplier 1.5 max 1h
      info: percentage of the system RAM swapped in the last 30 minutes
-       to: sysadmin
+       to: silent
 
     alarm: used_swap
-       on: system.swap
+       on: mem.swap
     class: Utilization
      type: System
 component: Memory
diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf
index 417624adb..837bb1b32 100644
--- a/health/health.d/synchronization.conf
+++ b/health/health.d/synchronization.conf
@@ -9,4 +9,4 @@
     info: number of sync() system calls. \
           Every call causes all pending modifications to filesystem metadata and \
           cached file data to be written to the underlying filesystems.
-      to: sysadmin
+      to: silent
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
index 531d62fac..aadf8452b 100644
--- a/health/health.d/systemdunits.conf
+++ b/health/health.d/systemdunits.conf
@@ -6,6 +6,7 @@
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
@@ -20,6 +21,7 @@ component: Systemd units
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
@@ -34,6 +36,7 @@ component: Systemd units
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
@@ -48,6 +51,7 @@ component: Systemd units
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
@@ -62,6 +66,7 @@ component: Systemd units
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
@@ -76,6 +81,7 @@ component: Systemd units
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
@@ -90,6 +96,7 @@ component: Systemd units
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
@@ -104,6 +111,7 @@ component: Systemd units
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
@@ -118,6 +126,7 @@ component: Systemd units
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
@@ -132,6 +141,7 @@ component: Systemd units
     class: Errors
      type: Linux
 component: Systemd units
+   module: !* *
      calc: $failed
     units: state
     every: 10s
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index d4bcfa248..00ee055d0 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -32,7 +32,7 @@ component: Network
      crit: $this > (($status == $CRITICAL) ? (1) : (5))
     delay: up 0 down 5m multiplier 1.5 max 1h
      info: average number of overflows in the TCP accept queue over the last minute
-       to: sysadmin
+       to: silent
 
 # THIS IS TOO GENERIC
 # CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
@@ -50,7 +50,7 @@ component: Network
      crit: $this > (($status == $CRITICAL) ? (1) : (5))
     delay: up 0 down 5m multiplier 1.5 max 1h
      info: average number of dropped packets in the TCP accept queue over the last minute
-       to: sysadmin
+       to: silent
 
 
 # -----------------------------------------------------------------------------
@@ -76,7 +76,7 @@ component: Network
     delay: up 10 down 5m multiplier 1.5 max 1h
      info: average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \
            (SYN cookies were not enabled)
-       to: sysadmin
+       to: silent
 
     alarm: 1m_tcp_syn_queue_cookies
        on: ip.tcp_syn_queue
@@ -92,5 +92,5 @@ component: Network
      crit: $this > (($status == $CRITICAL) ? (0) : (5))
     delay: up 10 down 5m multiplier 1.5 max 1h
      info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
-       to: sysadmin
+       to: silent
 
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 318be20ac..f472d9533 100644
--- a/health/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
@@ -20,4 +20,4 @@ component: Network
      crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure}       ) : ( ${tcp_mem_high} * 0.9 ))
     delay: up 0 down 5m multiplier 1.5 max 1h
      info: TCP memory utilization
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index cbd628da5..07022af30 100644
--- a/health/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
@@ -21,4 +21,4 @@ component: Network
      crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
     delay: up 0 down 5m multiplier 1.5 max 1h
      info: orphan IPv4 TCP sockets utilization
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index ff116db64..089ac988d 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -33,7 +33,7 @@ component: Network
            This can indicate a port scan, \
            or that a service running on this host has crashed. \
            Netdata will not send a clear notification for this alarm.
-       to: sysadmin
+       to: silent
 
 # -----------------------------------------------------------------------------
 # tcp resets this host receives
@@ -66,4 +66,4 @@ component: Network
      info: average number of received TCP RESETS over the last 10 seconds. \
            This can be an indication that a service this host needs has crashed. \
            Netdata will not send a clear notification for this alarm.
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 64f47dfa7..00593c583 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -17,7 +17,7 @@ component: Network
      warn: $this > (($status >= $WARNING) ? (0) : (10))
      info: average number of UDP receive buffer errors over the last minute
     delay: up 1m down 60m multiplier 1.2 max 2h
-       to: sysadmin
+       to: silent
 
 # -----------------------------------------------------------------------------
 # UDP send buffer errors
@@ -35,4 +35,4 @@ component: Network
      warn: $this > (($status >= $WARNING) ? (0) : (10))
      info: average number of UDP send buffer errors over the last minute
     delay: up 1m down 60m multiplier 1.2 max 2h
-       to: sysadmin
+       to: silent
diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf
index 28a886386..9ef4c202f 100644
--- a/health/health.d/windows.conf
+++ b/health/health.d/windows.conf
@@ -15,7 +15,7 @@ component: CPU
      crit: $this > (($status == $CRITICAL) ? (85) : (95))
     delay: down 15m multiplier 1.5 max 1h
      info: average CPU utilization over the last 10 minutes
-       to: sysadmin
+       to: silent
 
 
 ## Memory
@@ -52,7 +52,7 @@ component: Network
      warn: $this >= 5
     delay: down 1h multiplier 1.5 max 2h
      info: number of inbound discarded packets for the network interface in the last 10 minutes
-       to: sysadmin
+       to: silent
 
  template: windows_outbound_packets_discarded
        on: windows.net_nic_discarded
@@ -67,7 +67,7 @@ component: Network
      warn: $this >= 5
     delay: down 1h multiplier 1.5 max 2h
      info: number of outbound discarded packets for the network interface in the last 10 minutes
-       to: sysadmin
+       to: silent
 
  template: windows_inbound_packets_errors
        on: windows.net_nic_errors
@@ -82,7 +82,7 @@ component: Network
      warn: $this >= 5
     delay: down 1h multiplier 1.5 max 2h
      info: number of inbound errors for the network interface in the last 10 minutes
-       to: sysadmin
+       to: silent
 
  template: windows_outbound_packets_errors
        on: windows.net_nic_errors
@@ -97,7 +97,7 @@ component: Network
      warn: $this >= 5
     delay: down 1h multiplier 1.5 max 2h
      info: number of outbound errors for the network interface in the last 10 minutes
-       to: sysadmin
+       to: silent
 
 
 ## Disk
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
index 7f8ea2793..40ec4ce8a 100644
--- a/health/health.d/zfs.conf
+++ b/health/health.d/zfs.conf
@@ -10,7 +10,7 @@ component: File system
      warn: $this > 0
     delay: down 1h multiplier 1.5 max 2h
      info: number of times ZFS had to limit the ARC growth in the last 10 minutes
-       to: sysadmin
+       to: silent
 
 # ZFS pool state