summaryrefslogtreecommitdiffstats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d')
-rw-r--r--health/health.d/bcache.conf4
-rw-r--r--health/health.d/btrfs.conf9
-rw-r--r--health/health.d/cgroups.conf155
-rw-r--r--health/health.d/cpu.conf10
-rw-r--r--health/health.d/disks.conf87
-rw-r--r--health/health.d/file_descriptors.conf10
-rw-r--r--health/health.d/ioping.conf2
-rw-r--r--health/health.d/ipmi.conf2
-rw-r--r--health/health.d/linux_power_supply.conf2
-rw-r--r--health/health.d/load.conf6
-rw-r--r--health/health.d/mdstat.conf16
-rw-r--r--health/health.d/memory.conf63
-rw-r--r--health/health.d/net.conf20
-rw-r--r--health/health.d/qos.conf21
-rw-r--r--health/health.d/ram.conf7
-rw-r--r--health/health.d/softnet.conf4
-rw-r--r--health/health.d/swap.conf6
-rw-r--r--health/health.d/synchronization.conf2
-rw-r--r--health/health.d/systemdunits.conf10
-rw-r--r--health/health.d/tcp_listen.conf8
-rw-r--r--health/health.d/tcp_mem.conf2
-rw-r--r--health/health.d/tcp_orphans.conf2
-rw-r--r--health/health.d/tcp_resets.conf4
-rw-r--r--health/health.d/udp_errors.conf4
-rw-r--r--health/health.d/windows.conf10
-rw-r--r--health/health.d/zfs.conf2
26 files changed, 249 insertions, 219 deletions
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index 3f92e80d..8492bb6c 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -12,7 +12,7 @@ component: Disk
info: number of times data was read from the cache, \
the bucket was reused and invalidated in the last 10 minutes \
(when this occurs the data is reread from the backing device)
- to: sysadmin
+ to: silent
template: bcache_cache_dirty
on: disk.bcache_cache_alloc
@@ -26,4 +26,4 @@ component: Disk
delay: up 1m down 1h multiplier 1.5 max 2h
info: percentage of cache space used for dirty data and metadata \
(this usually means your SSD cache is too small)
- to: sysadmin
+ to: silent
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index 97b7a3a9..b2a50682 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -9,11 +9,10 @@ component: File system
calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free))
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (90) : (95))
- crit: $this > (($status == $CRITICAL) ? (95) : (98))
+ warn: $this > (($status >= $WARNING) ? (95) : (98))
delay: up 1m down 15m multiplier 1.5 max 1h
info: percentage of allocated BTRFS physical disk space
- to: sysadmin
+ to: silent
template: btrfs_data
on: btrfs.data
@@ -86,7 +85,7 @@ component: File system
hosts: *
units: errors
lookup: max -10m every 1m of write_errs
- warn: $this > 0
+ crit: $this > 0
delay: up 1m down 15m multiplier 1.5 max 1h
info: number of encountered BTRFS write errors
to: sysadmin
@@ -100,7 +99,7 @@ component: File system
hosts: *
units: errors
lookup: max -10m every 1m of flush_errs
- warn: $this > 0
+ crit: $this > 0
delay: up 1m down 15m multiplier 1.5 max 1h
info: number of encountered BTRFS flush errors
to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index f625e545..53a6ea00 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -11,11 +11,10 @@ component: CPU
lookup: average -10m unaligned
units: %
every: 1m
- warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
+ warn: $this > (($status >= $WARNING) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average cgroup CPU utilization over the last 10 minutes
- to: sysadmin
+ to: silent
template: cgroup_ram_in_use
on: cgroup.mem_usage
@@ -31,44 +30,45 @@ component: Memory
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
info: cgroup memory utilization
- to: sysadmin
-
-# -----------------------------------------------------------------------------
-# check for packet storms
-
-# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
-# 2. do the same for the last 10s
-# 3. raise an alarm if the later is 10x or 20x the first
-# we assume the minimum packet storm should at least have
-# 10000 packets/s, average of the last 10 seconds
-
- template: cgroup_1m_received_packets_rate
- on: cgroup.net_packets
- class: Workload
- type: Cgroups
-component: Network
- hosts: *
- lookup: average -1m unaligned of received
- units: packets
- every: 10s
- info: average number of packets received by the network interface ${label:device} over the last minute
-
- template: cgroup_10s_received_packets_storm
- on: cgroup.net_packets
- class: Workload
- type: Cgroups
-component: Network
- hosts: *
- lookup: average -10s unaligned of received
- calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
- every: 10s
- units: %
- warn: $this > (($status >= $WARNING)?(200):(5000))
- options: no-clear-notification
- info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
- compared to the rate over the last minute
- to: sysadmin
+ to: silent
+# FIXME COMMENTED DUE TO A BUG IN NETDATA
+## -----------------------------------------------------------------------------
+## check for packet storms
+#
+## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+## 2. do the same for the last 10s
+## 3. raise an alarm if the latter is 10x or 20x the first
+## we assume the minimum packet storm should at least have
+## 10000 packets/s, average of the last 10 seconds
+#
+# template: cgroup_1m_received_packets_rate
+# on: cgroup.net_packets
+# class: Workload
+# type: Cgroups
+#component: Network
+# hosts: *
+# lookup: average -1m unaligned of received
+# units: packets
+# every: 10s
+# info: average number of packets received by the network interface ${label:device} over the last minute
+#
+# template: cgroup_10s_received_packets_storm
+# on: cgroup.net_packets
+# class: Workload
+# type: Cgroups
+#component: Network
+# hosts: *
+# lookup: average -10s unaligned of received
+# calc: $this * 100 / (($cgroup_1m_received_packets_rate < 1000)?(1000):($cgroup_1m_received_packets_rate))
+# every: 10s
+# units: %
+# warn: $this > (($status >= $WARNING)?(200):(5000))
+# options: no-clear-notification
+# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
+# compared to the rate over the last minute
+# to: sysadmin
+#
# ---------------------------------K8s containers--------------------------------------------
template: k8s_cgroup_10min_cpu_usage
@@ -83,8 +83,9 @@ component: CPU
every: 1m
warn: $this > (($status >= $WARNING) ? (75) : (85))
delay: down 15m multiplier 1.5 max 1h
- info: average cgroup CPU utilization over the last 10 minutes
- to: sysadmin
+ info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
+ average CPU utilization over the last 10 minutes
+ to: silent
template: k8s_cgroup_ram_in_use
on: k8s.cgroup.mem_usage
@@ -99,40 +100,42 @@ component: Memory
warn: $this > (($status >= $WARNING) ? (80) : (90))
crit: $this > (($status == $CRITICAL) ? (90) : (98))
delay: down 15m multiplier 1.5 max 1h
- info: cgroup memory utilization
- to: sysadmin
+ info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \
+ memory utilization
+ to: silent
# check for packet storms
-# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
-# 2. do the same for the last 10s
-# 3. raise an alarm if the later is 10x or 20x the first
-# we assume the minimum packet storm should at least have
-# 10000 packets/s, average of the last 10 seconds
-
- template: k8s_cgroup_1m_received_packets_rate
- on: k8s.cgroup.net_packets
- class: Workload
- type: Cgroups
-component: Network
- hosts: *
- lookup: average -1m unaligned of received
- units: packets
- every: 10s
- info: average number of packets received by the network interface ${label:device} over the last minute
-
- template: k8s_cgroup_10s_received_packets_storm
- on: k8s.cgroup.net_packets
- class: Workload
- type: Cgroups
-component: Network
- hosts: *
- lookup: average -10s unaligned of received
- calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm))
- every: 10s
- units: %
- warn: $this > (($status >= $WARNING)?(200):(5000))
- options: no-clear-notification
- info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
- compared to the rate over the last minute
- to: sysadmin
+# FIXME COMMENTED DUE TO A BUG IN NETDATA
+## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate
+## 2. do the same for the last 10s
+## 3. raise an alarm if the latter is 10x or 20x the first
+## we assume the minimum packet storm should at least have
+## 10000 packets/s, average of the last 10 seconds
+#
+# template: k8s_cgroup_1m_received_packets_rate
+# on: k8s.cgroup.net_packets
+# class: Workload
+# type: Cgroups
+#component: Network
+# hosts: *
+# lookup: average -1m unaligned of received
+# units: packets
+# every: 10s
+# info: average number of packets received by the network interface ${label:device} over the last minute
+#
+# template: k8s_cgroup_10s_received_packets_storm
+# on: k8s.cgroup.net_packets
+# class: Workload
+# type: Cgroups
+#component: Network
+# hosts: *
+# lookup: average -10s unaligned of received
+# calc: $this * 100 / (($k8s_cgroup_1m_received_packets_rate < 1000)?(1000):($k8s_cgroup_1m_received_packets_rate))
+# every: 10s
+# units: %
+# warn: $this > (($status >= $WARNING)?(200):(5000))
+# options: no-clear-notification
+# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
+# compared to the rate over the last minute
+# to: sysadmin
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index 907d6ff8..4de5edd7 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -15,7 +15,7 @@ component: CPU
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal)
- to: sysadmin
+ to: silent
template: 10min_cpu_iowait
on: system.cpu
@@ -28,9 +28,9 @@ component: CPU
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (20) : (40))
- delay: down 15m multiplier 1.5 max 1h
+ delay: up 30m down 30m multiplier 1.5 max 2h
info: average CPU iowait time over the last 10 minutes
- to: sysadmin
+ to: silent
template: 20min_steal_cpu
on: system.cpu
@@ -45,7 +45,7 @@ component: CPU
warn: $this > (($status >= $WARNING) ? (5) : (10))
delay: down 1h multiplier 1.5 max 2h
info: average CPU steal time over the last 20 minutes
- to: sysadmin
+ to: silent
## FreeBSD
template: 10min_cpu_usage
@@ -62,4 +62,4 @@ component: CPU
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average CPU utilization over the last 10 minutes (excluding nice)
- to: sysadmin
+ to: silent
diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf
index 7bd4f120..27f5d669 100644
--- a/health/health.d/disks.conf
+++ b/health/health.d/disks.conf
@@ -21,7 +21,7 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
units: %
every: 1m
warn: $this > (($status >= $WARNING ) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5
delay: up 1m down 15m multiplier 1.5 max 1h
info: disk ${label:mount_point} space utilization
to: sysadmin
@@ -55,33 +55,32 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
# we will use it in the next template to find
# the hours remaining
-# template: disk_fill_rate
-# on: disk.space
-# os: linux freebsd
-# hosts: *
-# lookup: min -10m at -50m unaligned of avail
-# calc: ($this - $avail) / (($now - $after) / 3600)
-# every: 1m
-# units: GB/hour
-# info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
-
+template: disk_fill_rate
+ on: disk.space
+ os: linux freebsd
+ hosts: *
+ lookup: min -10m at -50m unaligned of avail
+ calc: ($this - $avail) / (($now - $after) / 3600)
+ every: 1m
+ units: GB/hour
+ info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour
# calculate the hours remaining
# if the disk continues to fill
# in this rate
-# template: out_of_disk_space_time
-# on: disk.space
-# os: linux freebsd
-# hosts: *
-# calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
-# units: hours
-# every: 10s
-# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
-# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-# delay: down 15m multiplier 1.2 max 1h
-# info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
-# to: sysadmin
+template: out_of_disk_space_time
+ on: disk.space
+ os: linux freebsd
+ hosts: *
+ calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
+ units: hours
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.2 max 1h
+ info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour
+ to: silent
# -----------------------------------------------------------------------------
@@ -95,32 +94,32 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* *
# we will use it in the next template to find
# the hours remaining
-# template: disk_inode_rate
-# on: disk.inodes
-# os: linux freebsd
-# hosts: *
-# lookup: min -10m at -50m unaligned of avail
-# calc: ($this - $avail) / (($now - $after) / 3600)
-# every: 1m
-# units: inodes/hour
-# info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
+template: disk_inode_rate
+ on: disk.inodes
+ os: linux freebsd
+ hosts: *
+ lookup: min -10m at -50m unaligned of avail
+ calc: ($this - $avail) / (($now - $after) / 3600)
+ every: 1m
+ units: inodes/hour
+ info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour
# calculate the hours remaining
# if the disk inodes are allocated
# in this rate
-# template: out_of_disk_inodes_time
-# on: disk.inodes
-# os: linux freebsd
-# hosts: *
-# calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
-# units: hours
-# every: 10s
-# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
-# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
-# delay: down 15m multiplier 1.2 max 1h
-# info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
-# to: sysadmin
+template: out_of_disk_inodes_time
+ on: disk.inodes
+ os: linux freebsd
+ hosts: *
+ calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf)
+ units: hours
+ every: 10s
+ warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8))
+ crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2))
+ delay: down 15m multiplier 1.2 max 1h
+ info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour
+ to: silent
# -----------------------------------------------------------------------------
diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf
index d136ea51..60bb8d38 100644
--- a/health/health.d/file_descriptors.conf
+++ b/health/health.d/file_descriptors.conf
@@ -20,12 +20,12 @@
type: System
component: Process
os: linux
- module: !* *
+ module: *
hosts: *
- lookup: max -1m unaligned foreach *
+ lookup: max -10s unaligned foreach *
units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (85) : (90))
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
- info: maximum utilization of open files among all application group PIDs
+ info: percentage of open files relative to the process limits, among all PIDs in the application group
to: sysadmin
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 47ac4453..5fd785b8 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -10,4 +10,4 @@ component: Disk
warn: $this > $green
delay: down 30m multiplier 1.5 max 2h
info: average I/O latency over the last 10 seconds
- to: sysadmin
+ to: silent
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index 4d6478cc..1775783d 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -23,4 +23,4 @@ component: IPMI
warn: $this > 0
delay: up 5m down 15m multiplier 1.5 max 1h
info: number of events in the IPMI System Event Log (SEL)
- to: sysadmin
+ to: silent
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index 4562122c..71a5be28 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -11,4 +11,4 @@ component: Battery
warn: $this < 10
delay: up 30s down 5m multiplier 1.2 max 1h
info: percentage of remaining power supply capacity
- to: sysadmin
+ to: silent
diff --git a/health/health.d/load.conf b/health/health.d/load.conf
index 75989c57..20f6781c 100644
--- a/health/health.d/load.conf
+++ b/health/health.d/load.conf
@@ -34,7 +34,7 @@ component: Load
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200)
delay: down 15m multiplier 1.5 max 1h
info: system fifteen-minute load average
- to: sysadmin
+ to: silent
alarm: load_average_5
on: system.load
@@ -50,7 +50,7 @@ component: Load
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 350 : 400)
delay: down 15m multiplier 1.5 max 1h
info: system five-minute load average
- to: sysadmin
+ to: silent
alarm: load_average_1
on: system.load
@@ -66,4 +66,4 @@ component: Load
warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800)
delay: down 15m multiplier 1.5 max 1h
info: system one-minute load average
- to: sysadmin
+ to: silent
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
index b90455a5..4dc0bf20 100644
--- a/health/health.d/mdstat.conf
+++ b/health/health.d/mdstat.conf
@@ -1,15 +1,3 @@
- template: mdstat_last_collected
- on: md.disks
- class: Latency
- type: System
-component: RAID
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- info: number of seconds since the last successful data collection
- to: sysadmin
template: mdstat_disks
on: md.disks
@@ -19,7 +7,7 @@ component: RAID
units: failed devices
every: 10s
calc: $down
- crit: $this > 0
+ warn: $this > 0
info: number of devices in the down state for the ${label:device} ${label:raid_level} array. \
Any number > 0 indicates that the array is degraded.
to: sysadmin
@@ -36,7 +24,7 @@ chart labels: raid_level=!raid1 !raid10 *
warn: $this > 1024
delay: up 30m
info: number of unsynchronized blocks for the ${label:device} ${label:raid_level} array
- to: sysadmin
+ to: silent
template: mdstat_nonredundant_last_collected
on: md.nonredundant
diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf
index 010cbbd7..8badf09c 100644
--- a/health/health.d/memory.conf
+++ b/health/health.d/memory.conf
@@ -1,47 +1,80 @@
-
# you can disable an alarm notification by setting the 'to' line to: silent
- alarm: 1hour_ecc_memory_correctable
- on: mem.ecc_ce
+ alarm: 1hour_memory_hw_corrupted
+ on: mem.hwcorrupt
class: Errors
type: System
component: Memory
os: linux
hosts: *
- lookup: sum -10m unaligned
+ calc: $HardwareCorrupted
+ units: MB
+ every: 10s
+ warn: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: amount of memory corrupted due to a hardware failure
+ to: sysadmin
+
+## ECC Controller
+
+ template: ecc_memory_mc_correctable
+ on: mem.edac_mc
+ class: Errors
+ type: System
+component: Memory
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned of correctable,correctable_noinfo
units: errors
every: 1m
warn: $this > 0
delay: down 1h multiplier 1.5 max 1h
- info: number of ECC correctable errors in the last 10 minutes
+ info: memory controller ${label:controller} ECC correctable errors in the last 10 minutes
to: sysadmin
- alarm: 1hour_ecc_memory_uncorrectable
- on: mem.ecc_ue
+ template: ecc_memory_mc_uncorrectable
+ on: mem.edac_mc
class: Errors
type: System
component: Memory
os: linux
hosts: *
- lookup: sum -10m unaligned
+ lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo
units: errors
every: 1m
crit: $this > 0
delay: down 1h multiplier 1.5 max 1h
- info: number of ECC uncorrectable errors in the last 10 minutes
+ info: memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes
to: sysadmin
- alarm: 1hour_memory_hw_corrupted
- on: mem.hwcorrupt
+## ECC DIMM
+
+ template: ecc_memory_dimm_correctable
+ on: mem.edac_mc_dimm
class: Errors
type: System
component: Memory
os: linux
hosts: *
- calc: $HardwareCorrupted
- units: MB
- every: 10s
+ lookup: sum -10m unaligned of correctable
+ units: errors
+ every: 1m
warn: $this > 0
delay: down 1h multiplier 1.5 max 1h
- info: amount of memory corrupted due to a hardware failure
+ info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes
+ to: sysadmin
+
+ template: ecc_memory_dimm_uncorrectable
+ on: mem.edac_mc_dimm
+ class: Errors
+ type: System
+component: Memory
+ os: linux
+ hosts: *
+ lookup: sum -10m unaligned of uncorrectable
+ units: errors
+ every: 1m
+ crit: $this > 0
+ delay: down 1h multiplier 1.5 max 1h
+ info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes
to: sysadmin
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index 08a4eecb..095d488d 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -30,7 +30,7 @@ component: Network
warn: $this > (($status >= $WARNING) ? (85) : (90))
delay: up 1m down 1m multiplier 1.5 max 1h
info: average inbound utilization for the network interface ${label:device} over the last minute
- to: sysadmin
+ to: silent
template: 1m_sent_traffic_overflow
on: net.net
@@ -46,7 +46,7 @@ component: Network
warn: $this > (($status >= $WARNING) ? (85) : (90))
delay: up 1m down 1m multiplier 1.5 max 1h
info: average outbound utilization for the network interface ${label:device} over the last minute
- to: sysadmin
+ to: silent
# -----------------------------------------------------------------------------
# dropped packets
@@ -97,7 +97,7 @@ chart labels: device=!wl* *
warn: $this >= 2
delay: up 1m down 1h multiplier 1.5 max 2h
info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
- to: sysadmin
+ to: silent
template: outbound_packets_dropped_ratio
on: net.packets
@@ -114,7 +114,7 @@ chart labels: device=!wl* *
warn: $this >= 2
delay: up 1m down 1h multiplier 1.5 max 2h
info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
- to: sysadmin
+ to: silent
template: wifi_inbound_packets_dropped_ratio
on: net.packets
@@ -131,7 +131,7 @@ chart labels: device=wl*
warn: $this >= 10
delay: up 1m down 1h multiplier 1.5 max 2h
info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes
- to: sysadmin
+ to: silent
template: wifi_outbound_packets_dropped_ratio
on: net.packets
@@ -148,7 +148,7 @@ chart labels: device=wl*
warn: $this >= 10
delay: up 1m down 1h multiplier 1.5 max 2h
info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes
- to: sysadmin
+ to: silent
# -----------------------------------------------------------------------------
# interface errors
@@ -166,7 +166,7 @@ component: Network
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of inbound errors for the network interface ${label:device} in the last 10 minutes
- to: sysadmin
+ to: silent
template: interface_outbound_errors
on: net.errors
@@ -181,7 +181,7 @@ component: Network
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of outbound errors for the network interface ${label:device} in the last 10 minutes
- to: sysadmin
+ to: silent
# -----------------------------------------------------------------------------
# FIFO errors
@@ -204,7 +204,7 @@ component: Network
warn: $this > 0
delay: down 1h multiplier 1.5 max 2h
info: number of FIFO errors for the network interface ${label:device} in the last 10 minutes
- to: sysadmin
+ to: silent
# -----------------------------------------------------------------------------
# check for packet storms
@@ -243,4 +243,4 @@ component: Network
options: no-clear-notification
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
- to: sysadmin
+ to: silent
diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf
index 7290d15f..4b0a5cb9 100644
--- a/health/health.d/qos.conf
+++ b/health/health.d/qos.conf
@@ -5,14 +5,13 @@
# the alarm is checked every 10 seconds
# and examines the last minute of data
-#template: 10min_qos_packet_drops
-# on: tc.qos_dropped
-# os: linux
-# hosts: *
-# lookup: sum -10m unaligned absolute
-# every: 30s
-# warn: $this > 0
-# delay: up 0 down 30m multiplier 1.5 max 1h
-# units: packets
-# info: dropped packets in the last 30 minutes
-# to: sysadmin
+template: 10min_qos_packet_drops
+ on: tc.qos_dropped
+ os: linux
+ hosts: *
+ lookup: sum -5m unaligned absolute
+ every: 30s
+ warn: $this > 0
+ units: packets
+ info: dropped packets in the last 5 minutes
+ to: silent
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index 34e5431a..c121264f 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -30,7 +30,7 @@ component: Memory
warn: $this < (($status >= $WARNING) ? (15) : (10))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
- to: sysadmin
+ to: silent
alarm: oom_kill
on: mem.oom_kill
@@ -41,9 +41,8 @@ component: Memory
every: 5m
warn: $this > 0
delay: down 10m
-host labels: _is_k8s_node = false
info: number of out of memory kills in the last 30 minutes
- to: sysadmin
+ to: silent
## FreeBSD
alarm: ram_in_use
@@ -75,4 +74,4 @@ component: Memory
warn: $this < (($status >= $WARNING) ? (15) : (10))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
- to: sysadmin
+ to: silent
diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf
index 345f8750..b621d969 100644
--- a/health/health.d/softnet.conf
+++ b/health/health.d/softnet.conf
@@ -17,7 +17,7 @@ component: Network
delay: down 1h multiplier 1.5 max 2h
info: average number of dropped packets in the last minute \
due to exceeded net.core.netdev_max_backlog
- to: sysadmin
+ to: silent
alarm: 1min_netdev_budget_ran_outs
on: system.softnet_stat
@@ -51,4 +51,4 @@ component: Network
info: average number of drops in the last minute \
due to exceeded sysctl net.route.netisr_maxqlen \
(this can be a cause for dropped packets)
- to: sysadmin
+ to: silent
diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf
index d30c74ce..3adcae9d 100644
--- a/health/health.d/swap.conf
+++ b/health/health.d/swap.conf
@@ -2,7 +2,7 @@
# you can disable an alarm notification by setting the 'to' line to: silent
alarm: 30min_ram_swapped_out
- on: system.swapio
+ on: mem.swapio
class: Workload
type: System
component: Memory
@@ -16,10 +16,10 @@ component: Memory
warn: $this > (($status >= $WARNING) ? (20) : (30))
delay: down 15m multiplier 1.5 max 1h
info: percentage of the system RAM swapped in the last 30 minutes
- to: sysadmin
+ to: silent
alarm: used_swap
- on: system.swap
+ on: mem.swap
class: Utilization
type: System
component: Memory
diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf
index 417624ad..837bb1b3 100644
--- a/health/health.d/synchronization.conf
+++ b/health/health.d/synchronization.conf
@@ -9,4 +9,4 @@
info: number of sync() system calls. \
Every call causes all pending modifications to filesystem metadata and \
cached file data to be written to the underlying filesystems.
- to: sysadmin
+ to: silent
diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf
index 531d62fa..aadf8452 100644
--- a/health/health.d/systemdunits.conf
+++ b/health/health.d/systemdunits.conf
@@ -6,6 +6,7 @@
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
@@ -20,6 +21,7 @@ component: Systemd units
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
@@ -34,6 +36,7 @@ component: Systemd units
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
@@ -48,6 +51,7 @@ component: Systemd units
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
@@ -62,6 +66,7 @@ component: Systemd units
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
@@ -76,6 +81,7 @@ component: Systemd units
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
@@ -90,6 +96,7 @@ component: Systemd units
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
@@ -104,6 +111,7 @@ component: Systemd units
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
@@ -118,6 +126,7 @@ component: Systemd units
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
@@ -132,6 +141,7 @@ component: Systemd units
class: Errors
type: Linux
component: Systemd units
+ module: !* *
calc: $failed
units: state
every: 10s
diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf
index d4bcfa24..00ee055d 100644
--- a/health/health.d/tcp_listen.conf
+++ b/health/health.d/tcp_listen.conf
@@ -32,7 +32,7 @@ component: Network
crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
info: average number of overflows in the TCP accept queue over the last minute
- to: sysadmin
+ to: silent
# THIS IS TOO GENERIC
# CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842
@@ -50,7 +50,7 @@ component: Network
crit: $this > (($status == $CRITICAL) ? (1) : (5))
delay: up 0 down 5m multiplier 1.5 max 1h
info: average number of dropped packets in the TCP accept queue over the last minute
- to: sysadmin
+ to: silent
# -----------------------------------------------------------------------------
@@ -76,7 +76,7 @@ component: Network
delay: up 10 down 5m multiplier 1.5 max 1h
info: average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \
(SYN cookies were not enabled)
- to: sysadmin
+ to: silent
alarm: 1m_tcp_syn_queue_cookies
on: ip.tcp_syn_queue
@@ -92,5 +92,5 @@ component: Network
crit: $this > (($status == $CRITICAL) ? (0) : (5))
delay: up 10 down 5m multiplier 1.5 max 1h
info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute
- to: sysadmin
+ to: silent
diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf
index 318be20a..f472d953 100644
--- a/health/health.d/tcp_mem.conf
+++ b/health/health.d/tcp_mem.conf
@@ -20,4 +20,4 @@ component: Network
crit: ${mem} > (($status == $CRITICAL ) ? ( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 ))
delay: up 0 down 5m multiplier 1.5 max 1h
info: TCP memory utilization
- to: sysadmin
+ to: silent
diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf
index cbd628da..07022af3 100644
--- a/health/health.d/tcp_orphans.conf
+++ b/health/health.d/tcp_orphans.conf
@@ -21,4 +21,4 @@ component: Network
crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 ))
delay: up 0 down 5m multiplier 1.5 max 1h
info: orphan IPv4 TCP sockets utilization
- to: sysadmin
+ to: silent
diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf
index ff116db6..089ac988 100644
--- a/health/health.d/tcp_resets.conf
+++ b/health/health.d/tcp_resets.conf
@@ -33,7 +33,7 @@ component: Network
This can indicate a port scan, \
or that a service running on this host has crashed. \
Netdata will not send a clear notification for this alarm.
- to: sysadmin
+ to: silent
# -----------------------------------------------------------------------------
# tcp resets this host receives
@@ -66,4 +66,4 @@ component: Network
info: average number of received TCP RESETS over the last 10 seconds. \
This can be an indication that a service this host needs has crashed. \
Netdata will not send a clear notification for this alarm.
- to: sysadmin
+ to: silent
diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf
index 64f47dfa..00593c58 100644
--- a/health/health.d/udp_errors.conf
+++ b/health/health.d/udp_errors.conf
@@ -17,7 +17,7 @@ component: Network
warn: $this > (($status >= $WARNING) ? (0) : (10))
info: average number of UDP receive buffer errors over the last minute
delay: up 1m down 60m multiplier 1.2 max 2h
- to: sysadmin
+ to: silent
# -----------------------------------------------------------------------------
# UDP send buffer errors
@@ -35,4 +35,4 @@ component: Network
warn: $this > (($status >= $WARNING) ? (0) : (10))
info: average number of UDP send buffer errors over the last minute
delay: up 1m down 60m multiplier 1.2 max 2h
- to: sysadmin
+ to: silent
diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf
index 28a88638..9ef4c202 100644
--- a/health/health.d/windows.conf
+++ b/health/health.d/windows.conf
@@ -15,7 +15,7 @@ component: CPU
crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average CPU utilization over the last 10 minutes
- to: sysadmin
+ to: silent
## Memory
@@ -52,7 +52,7 @@ component: Network
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of inbound discarded packets for the network interface in the last 10 minutes
- to: sysadmin
+ to: silent
template: windows_outbound_packets_discarded
on: windows.net_nic_discarded
@@ -67,7 +67,7 @@ component: Network
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of outbound discarded packets for the network interface in the last 10 minutes
- to: sysadmin
+ to: silent
template: windows_inbound_packets_errors
on: windows.net_nic_errors
@@ -82,7 +82,7 @@ component: Network
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of inbound errors for the network interface in the last 10 minutes
- to: sysadmin
+ to: silent
template: windows_outbound_packets_errors
on: windows.net_nic_errors
@@ -97,7 +97,7 @@ component: Network
warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: number of outbound errors for the network interface in the last 10 minutes
- to: sysadmin
+ to: silent
## Disk
diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf
index 7f8ea279..40ec4ce8 100644
--- a/health/health.d/zfs.conf
+++ b/health/health.d/zfs.conf
@@ -10,7 +10,7 @@ component: File system
warn: $this > 0
delay: down 1h multiplier 1.5 max 2h
info: number of times ZFS had to limit the ARC growth in the last 10 minutes
- to: sysadmin
+ to: silent
# ZFS pool state