diff options
Diffstat (limited to 'health/health.d')
26 files changed, 249 insertions, 219 deletions
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index 3f92e80df..8492bb6c7 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -12,7 +12,7 @@ component: Disk info: number of times data was read from the cache, \ the bucket was reused and invalidated in the last 10 minutes \ (when this occurs the data is reread from the backing device) - to: sysadmin + to: silent template: bcache_cache_dirty on: disk.bcache_cache_alloc @@ -26,4 +26,4 @@ component: Disk delay: up 1m down 1h multiplier 1.5 max 2h info: percentage of cache space used for dirty data and metadata \ (this usually means your SSD cache is too small) - to: sysadmin + to: silent diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf index 97b7a3a94..b2a50682b 100644 --- a/health/health.d/btrfs.conf +++ b/health/health.d/btrfs.conf @@ -9,11 +9,10 @@ component: File system calc: 100 - ($unallocated * 100 / ($unallocated + $data_used + $data_free + $meta_used + $meta_free + $sys_used + $sys_free)) units: % every: 10s - warn: $this > (($status >= $WARNING) ? (90) : (95)) - crit: $this > (($status == $CRITICAL) ? (95) : (98)) + warn: $this > (($status == $CRITICAL) ? 
(95) : (98)) delay: up 1m down 15m multiplier 1.5 max 1h info: percentage of allocated BTRFS physical disk space - to: sysadmin + to: silent template: btrfs_data on: btrfs.data @@ -86,7 +85,7 @@ component: File system hosts: * units: errors lookup: max -10m every 1m of write_errs - warn: $this > 0 + crit: $this > 0 delay: up 1m down 15m multiplier 1.5 max 1h info: number of encountered BTRFS write errors to: sysadmin @@ -100,7 +99,7 @@ component: File system hosts: * units: errors lookup: max -10m every 1m of flush_errs - warn: $this > 0 + crit: $this > 0 delay: up 1m down 15m multiplier 1.5 max 1h info: number of encountered BTRFS flush errors to: sysadmin diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index f625e5455..53a6ea00f 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -11,11 +11,10 @@ component: CPU lookup: average -10m unaligned units: % every: 1m - warn: $this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) + warn: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h info: average cgroup CPU utilization over the last 10 minutes - to: sysadmin + to: silent template: cgroup_ram_in_use on: cgroup.mem_usage @@ -31,44 +30,45 @@ component: Memory crit: $this > (($status == $CRITICAL) ? (90) : (98)) delay: down 15m multiplier 1.5 max 1h info: cgroup memory utilization - to: sysadmin - -# ----------------------------------------------------------------------------- -# check for packet storms - -# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate -# 2. do the same for the last 10s -# 3. 
raise an alarm if the later is 10x or 20x the first -# we assume the minimum packet storm should at least have -# 10000 packets/s, average of the last 10 seconds - - template: cgroup_1m_received_packets_rate - on: cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -1m unaligned of received - units: packets - every: 10s - info: average number of packets received by the network interface ${label:device} over the last minute - - template: cgroup_10s_received_packets_storm - on: cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -10s unaligned of received - calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(5000)) - options: no-clear-notification - info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ - compared to the rate over the last minute - to: sysadmin + to: silent +# FIXME COMMENTED DUE TO A BUG IN NETDATA +## ----------------------------------------------------------------------------- +## check for packet storms +# +## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +## 2. do the same for the last 10s +## 3. 
raise an alarm if the latter is 10x or 20x the first +## we assume the minimum packet storm should at least have +## 10000 packets/s, average of the last 10 seconds +# +# template: cgroup_1m_received_packets_rate +# on: cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -1m unaligned of received +# units: packets +# every: 10s +# info: average number of packets received by the network interface ${label:device} over the last minute +# +# template: cgroup_10s_received_packets_storm +# on: cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -10s unaligned of received +# calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) +# every: 10s +# units: % +# warn: $this > (($status >= $WARNING)?(200):(5000)) +# options: no-clear-notification +# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ +# compared to the rate over the last minute +# to: sysadmin +# # ---------------------------------K8s containers-------------------------------------------- template: k8s_cgroup_10min_cpu_usage @@ -83,8 +83,9 @@ component: CPU every: 1m warn: $this > (($status >= $WARNING) ? (75) : (85)) delay: down 15m multiplier 1.5 max 1h - info: average cgroup CPU utilization over the last 10 minutes - to: sysadmin + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + average CPU utilization over the last 10 minutes + to: silent template: k8s_cgroup_ram_in_use on: k8s.cgroup.mem_usage @@ -99,40 +100,42 @@ component: Memory warn: $this > (($status >= $WARNING) ? (80) : (90)) crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) delay: down 15m multiplier 1.5 max 1h - info: cgroup memory utilization - to: sysadmin + info: container ${label:k8s_container_name} of pod ${label:k8s_pod_name} of namespace ${label:k8s_namespace}, \ + memory utilization + to: silent # check for packet storms -# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate -# 2. do the same for the last 10s -# 3. raise an alarm if the later is 10x or 20x the first -# we assume the minimum packet storm should at least have -# 10000 packets/s, average of the last 10 seconds - - template: k8s_cgroup_1m_received_packets_rate - on: k8s.cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -1m unaligned of received - units: packets - every: 10s - info: average number of packets received by the network interface ${label:device} over the last minute - - template: k8s_cgroup_10s_received_packets_storm - on: k8s.cgroup.net_packets - class: Workload - type: Cgroups -component: Network - hosts: * - lookup: average -10s unaligned of received - calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm)) - every: 10s - units: % - warn: $this > (($status >= $WARNING)?(200):(5000)) - options: no-clear-notification - info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ - compared to the rate over the last minute - to: sysadmin +# FIXME COMMENTED DUE TO A BUG IN NETDATA +## 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +## 2. do the same for the last 10s +## 3. 
raise an alarm if the latter is 10x or 20x the first +## we assume the minimum packet storm should at least have +## 10000 packets/s, average of the last 10 seconds +# +# template: k8s_cgroup_1m_received_packets_rate +# on: k8s.cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -1m unaligned of received +# units: packets +# every: 10s +# info: average number of packets received by the network interface ${label:device} over the last minute +# +# template: k8s_cgroup_10s_received_packets_storm +# on: k8s.cgroup.net_packets +# class: Workload +# type: Cgroups +#component: Network +# hosts: * +# lookup: average -10s unaligned of received +# calc: $this * 100 / (($k8s_cgroup_10s_received_packets_storm < 1000)?(1000):($k8s_cgroup_10s_received_packets_storm)) +# every: 10s +# units: % +# warn: $this > (($status >= $WARNING)?(200):(5000)) +# options: no-clear-notification +# info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ +# compared to the rate over the last minute +# to: sysadmin diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf index 907d6ff8a..4de5edd75 100644 --- a/health/health.d/cpu.conf +++ b/health/health.d/cpu.conf @@ -15,7 +15,7 @@ component: CPU crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h info: average CPU utilization over the last 10 minutes (excluding iowait, nice and steal) - to: sysadmin + to: silent template: 10min_cpu_iowait on: system.cpu @@ -28,9 +28,9 @@ component: CPU units: % every: 1m warn: $this > (($status >= $WARNING) ? (20) : (40)) - delay: down 15m multiplier 1.5 max 1h + delay: up 30m down 30m multiplier 1.5 max 2h info: average CPU iowait time over the last 10 minutes - to: sysadmin + to: silent template: 20min_steal_cpu on: system.cpu @@ -45,7 +45,7 @@ component: CPU warn: $this > (($status >= $WARNING) ? 
(5) : (10)) delay: down 1h multiplier 1.5 max 2h info: average CPU steal time over the last 20 minutes - to: sysadmin + to: silent ## FreeBSD template: 10min_cpu_usage @@ -62,4 +62,4 @@ component: CPU crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h info: average CPU utilization over the last 10 minutes (excluding nice) - to: sysadmin + to: silent diff --git a/health/health.d/disks.conf b/health/health.d/disks.conf index 7bd4f120c..27f5d6691 100644 --- a/health/health.d/disks.conf +++ b/health/health.d/disks.conf @@ -21,7 +21,7 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* * units: % every: 1m warn: $this > (($status >= $WARNING ) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? (90) : (98)) + crit: ($this > (($status == $CRITICAL) ? (90) : (98))) && $avail < 5 delay: up 1m down 15m multiplier 1.5 max 1h info: disk ${label:mount_point} space utilization to: sysadmin @@ -55,33 +55,32 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* * # we will use it in the next template to find # the hours remaining -# template: disk_fill_rate -# on: disk.space -# os: linux freebsd -# hosts: * -# lookup: min -10m at -50m unaligned of avail -# calc: ($this - $avail) / (($now - $after) / 3600) -# every: 1m -# units: GB/hour -# info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour - +template: disk_fill_rate + on: disk.space + os: linux freebsd + hosts: * + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: GB/hour + info: average rate the disk fills up (positive), or frees up (negative) space, for the last hour # calculate the hours remaining # if the disk continues to fill # in this rate -# template: out_of_disk_space_time -# on: disk.space -# os: linux freebsd -# hosts: * -# calc: ($disk_fill_rate > 0) ? 
($avail / $disk_fill_rate) : (inf) -# units: hours -# every: 10s -# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) -# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) -# delay: down 15m multiplier 1.2 max 1h -# info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour -# to: sysadmin +template: out_of_disk_space_time + on: disk.space + os: linux freebsd + hosts: * + calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.2 max 1h + info: estimated time the disk will run out of space, if the system continues to add data with the rate of the last hour + to: silent # ----------------------------------------------------------------------------- @@ -95,32 +94,32 @@ chart labels: mount_point=!/dev !/dev/* !/run !/run/* * # we will use it in the next template to find # the hours remaining -# template: disk_inode_rate -# on: disk.inodes -# os: linux freebsd -# hosts: * -# lookup: min -10m at -50m unaligned of avail -# calc: ($this - $avail) / (($now - $after) / 3600) -# every: 1m -# units: inodes/hour -# info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour +template: disk_inode_rate + on: disk.inodes + os: linux freebsd + hosts: * + lookup: min -10m at -50m unaligned of avail + calc: ($this - $avail) / (($now - $after) / 3600) + every: 1m + units: inodes/hour + info: average rate at which disk inodes are allocated (positive), or freed (negative), for the last hour # calculate the hours remaining # if the disk inodes are allocated # in this rate -# template: out_of_disk_inodes_time -# on: disk.inodes -# os: linux freebsd -# hosts: * -# calc: ($disk_inode_rate > 0) ? 
($avail / $disk_inode_rate) : (inf) -# units: hours -# every: 10s -# warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) -# crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) -# delay: down 15m multiplier 1.2 max 1h -# info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour -# to: sysadmin +template: out_of_disk_inodes_time + on: disk.inodes + os: linux freebsd + hosts: * + calc: ($disk_inode_rate > 0) ? ($avail / $disk_inode_rate) : (inf) + units: hours + every: 10s + warn: $this > 0 and $this < (($status >= $WARNING) ? (48) : (8)) + crit: $this > 0 and $this < (($status == $CRITICAL) ? (24) : (2)) + delay: down 15m multiplier 1.2 max 1h + info: estimated time the disk will run out of inodes, if the system continues to allocate inodes with the rate of the last hour + to: silent # ----------------------------------------------------------------------------- diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf index d136ea517..60bb8d384 100644 --- a/health/health.d/file_descriptors.conf +++ b/health/health.d/file_descriptors.conf @@ -20,12 +20,12 @@ type: System component: Process os: linux - module: !* * + module: * hosts: * - lookup: max -1m unaligned foreach * + lookup: max -10s unaligned foreach * units: % - every: 1m - warn: $this > (($status >= $WARNING) ? (85) : (90)) + every: 10s + warn: $this > (($status >= $WARNING) ? 
(85) : (95)) delay: down 15m multiplier 1.5 max 1h - info: maximum utilization of open files among all application group PIDs + info: open files percentage against the processes limits, among all PIDs in application group to: sysadmin diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 47ac4453c..5fd785b84 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -10,4 +10,4 @@ component: Disk warn: $this > $green delay: down 30m multiplier 1.5 max 2h info: average I/O latency over the last 10 seconds - to: sysadmin + to: silent diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf index 4d6478cca..1775783df 100644 --- a/health/health.d/ipmi.conf +++ b/health/health.d/ipmi.conf @@ -23,4 +23,4 @@ component: IPMI warn: $this > 0 delay: up 5m down 15m multiplier 1.5 max 1h info: number of events in the IPMI System Event Log (SEL) - to: sysadmin + to: silent diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf index 4562122ca..71a5be284 100644 --- a/health/health.d/linux_power_supply.conf +++ b/health/health.d/linux_power_supply.conf @@ -11,4 +11,4 @@ component: Battery warn: $this < 10 delay: up 30s down 5m multiplier 1.2 max 1h info: percentage of remaining power supply capacity - to: sysadmin + to: silent diff --git a/health/health.d/load.conf b/health/health.d/load.conf index 75989c57f..20f6781c8 100644 --- a/health/health.d/load.conf +++ b/health/health.d/load.conf @@ -34,7 +34,7 @@ component: Load warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 175 : 200) delay: down 15m multiplier 1.5 max 1h info: system fifteen-minute load average - to: sysadmin + to: silent alarm: load_average_5 on: system.load @@ -50,7 +50,7 @@ component: Load warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 
350 : 400) delay: down 15m multiplier 1.5 max 1h info: system five-minute load average - to: sysadmin + to: silent alarm: load_average_1 on: system.load @@ -66,4 +66,4 @@ component: Load warn: ($this * 100 / $load_cpu_number) > (($status >= $WARNING) ? 700 : 800) delay: down 15m multiplier 1.5 max 1h info: system one-minute load average - to: sysadmin + to: silent diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf index b90455a58..4dc0bf207 100644 --- a/health/health.d/mdstat.conf +++ b/health/health.d/mdstat.conf @@ -1,15 +1,3 @@ - template: mdstat_last_collected - on: md.disks - class: Latency - type: System -component: RAID - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - info: number of seconds since the last successful data collection - to: sysadmin template: mdstat_disks on: md.disks @@ -19,7 +7,7 @@ component: RAID units: failed devices every: 10s calc: $down - crit: $this > 0 + warn: $this > 0 info: number of devices in the down state for the ${label:device} ${label:raid_level} array. \ Any number > 0 indicates that the array is degraded. 
to: sysadmin @@ -36,7 +24,7 @@ chart labels: raid_level=!raid1 !raid10 * warn: $this > 1024 delay: up 30m info: number of unsynchronized blocks for the ${label:device} ${label:raid_level} array - to: sysadmin + to: silent template: mdstat_nonredundant_last_collected on: md.nonredundant diff --git a/health/health.d/memory.conf b/health/health.d/memory.conf index 010cbbd7b..8badf09c4 100644 --- a/health/health.d/memory.conf +++ b/health/health.d/memory.conf @@ -1,47 +1,80 @@ - # you can disable an alarm notification by setting the 'to' line to: silent - alarm: 1hour_ecc_memory_correctable - on: mem.ecc_ce + alarm: 1hour_memory_hw_corrupted + on: mem.hwcorrupt class: Errors type: System component: Memory os: linux hosts: * - lookup: sum -10m unaligned + calc: $HardwareCorrupted + units: MB + every: 10s + warn: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: amount of memory corrupted due to a hardware failure + to: sysadmin + +## ECC Controller + + template: ecc_memory_mc_correctable + on: mem.edac_mc + class: Errors + type: System +component: Memory + os: linux + hosts: * + lookup: sum -10m unaligned of correctable, correctable_noinfo units: errors every: 1m warn: $this > 0 delay: down 1h multiplier 1.5 max 1h - info: number of ECC correctable errors in the last 10 minutes + info: memory controller ${label:controller} ECC correctable errors in the last 10 minutes to: sysadmin - alarm: 1hour_ecc_memory_uncorrectable - on: mem.ecc_ue + template: ecc_memory_mc_uncorrectable + on: mem.edac_mc class: Errors type: System component: Memory os: linux hosts: * - lookup: sum -10m unaligned + lookup: sum -10m unaligned of uncorrectable,uncorrectable_noinfo units: errors every: 1m crit: $this > 0 delay: down 1h multiplier 1.5 max 1h - info: number of ECC uncorrectable errors in the last 10 minutes + info: memory controller ${label:controller} ECC uncorrectable errors in the last 10 minutes to: sysadmin - alarm: 1hour_memory_hw_corrupted - on: mem.hwcorrupt +## ECC DIMM 
+ + template: ecc_memory_dimm_correctable + on: mem.edac_mc_dimm class: Errors type: System component: Memory os: linux hosts: * - calc: $HardwareCorrupted - units: MB - every: 10s + lookup: sum -10m unaligned of correctable + units: errors + every: 1m warn: $this > 0 delay: down 1h multiplier 1.5 max 1h - info: amount of memory corrupted due to a hardware failure + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC correctable errors in the last 10 minutes + to: sysadmin + + template: ecc_memory_dimm_uncorrectable + on: mem.edac_mc_dimm + class: Errors + type: System +component: Memory + os: linux + hosts: * + lookup: sum -10m unaligned of uncorrectable + units: errors + every: 1m + crit: $this > 0 + delay: down 1h multiplier 1.5 max 1h + info: DIMM ${label:dimm} controller ${label:controller} (location ${label:dimm_location}) ECC uncorrectable errors in the last 10 minutes to: sysadmin diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 08a4eecb4..095d488da 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -30,7 +30,7 @@ component: Network warn: $this > (($status >= $WARNING) ? (85) : (90)) delay: up 1m down 1m multiplier 1.5 max 1h info: average inbound utilization for the network interface ${label:device} over the last minute - to: sysadmin + to: silent template: 1m_sent_traffic_overflow on: net.net @@ -46,7 +46,7 @@ component: Network warn: $this > (($status >= $WARNING) ? 
(85) : (90)) delay: up 1m down 1m multiplier 1.5 max 1h info: average outbound utilization for the network interface ${label:device} over the last minute - to: sysadmin + to: silent # ----------------------------------------------------------------------------- # dropped packets @@ -97,7 +97,7 @@ chart labels: device=!wl* * warn: $this >= 2 delay: up 1m down 1h multiplier 1.5 max 2h info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: sysadmin + to: silent template: outbound_packets_dropped_ratio on: net.packets @@ -114,7 +114,7 @@ chart labels: device=!wl* * warn: $this >= 2 delay: up 1m down 1h multiplier 1.5 max 2h info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: sysadmin + to: silent template: wifi_inbound_packets_dropped_ratio on: net.packets @@ -131,7 +131,7 @@ chart labels: device=wl* warn: $this >= 10 delay: up 1m down 1h multiplier 1.5 max 2h info: ratio of inbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: sysadmin + to: silent template: wifi_outbound_packets_dropped_ratio on: net.packets @@ -148,7 +148,7 @@ chart labels: device=wl* warn: $this >= 10 delay: up 1m down 1h multiplier 1.5 max 2h info: ratio of outbound dropped packets for the network interface ${label:device} over the last 10 minutes - to: sysadmin + to: silent # ----------------------------------------------------------------------------- # interface errors @@ -166,7 +166,7 @@ component: Network warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h info: number of inbound errors for the network interface ${label:device} in the last 10 minutes - to: sysadmin + to: silent template: interface_outbound_errors on: net.errors @@ -181,7 +181,7 @@ component: Network warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h info: number of outbound errors for the network interface ${label:device} in the last 10 minutes - to: sysadmin + 
to: silent # ----------------------------------------------------------------------------- # FIFO errors @@ -204,7 +204,7 @@ component: Network warn: $this > 0 delay: down 1h multiplier 1.5 max 2h info: number of FIFO errors for the network interface ${label:device} in the last 10 minutes - to: sysadmin + to: silent # ----------------------------------------------------------------------------- # check for packet storms @@ -243,4 +243,4 @@ component: Network options: no-clear-notification info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ compared to the rate over the last minute - to: sysadmin + to: silent diff --git a/health/health.d/qos.conf b/health/health.d/qos.conf index 7290d15ff..4b0a5cb96 100644 --- a/health/health.d/qos.conf +++ b/health/health.d/qos.conf @@ -5,14 +5,13 @@ # the alarm is checked every 10 seconds # and examines the last minute of data -#template: 10min_qos_packet_drops -# on: tc.qos_dropped -# os: linux -# hosts: * -# lookup: sum -10m unaligned absolute -# every: 30s -# warn: $this > 0 -# delay: up 0 down 30m multiplier 1.5 max 1h -# units: packets -# info: dropped packets in the last 30 minutes -# to: sysadmin +template: 10min_qos_packet_drops + on: tc.qos_dropped + os: linux + hosts: * + lookup: sum -5m unaligned absolute + every: 30s + warn: $this > 0 + units: packets + info: dropped packets in the last 5 minutes + to: silent diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index 34e5431a8..c121264f7 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -30,7 +30,7 @@ component: Memory warn: $this < (($status >= $WARNING) ? 
(15) : (10)) delay: down 15m multiplier 1.5 max 1h info: percentage of estimated amount of RAM available for userspace processes, without causing swapping - to: sysadmin + to: silent alarm: oom_kill on: mem.oom_kill @@ -41,9 +41,8 @@ component: Memory every: 5m warn: $this > 0 delay: down 10m -host labels: _is_k8s_node = false info: number of out of memory kills in the last 30 minutes - to: sysadmin + to: silent ## FreeBSD alarm: ram_in_use @@ -75,4 +74,4 @@ component: Memory warn: $this < (($status >= $WARNING) ? (15) : (10)) delay: down 15m multiplier 1.5 max 1h info: percentage of estimated amount of RAM available for userspace processes, without causing swapping - to: sysadmin + to: silent diff --git a/health/health.d/softnet.conf b/health/health.d/softnet.conf index 345f87505..b621d969d 100644 --- a/health/health.d/softnet.conf +++ b/health/health.d/softnet.conf @@ -17,7 +17,7 @@ component: Network delay: down 1h multiplier 1.5 max 2h info: average number of dropped packets in the last minute \ due to exceeded net.core.netdev_max_backlog - to: sysadmin + to: silent alarm: 1min_netdev_budget_ran_outs on: system.softnet_stat @@ -51,4 +51,4 @@ component: Network info: average number of drops in the last minute \ due to exceeded sysctl net.route.netisr_maxqlen \ (this can be a cause for dropped packets) - to: sysadmin + to: silent diff --git a/health/health.d/swap.conf b/health/health.d/swap.conf index d30c74cee..3adcae9db 100644 --- a/health/health.d/swap.conf +++ b/health/health.d/swap.conf @@ -2,7 +2,7 @@ # you can disable an alarm notification by setting the 'to' line to: silent alarm: 30min_ram_swapped_out - on: system.swapio + on: mem.swapio class: Workload type: System component: Memory @@ -16,10 +16,10 @@ component: Memory warn: $this > (($status >= $WARNING) ? 
(20) : (30)) delay: down 15m multiplier 1.5 max 1h info: percentage of the system RAM swapped in the last 30 minutes - to: sysadmin + to: silent alarm: used_swap - on: system.swap + on: mem.swap class: Utilization type: System component: Memory diff --git a/health/health.d/synchronization.conf b/health/health.d/synchronization.conf index 417624adb..837bb1b32 100644 --- a/health/health.d/synchronization.conf +++ b/health/health.d/synchronization.conf @@ -9,4 +9,4 @@ info: number of sync() system calls. \ Every call causes all pending modifications to filesystem metadata and \ cached file data to be written to the underlying filesystems. - to: sysadmin + to: silent diff --git a/health/health.d/systemdunits.conf b/health/health.d/systemdunits.conf index 531d62fac..aadf8452b 100644 --- a/health/health.d/systemdunits.conf +++ b/health/health.d/systemdunits.conf @@ -6,6 +6,7 @@ class: Errors type: Linux component: Systemd units + module: !* * calc: $failed units: state every: 10s @@ -20,6 +21,7 @@ component: Systemd units class: Errors type: Linux component: Systemd units + module: !* * calc: $failed units: state every: 10s @@ -34,6 +36,7 @@ component: Systemd units class: Errors type: Linux component: Systemd units + module: !* * calc: $failed units: state every: 10s @@ -48,6 +51,7 @@ component: Systemd units class: Errors type: Linux component: Systemd units + module: !* * calc: $failed units: state every: 10s @@ -62,6 +66,7 @@ component: Systemd units class: Errors type: Linux component: Systemd units + module: !* * calc: $failed units: state every: 10s @@ -76,6 +81,7 @@ component: Systemd units class: Errors type: Linux component: Systemd units + module: !* * calc: $failed units: state every: 10s @@ -90,6 +96,7 @@ component: Systemd units class: Errors type: Linux component: Systemd units + module: !* * calc: $failed units: state every: 10s @@ -104,6 +111,7 @@ component: Systemd units class: Errors type: Linux component: Systemd units + module: !* * calc: $failed 
units: state every: 10s @@ -118,6 +126,7 @@ component: Systemd units class: Errors type: Linux component: Systemd units + module: !* * calc: $failed units: state every: 10s @@ -132,6 +141,7 @@ component: Systemd units class: Errors type: Linux component: Systemd units + module: !* * calc: $failed units: state every: 10s diff --git a/health/health.d/tcp_listen.conf b/health/health.d/tcp_listen.conf index d4bcfa248..00ee055d0 100644 --- a/health/health.d/tcp_listen.conf +++ b/health/health.d/tcp_listen.conf @@ -32,7 +32,7 @@ component: Network crit: $this > (($status == $CRITICAL) ? (1) : (5)) delay: up 0 down 5m multiplier 1.5 max 1h info: average number of overflows in the TCP accept queue over the last minute - to: sysadmin + to: silent # THIS IS TOO GENERIC # CHECK: https://github.com/netdata/netdata/issues/3234#issuecomment-423935842 @@ -50,7 +50,7 @@ component: Network crit: $this > (($status == $CRITICAL) ? (1) : (5)) delay: up 0 down 5m multiplier 1.5 max 1h info: average number of dropped packets in the TCP accept queue over the last minute - to: sysadmin + to: silent # ----------------------------------------------------------------------------- @@ -76,7 +76,7 @@ component: Network delay: up 10 down 5m multiplier 1.5 max 1h info: average number of SYN requests was dropped due to the full TCP SYN queue over the last minute \ (SYN cookies were not enabled) - to: sysadmin + to: silent alarm: 1m_tcp_syn_queue_cookies on: ip.tcp_syn_queue @@ -92,5 +92,5 @@ component: Network crit: $this > (($status == $CRITICAL) ? (0) : (5)) delay: up 10 down 5m multiplier 1.5 max 1h info: average number of sent SYN cookies due to the full TCP SYN queue over the last minute - to: sysadmin + to: silent diff --git a/health/health.d/tcp_mem.conf b/health/health.d/tcp_mem.conf index 318be20ac..f472d9533 100644 --- a/health/health.d/tcp_mem.conf +++ b/health/health.d/tcp_mem.conf @@ -20,4 +20,4 @@ component: Network crit: ${mem} > (($status == $CRITICAL ) ? 
( ${tcp_mem_pressure} ) : ( ${tcp_mem_high} * 0.9 )) delay: up 0 down 5m multiplier 1.5 max 1h info: TCP memory utilization - to: sysadmin + to: silent diff --git a/health/health.d/tcp_orphans.conf b/health/health.d/tcp_orphans.conf index cbd628da5..07022af30 100644 --- a/health/health.d/tcp_orphans.conf +++ b/health/health.d/tcp_orphans.conf @@ -21,4 +21,4 @@ component: Network crit: $this > (($status == $CRITICAL) ? ( 25 ) : ( 50 )) delay: up 0 down 5m multiplier 1.5 max 1h info: orphan IPv4 TCP sockets utilization - to: sysadmin + to: silent diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index ff116db64..089ac988d 100644 --- a/health/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf @@ -33,7 +33,7 @@ component: Network This can indicate a port scan, \ or that a service running on this host has crashed. \ Netdata will not send a clear notification for this alarm. - to: sysadmin + to: silent # ----------------------------------------------------------------------------- # tcp resets this host receives @@ -66,4 +66,4 @@ component: Network info: average number of received TCP RESETS over the last 10 seconds. \ This can be an indication that a service this host needs has crashed. \ Netdata will not send a clear notification for this alarm. - to: sysadmin + to: silent diff --git a/health/health.d/udp_errors.conf b/health/health.d/udp_errors.conf index 64f47dfa7..00593c583 100644 --- a/health/health.d/udp_errors.conf +++ b/health/health.d/udp_errors.conf @@ -17,7 +17,7 @@ component: Network warn: $this > (($status >= $WARNING) ? (0) : (10)) info: average number of UDP receive buffer errors over the last minute delay: up 1m down 60m multiplier 1.2 max 2h - to: sysadmin + to: silent # ----------------------------------------------------------------------------- # UDP send buffer errors @@ -35,4 +35,4 @@ component: Network warn: $this > (($status >= $WARNING) ? 
(0) : (10)) info: average number of UDP send buffer errors over the last minute delay: up 1m down 60m multiplier 1.2 max 2h - to: sysadmin + to: silent diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf index 28a886386..9ef4c202f 100644 --- a/health/health.d/windows.conf +++ b/health/health.d/windows.conf @@ -15,7 +15,7 @@ component: CPU crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h info: average CPU utilization over the last 10 minutes - to: sysadmin + to: silent ## Memory @@ -52,7 +52,7 @@ component: Network warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h info: number of inbound discarded packets for the network interface in the last 10 minutes - to: sysadmin + to: silent template: windows_outbound_packets_discarded on: windows.net_nic_discarded @@ -67,7 +67,7 @@ component: Network warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h info: number of outbound discarded packets for the network interface in the last 10 minutes - to: sysadmin + to: silent template: windows_inbound_packets_errors on: windows.net_nic_errors @@ -82,7 +82,7 @@ component: Network warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h info: number of inbound errors for the network interface in the last 10 minutes - to: sysadmin + to: silent template: windows_outbound_packets_errors on: windows.net_nic_errors @@ -97,7 +97,7 @@ component: Network warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h info: number of outbound errors for the network interface in the last 10 minutes - to: sysadmin + to: silent ## Disk diff --git a/health/health.d/zfs.conf b/health/health.d/zfs.conf index 7f8ea2793..40ec4ce8a 100644 --- a/health/health.d/zfs.conf +++ b/health/health.d/zfs.conf @@ -10,7 +10,7 @@ component: File system warn: $this > 0 delay: down 1h multiplier 1.5 max 2h info: number of times ZFS had to limit the ARC growth in the last 10 minutes - to: sysadmin + to: silent # ZFS pool state |