diff options
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- | conf.d/health.d/cpu.conf | 8 | ||||
-rw-r--r-- | conf.d/health.d/disks.conf | 16 | ||||
-rw-r--r-- | conf.d/health.d/entropy.conf | 2 | ||||
-rw-r--r-- | conf.d/health.d/ipc.conf | 6 | ||||
-rw-r--r-- | conf.d/health.d/memory.conf | 8 | ||||
-rw-r--r-- | conf.d/health.d/net.conf | 28 | ||||
-rw-r--r-- | conf.d/health.d/netfilter.conf | 6 | ||||
-rw-r--r-- | conf.d/health.d/qos.conf | 4 | ||||
-rw-r--r-- | conf.d/health.d/ram.conf | 6 | ||||
-rw-r--r-- | conf.d/health.d/softnet.conf | 7 | ||||
-rw-r--r-- | conf.d/health.d/swap.conf | 8 | ||||
-rw-r--r-- | conf.d/health.d/tcp_resets.conf | 13 | ||||
-rw-r--r-- | conf.d/health.d/udp_errors.conf | 9 |
13 files changed, 115 insertions, 6 deletions
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf index 30a714097..db6285561 100644 --- a/conf.d/health.d/cpu.conf +++ b/conf.d/health.d/cpu.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + template: 10min_cpu_usage on: system.cpu + os: linux + hosts: * lookup: average -10m unaligned of user,system,softirq,irq,guest units: % every: 1m @@ -12,6 +16,8 @@ template: 10min_cpu_usage template: 10min_cpu_iowait on: system.cpu + os: linux + hosts: * lookup: average -10m unaligned of iowait units: % every: 1m @@ -23,6 +29,8 @@ template: 10min_cpu_iowait template: 20min_steal_cpu on: system.cpu + os: linux + hosts: * lookup: average -20m unaligned of steal units: % every: 5m diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf index 9548f9ee0..63053491e 100644 --- a/conf.d/health.d/disks.conf +++ b/conf.d/health.d/disks.conf @@ -1,3 +1,7 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + + # ----------------------------------------------------------------------------- # low disk space @@ -7,6 +11,8 @@ template: disk_space_usage on: disk.space + os: linux + hosts: * families: * calc: $used * 100 / ($avail + $used) units: % @@ -19,6 +25,8 @@ families: * template: disk_inode_usage on: disk.inodes + os: linux + hosts: * families: * calc: $used * 100 / ($avail + $used) units: % @@ -43,6 +51,8 @@ families: * template: disk_fill_rate on: disk.space + os: linux + hosts: * families: * lookup: min -10m at -50m unaligned of avail calc: ($this - $avail) / (($now - $after) / 3600) @@ -57,6 +67,8 @@ families: * template: out_of_disk_space_time on: disk.space + os: linux + hosts: * families: * calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf) units: hours @@ -77,6 +89,8 @@ families: * template: 10min_disk_utilization on: disk.util + os: linux + hosts: * families: * lookup: average -10m unaligned units: % @@ -97,6 +111,8 @@ families: * template: 10min_disk_backlog on: disk.backlog + os: linux + hosts: * families: * lookup: average -10m unaligned units: ms diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf index 5dd8af502..66d44ec13 100644 --- a/conf.d/health.d/entropy.conf +++ b/conf.d/health.d/entropy.conf @@ -5,6 +5,8 @@ alarm: lowest_entropy on: system.entropy + os: linux + hosts: * lookup: min -10m unaligned units: entries every: 5m diff --git a/conf.d/health.d/ipc.conf b/conf.d/health.d/ipc.conf index ee7c4badd..03cf264d8 100644 --- a/conf.d/health.d/ipc.conf +++ b/conf.d/health.d/ipc.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: semaphores_used on: system.ipc_semaphores + os: linux + hosts: * calc: $semaphores * 100 / $ipc.semaphores.max units: % every: 10s @@ -12,6 +16,8 @@ alarm: semaphore_arrays_used on: system.ipc_semaphore_arrays + os: linux + hosts: * calc: $arrays * 100 / $ipc.semaphores.arrays.max units: % every: 10s diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf index 3c904f6b1..4a0e6e522 100644 --- a/conf.d/health.d/memory.conf +++ b/conf.d/health.d/memory.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: 1hour_ecc_memory_correctable on: mem.ecc_ce + os: linux + hosts: * lookup: sum -10m unaligned units: errors every: 1m @@ -11,6 +15,8 @@ alarm: 1hour_ecc_memory_uncorrectable on: mem.ecc_ue + os: linux + hosts: * lookup: sum -10m unaligned units: errors every: 1m @@ -21,6 +27,8 @@ alarm: 1hour_memory_hw_corrupted on: mem.hwcorrupt + os: linux + hosts: * calc: $HardwareCorrupted units: MB every: 10s diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf index bd288817b..00a198612 100644 --- a/conf.d/health.d/net.conf +++ b/conf.d/health.d/net.conf @@ -1,4 +1,6 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + # ----------------------------------------------------------------------------- # dropped packets @@ -8,48 +10,56 @@ template: inbound_packets_dropped on: net.drops + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute of inbound units: packets every: 1m - warn: $this > 0 + warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h info: interface inbound dropped packets in the last 10 minutes to: sysadmin template: outbound_packets_dropped on: net.drops + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute of outbound units: packets every: 1m - warn: $this > 0 + warn: $this >= 5 delay: down 1h multiplier 1.5 max 2h info: interface outbound dropped packets in the last 10 minutes to: sysadmin template: inbound_packets_dropped_ratio on: net.packets + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute of received calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this > 0.5 - crit: $this > 3 + warn: $this >= 0.1 + crit: $this >= 2 delay: down 1h multiplier 1.5 max 2h info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes to: sysadmin template: outbound_packets_dropped_ratio on: net.packets + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute of sent calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0)) units: % every: 1m - warn: $this > 0.5 - crit: $this > 3 + warn: $this >= 0.1 + crit: $this >= 2 delay: down 1h multiplier 1.5 max 2h info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes to: sysadmin @@ -65,6 +75,8 @@ families: * template: 10min_fifo_errors on: net.fifo + os: linux + hosts: * families: * lookup: sum -10m unaligned absolute units: errors @@ -86,6 +98,8 @@ families: * template: 1m_received_packets_rate on: net.packets + os: linux + hosts: * families: * lookup: average -1m of received units: packets @@ -94,6 +108,8 @@ families: * template: 10s_received_packets_storm on: net.packets + os: linux + hosts: * families: * lookup: average -10s of received calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) diff --git a/conf.d/health.d/netfilter.conf b/conf.d/health.d/netfilter.conf index 3dd6a67b3..fa1732b33 100644 --- a/conf.d/health.d/netfilter.conf +++ b/conf.d/health.d/netfilter.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: netfilter_last_collected_secs on: netfilter.conntrack_sockets + os: linux + hosts: * calc: $now - $last_collected_t units: seconds ago every: 10s @@ -12,6 +16,8 @@ alarm: netfilter_conntrack_full on: netfilter.conntrack_sockets + os: linux + hosts: * lookup: max -10s unaligned of connections calc: $this * 100 / $netfilter.conntrack.max units: % diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf index 9e5939fdc..7290d15ff 100644 --- a/conf.d/health.d/qos.conf +++ b/conf.d/health.d/qos.conf @@ -1,10 +1,14 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + # check if a QoS class is dropping packets # the alarm is checked every 10 seconds # and examines the last minute of data #template: 10min_qos_packet_drops # on: tc.qos_dropped +# os: linux +# hosts: * # lookup: sum -10m unaligned absolute # every: 30s # warn: $this > 0 diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf index b99e5e226..8d0e8838d 100644 --- a/conf.d/health.d/ram.conf +++ b/conf.d/health.d/ram.conf @@ -1,12 +1,18 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: used_ram_to_ignore on: system.ram + os: linux + hosts: * calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz) every: 10s info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC) alarm: ram_in_use on: system.ram + os: linux + hosts: * # calc: $used * 100 / ($used + $cached + $free) calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free) units: % diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf index 5faf9a9ee..64e1c6784 100644 --- a/conf.d/health.d/softnet.conf +++ b/conf.d/health.d/softnet.conf @@ -1,7 +1,12 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + # check for common /proc/net/softnet_stat errors alarm: 10min_netdev_backlog_exceeded on: system.softnet_stat + os: linux + hosts: * lookup: sum -10m unaligned absolute of dropped units: packets every: 1m @@ -12,6 +17,8 @@ alarm: 10min_netdev_budget_ran_outs on: system.softnet_stat + os: linux + hosts: * lookup: sum -10m unaligned absolute of squeezed units: events every: 1m diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf index 7f57560e2..830a9af95 100644 --- a/conf.d/health.d/swap.conf +++ b/conf.d/health.d/swap.conf @@ -1,6 +1,10 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + alarm: 30min_ram_swapped_out on: system.swapio + os: linux + hosts: * lookup: sum -30m unaligned absolute of out # we have to convert KB to MB by dividing $this (i.e. the result of the lookup) with 1024 calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) @@ -14,6 +18,8 @@ alarm: ram_in_swap on: system.swap + os: linux + hosts: * calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free ) units: % of RAM every: 10s @@ -25,6 +31,8 @@ alarm: used_swap on: system.swap + os: linux + hosts: * calc: $used * 100 / ( $used + $free ) units: % every: 10s diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf index 803c88a81..fec124ac7 100644 --- a/conf.d/health.d/tcp_resets.conf +++ b/conf.d/health.d/tcp_resets.conf @@ -1,7 +1,12 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + # ----------------------------------------------------------------------------- alarm: ipv4_tcphandshake_last_collected_secs on: ipv4.tcphandshake + os: linux + hosts: * calc: $now - $last_collected_t units: seconds ago every: 10s @@ -16,6 +21,8 @@ alarm: 1m_ipv4_tcp_resets_sent on: ipv4.tcphandshake + os: linux + hosts: * lookup: average -1m at -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s @@ -23,6 +30,8 @@ alarm: 10s_ipv4_tcp_resets_sent on: ipv4.tcphandshake + os: linux + hosts: * lookup: average -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s @@ -37,6 +46,8 @@ options: no-clear-notification alarm: 1m_ipv4_tcp_resets_received on: ipv4.tcphandshake + os: linux + hosts: * lookup: average -1m at -10s unaligned absolute of AttemptFails units: tcp resets/s every: 10s @@ -44,6 +55,8 @@ options: no-clear-notification alarm: 10s_ipv4_tcp_resets_received on: ipv4.tcphandshake + os: linux + hosts: * lookup: average -10s unaligned absolute of AttemptFails units: tcp resets/s every: 10s diff --git a/conf.d/health.d/udp_errors.conf b/conf.d/health.d/udp_errors.conf index 98e955c02..33338b83e 100644 --- a/conf.d/health.d/udp_errors.conf +++ b/conf.d/health.d/udp_errors.conf @@ -1,7 +1,12 @@ + +# you can disable an alarm notification by setting the 'to' line to: silent + # ----------------------------------------------------------------------------- alarm: ipv4_udperrors_last_collected_secs on: ipv4.udperrors + os: linux + hosts: * calc: $now - $last_collected_t units: seconds ago every: 10s @@ -16,6 +21,8 @@ alarm: 1m_ipv4_udp_receive_buffer_errors on: ipv4.udperrors + os: linux + hosts: * lookup: sum -1m unaligned absolute of RcvbufErrors units: errors every: 10s @@ -30,6 +37,8 @@ alarm: 1m_ipv4_udp_send_buffer_errors on: ipv4.udperrors + os: linux + hosts: * lookup: sum -1m unaligned absolute of SndbufErrors units: errors every: 10s |