summary refs log tree commit diff stats
path: root/conf.d/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'conf.d/health.d')
-rw-r--r-- conf.d/health.d/cpu.conf 8
-rw-r--r-- conf.d/health.d/disks.conf 16
-rw-r--r-- conf.d/health.d/entropy.conf 2
-rw-r--r-- conf.d/health.d/ipc.conf 6
-rw-r--r-- conf.d/health.d/memory.conf 8
-rw-r--r-- conf.d/health.d/net.conf 28
-rw-r--r-- conf.d/health.d/netfilter.conf 6
-rw-r--r-- conf.d/health.d/qos.conf 4
-rw-r--r-- conf.d/health.d/ram.conf 6
-rw-r--r-- conf.d/health.d/softnet.conf 7
-rw-r--r-- conf.d/health.d/swap.conf 8
-rw-r--r-- conf.d/health.d/tcp_resets.conf 13
-rw-r--r-- conf.d/health.d/udp_errors.conf 9
13 files changed, 115 insertions, 6 deletions
diff --git a/conf.d/health.d/cpu.conf b/conf.d/health.d/cpu.conf
index 30a71409..db628556 100644
--- a/conf.d/health.d/cpu.conf
+++ b/conf.d/health.d/cpu.conf
@@ -1,6 +1,10 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
template: 10min_cpu_usage
on: system.cpu
+ os: linux
+ hosts: *
lookup: average -10m unaligned of user,system,softirq,irq,guest
units: %
every: 1m
@@ -12,6 +16,8 @@ template: 10min_cpu_usage
template: 10min_cpu_iowait
on: system.cpu
+ os: linux
+ hosts: *
lookup: average -10m unaligned of iowait
units: %
every: 1m
@@ -23,6 +29,8 @@ template: 10min_cpu_iowait
template: 20min_steal_cpu
on: system.cpu
+ os: linux
+ hosts: *
lookup: average -20m unaligned of steal
units: %
every: 5m
diff --git a/conf.d/health.d/disks.conf b/conf.d/health.d/disks.conf
index 9548f9ee..63053491 100644
--- a/conf.d/health.d/disks.conf
+++ b/conf.d/health.d/disks.conf
@@ -1,3 +1,7 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
+
# -----------------------------------------------------------------------------
# low disk space
@@ -7,6 +11,8 @@
template: disk_space_usage
on: disk.space
+ os: linux
+ hosts: *
families: *
calc: $used * 100 / ($avail + $used)
units: %
@@ -19,6 +25,8 @@ families: *
template: disk_inode_usage
on: disk.inodes
+ os: linux
+ hosts: *
families: *
calc: $used * 100 / ($avail + $used)
units: %
@@ -43,6 +51,8 @@ families: *
template: disk_fill_rate
on: disk.space
+ os: linux
+ hosts: *
families: *
lookup: min -10m at -50m unaligned of avail
calc: ($this - $avail) / (($now - $after) / 3600)
@@ -57,6 +67,8 @@ families: *
template: out_of_disk_space_time
on: disk.space
+ os: linux
+ hosts: *
families: *
calc: ($disk_fill_rate > 0) ? ($avail / $disk_fill_rate) : (inf)
units: hours
@@ -77,6 +89,8 @@ families: *
template: 10min_disk_utilization
on: disk.util
+ os: linux
+ hosts: *
families: *
lookup: average -10m unaligned
units: %
@@ -97,6 +111,8 @@ families: *
template: 10min_disk_backlog
on: disk.backlog
+ os: linux
+ hosts: *
families: *
lookup: average -10m unaligned
units: ms
diff --git a/conf.d/health.d/entropy.conf b/conf.d/health.d/entropy.conf
index 5dd8af50..66d44ec1 100644
--- a/conf.d/health.d/entropy.conf
+++ b/conf.d/health.d/entropy.conf
@@ -5,6 +5,8 @@
alarm: lowest_entropy
on: system.entropy
+ os: linux
+ hosts: *
lookup: min -10m unaligned
units: entries
every: 5m
diff --git a/conf.d/health.d/ipc.conf b/conf.d/health.d/ipc.conf
index ee7c4bad..03cf264d 100644
--- a/conf.d/health.d/ipc.conf
+++ b/conf.d/health.d/ipc.conf
@@ -1,6 +1,10 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
alarm: semaphores_used
on: system.ipc_semaphores
+ os: linux
+ hosts: *
calc: $semaphores * 100 / $ipc.semaphores.max
units: %
every: 10s
@@ -12,6 +16,8 @@
alarm: semaphore_arrays_used
on: system.ipc_semaphore_arrays
+ os: linux
+ hosts: *
calc: $arrays * 100 / $ipc.semaphores.arrays.max
units: %
every: 10s
diff --git a/conf.d/health.d/memory.conf b/conf.d/health.d/memory.conf
index 3c904f6b..4a0e6e52 100644
--- a/conf.d/health.d/memory.conf
+++ b/conf.d/health.d/memory.conf
@@ -1,6 +1,10 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
alarm: 1hour_ecc_memory_correctable
on: mem.ecc_ce
+ os: linux
+ hosts: *
lookup: sum -10m unaligned
units: errors
every: 1m
@@ -11,6 +15,8 @@
alarm: 1hour_ecc_memory_uncorrectable
on: mem.ecc_ue
+ os: linux
+ hosts: *
lookup: sum -10m unaligned
units: errors
every: 1m
@@ -21,6 +27,8 @@
alarm: 1hour_memory_hw_corrupted
on: mem.hwcorrupt
+ os: linux
+ hosts: *
calc: $HardwareCorrupted
units: MB
every: 10s
diff --git a/conf.d/health.d/net.conf b/conf.d/health.d/net.conf
index bd288817..00a19861 100644
--- a/conf.d/health.d/net.conf
+++ b/conf.d/health.d/net.conf
@@ -1,4 +1,6 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
# -----------------------------------------------------------------------------
# dropped packets
@@ -8,48 +10,56 @@
template: inbound_packets_dropped
on: net.drops
+ os: linux
+ hosts: *
families: *
lookup: sum -10m unaligned absolute of inbound
units: packets
every: 1m
- warn: $this > 0
+ warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: interface inbound dropped packets in the last 10 minutes
to: sysadmin
template: outbound_packets_dropped
on: net.drops
+ os: linux
+ hosts: *
families: *
lookup: sum -10m unaligned absolute of outbound
units: packets
every: 1m
- warn: $this > 0
+ warn: $this >= 5
delay: down 1h multiplier 1.5 max 2h
info: interface outbound dropped packets in the last 10 minutes
to: sysadmin
template: inbound_packets_dropped_ratio
on: net.packets
+ os: linux
+ hosts: *
families: *
lookup: sum -10m unaligned absolute of received
calc: (($inbound_packets_dropped != nan AND $this > 0) ? ($inbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
- warn: $this > 0.5
- crit: $this > 3
+ warn: $this >= 0.1
+ crit: $this >= 2
delay: down 1h multiplier 1.5 max 2h
info: the ratio of inbound dropped packets vs the total number of received packets of the network interface, during the last 10 minutes
to: sysadmin
template: outbound_packets_dropped_ratio
on: net.packets
+ os: linux
+ hosts: *
families: *
lookup: sum -10m unaligned absolute of sent
calc: (($outbound_packets_dropped != nan AND $this > 0) ? ($outbound_packets_dropped * 100 / $this) : (0))
units: %
every: 1m
- warn: $this > 0.5
- crit: $this > 3
+ warn: $this >= 0.1
+ crit: $this >= 2
delay: down 1h multiplier 1.5 max 2h
info: the ratio of outbound dropped packets vs the total number of sent packets of the network interface, during the last 10 minutes
to: sysadmin
@@ -65,6 +75,8 @@ families: *
template: 10min_fifo_errors
on: net.fifo
+ os: linux
+ hosts: *
families: *
lookup: sum -10m unaligned absolute
units: errors
@@ -86,6 +98,8 @@ families: *
template: 1m_received_packets_rate
on: net.packets
+ os: linux
+ hosts: *
families: *
lookup: average -1m of received
units: packets
@@ -94,6 +108,8 @@ families: *
template: 10s_received_packets_storm
on: net.packets
+ os: linux
+ hosts: *
families: *
lookup: average -10s of received
calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate))
diff --git a/conf.d/health.d/netfilter.conf b/conf.d/health.d/netfilter.conf
index 3dd6a67b..fa1732b3 100644
--- a/conf.d/health.d/netfilter.conf
+++ b/conf.d/health.d/netfilter.conf
@@ -1,6 +1,10 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
alarm: netfilter_last_collected_secs
on: netfilter.conntrack_sockets
+ os: linux
+ hosts: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -12,6 +16,8 @@
alarm: netfilter_conntrack_full
on: netfilter.conntrack_sockets
+ os: linux
+ hosts: *
lookup: max -10s unaligned of connections
calc: $this * 100 / $netfilter.conntrack.max
units: %
diff --git a/conf.d/health.d/qos.conf b/conf.d/health.d/qos.conf
index 9e5939fd..7290d15f 100644
--- a/conf.d/health.d/qos.conf
+++ b/conf.d/health.d/qos.conf
@@ -1,10 +1,14 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
# check if a QoS class is dropping packets
# the alarm is checked every 30 seconds
# and examines the last 10 minutes of data
#template: 10min_qos_packet_drops
# on: tc.qos_dropped
+# os: linux
+# hosts: *
# lookup: sum -10m unaligned absolute
# every: 30s
# warn: $this > 0
diff --git a/conf.d/health.d/ram.conf b/conf.d/health.d/ram.conf
index b99e5e22..8d0e8838 100644
--- a/conf.d/health.d/ram.conf
+++ b/conf.d/health.d/ram.conf
@@ -1,12 +1,18 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
alarm: used_ram_to_ignore
on: system.ram
+ os: linux
+ hosts: *
calc: ($zfs.arc_size.arcsz = nan)?(0):($zfs.arc_size.arcsz)
every: 10s
info: the amount of memory that is reported as used, but it is actually capable for resizing itself based on the system needs (eg. ZFS ARC)
alarm: ram_in_use
on: system.ram
+ os: linux
+ hosts: *
# calc: $used * 100 / ($used + $cached + $free)
calc: ($used - $used_ram_to_ignore) * 100 / ($used - $used_ram_to_ignore + $cached + $free)
units: %
diff --git a/conf.d/health.d/softnet.conf b/conf.d/health.d/softnet.conf
index 5faf9a9e..64e1c678 100644
--- a/conf.d/health.d/softnet.conf
+++ b/conf.d/health.d/softnet.conf
@@ -1,7 +1,12 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
# check for common /proc/net/softnet_stat errors
alarm: 10min_netdev_backlog_exceeded
on: system.softnet_stat
+ os: linux
+ hosts: *
lookup: sum -10m unaligned absolute of dropped
units: packets
every: 1m
@@ -12,6 +17,8 @@
alarm: 10min_netdev_budget_ran_outs
on: system.softnet_stat
+ os: linux
+ hosts: *
lookup: sum -10m unaligned absolute of squeezed
units: events
every: 1m
diff --git a/conf.d/health.d/swap.conf b/conf.d/health.d/swap.conf
index 7f57560e..830a9af9 100644
--- a/conf.d/health.d/swap.conf
+++ b/conf.d/health.d/swap.conf
@@ -1,6 +1,10 @@
+# you can disable an alarm notification by setting the 'to' line to: silent
+
alarm: 30min_ram_swapped_out
on: system.swapio
+ os: linux
+ hosts: *
lookup: sum -30m unaligned absolute of out
# we have to convert KB to MB by dividing $this (i.e. the result of the lookup) by 1024
calc: $this / 1024 * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
@@ -14,6 +18,8 @@
alarm: ram_in_swap
on: system.swap
+ os: linux
+ hosts: *
calc: $used * 100 / ( $system.ram.used + $system.ram.cached + $system.ram.free )
units: % of RAM
every: 10s
@@ -25,6 +31,8 @@
alarm: used_swap
on: system.swap
+ os: linux
+ hosts: *
calc: $used * 100 / ( $used + $free )
units: %
every: 10s
diff --git a/conf.d/health.d/tcp_resets.conf b/conf.d/health.d/tcp_resets.conf
index 803c88a8..fec124ac 100644
--- a/conf.d/health.d/tcp_resets.conf
+++ b/conf.d/health.d/tcp_resets.conf
@@ -1,7 +1,12 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
# -----------------------------------------------------------------------------
alarm: ipv4_tcphandshake_last_collected_secs
on: ipv4.tcphandshake
+ os: linux
+ hosts: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -16,6 +21,8 @@
alarm: 1m_ipv4_tcp_resets_sent
on: ipv4.tcphandshake
+ os: linux
+ hosts: *
lookup: average -1m at -10s unaligned absolute of OutRsts
units: tcp resets/s
every: 10s
@@ -23,6 +30,8 @@
alarm: 10s_ipv4_tcp_resets_sent
on: ipv4.tcphandshake
+ os: linux
+ hosts: *
lookup: average -10s unaligned absolute of OutRsts
units: tcp resets/s
every: 10s
@@ -37,6 +46,8 @@ options: no-clear-notification
alarm: 1m_ipv4_tcp_resets_received
on: ipv4.tcphandshake
+ os: linux
+ hosts: *
lookup: average -1m at -10s unaligned absolute of AttemptFails
units: tcp resets/s
every: 10s
@@ -44,6 +55,8 @@ options: no-clear-notification
alarm: 10s_ipv4_tcp_resets_received
on: ipv4.tcphandshake
+ os: linux
+ hosts: *
lookup: average -10s unaligned absolute of AttemptFails
units: tcp resets/s
every: 10s
diff --git a/conf.d/health.d/udp_errors.conf b/conf.d/health.d/udp_errors.conf
index 98e955c0..33338b83 100644
--- a/conf.d/health.d/udp_errors.conf
+++ b/conf.d/health.d/udp_errors.conf
@@ -1,7 +1,12 @@
+
+# you can disable an alarm notification by setting the 'to' line to: silent
+
# -----------------------------------------------------------------------------
alarm: ipv4_udperrors_last_collected_secs
on: ipv4.udperrors
+ os: linux
+ hosts: *
calc: $now - $last_collected_t
units: seconds ago
every: 10s
@@ -16,6 +21,8 @@
alarm: 1m_ipv4_udp_receive_buffer_errors
on: ipv4.udperrors
+ os: linux
+ hosts: *
lookup: sum -1m unaligned absolute of RcvbufErrors
units: errors
every: 10s
@@ -30,6 +37,8 @@
alarm: 1m_ipv4_udp_send_buffer_errors
on: ipv4.udperrors
+ os: linux
+ hosts: *
lookup: sum -1m unaligned absolute of SndbufErrors
units: errors
every: 10s