diff options
Diffstat (limited to 'health/health.d')
-rw-r--r-- | health/health.d/backend.conf | 42 | ||||
-rw-r--r-- | health/health.d/cgroups.conf | 37 | ||||
-rw-r--r-- | health/health.d/fronius.conf | 14 | ||||
-rw-r--r-- | health/health.d/ioping.conf | 10 | ||||
-rw-r--r-- | health/health.d/net.conf | 8 | ||||
-rw-r--r-- | health/health.d/nut.conf | 47 | ||||
-rw-r--r-- | health/health.d/stiebeleltron.conf | 14 | ||||
-rw-r--r-- | health/health.d/tcp_resets.conf | 2 | ||||
-rw-r--r-- | health/health.d/timex.conf | 2 |
9 files changed, 95 insertions, 81 deletions
diff --git a/health/health.d/backend.conf b/health/health.d/backend.conf deleted file mode 100644 index 91d469395..000000000 --- a/health/health.d/backend.conf +++ /dev/null @@ -1,42 +0,0 @@ -# Alert that backends subsystem will be disabled soon - alarm: backend_metrics_eol - on: netdata.backend_metrics - class: Errors - type: Netdata -component: Exporting engine - units: boolean - calc: $now - $last_collected_t - every: 1m - warn: $this > 0 - delay: down 5m multiplier 1.5 max 1h - info: the backends subsystem is deprecated and will be removed soon. Migrate your configuration to exporting.conf. - to: sysadmin - -# make sure we are sending data to backend - - alarm: backend_last_buffering - on: netdata.backend_metrics - class: Latency - type: Netdata -component: Exporting engine - calc: $now - $last_collected_t - units: seconds ago - every: 10s - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful buffering of backend data - to: dba - - alarm: backend_metrics_sent - on: netdata.backend_metrics - class: Workload - type: Netdata -component: Exporting engine - units: % - calc: abs($sent) * 100 / abs($buffered) - every: 10s - warn: $this != 100 - delay: down 5m multiplier 1.5 max 1h - info: percentage of metrics sent to the backend server - to: dba diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index 45b34806c..aa416c795 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -32,3 +32,40 @@ component: Memory delay: down 15m multiplier 1.5 max 1h info: cgroup memory utilization to: sysadmin + +# ----------------------------------------------------------------------------- +# check for packet storms + +# 1. calculate the rate packets are received in 1m: 1m_received_packets_rate +# 2. do the same for the last 10s +# 3. 
raise an alarm if the latter is 10x or 20x the first +# we assume the minimum packet storm should at least have +# 10000 packets/s, average of the last 10 seconds + + template: cgroup_1m_received_packets_rate + on: cgroup.net_packets + class: Workload + type: Cgroups +component: Network + hosts: * + lookup: average -1m unaligned of received + units: packets + every: 10s + info: average number of packets received by the network interface $family over the last minute + + template: cgroup_10s_received_packets_storm + on: cgroup.net_packets + class: Workload + type: Cgroups +component: Network + hosts: * + lookup: average -10s unaligned of received + calc: $this * 100 / (($1m_received_packets_rate < 1000)?(1000):($1m_received_packets_rate)) + every: 10s + units: % + warn: $this > (($status >= $WARNING)?(200):(5000)) + crit: $this > (($status == $CRITICAL)?(5000):(6000)) + options: no-clear-notification + info: ratio of average number of received packets for the network interface $family over the last 10 seconds, \ + compared to the rate over the last minute + to: sysadmin diff --git a/health/health.d/fronius.conf b/health/health.d/fronius.conf deleted file mode 100644 index 853bd7fbc..000000000 --- a/health/health.d/fronius.conf +++ /dev/null @@ -1,14 +0,0 @@ - template: fronius_last_collected_secs - families: * - on: fronius.power - class: Latency - type: Power Supply -component: Solar - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sitemgr diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index ee4befbea..8b498ad3c 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -4,12 +4,12 @@ class: Latency type: System component: Disk - lookup: average -10s unaligned of average - units: ms + lookup: average -10s unaligned of latency + units: microseconds every: 10s - green: 500 - red: 1000 - warn: $this > $green OR $max > $red + green: 5000 + red: 10000 + warn: $this > $green crit: $this > $red delay: down 30m multiplier 1.5 max 2h info: average I/O latency over the last 10 seconds diff --git a/health/health.d/net.conf b/health/health.d/net.conf index 028ca7b81..9d5b3b8d3 100644 --- a/health/health.d/net.conf +++ b/health/health.d/net.conf @@ -68,7 +68,7 @@ component: Network component: Network os: linux hosts: * - families: !net* * + families: * lookup: sum -10m unaligned absolute of inbound units: packets every: 1m @@ -81,7 +81,7 @@ component: Network component: Network os: linux hosts: * - families: !net* * + families: * lookup: sum -10m unaligned absolute of outbound units: packets every: 1m @@ -94,7 +94,7 @@ component: Network component: Network os: linux hosts: * - families: !net* !wl* * + families: !wl* * lookup: sum -10m unaligned absolute of received calc: (($inbound_packets_dropped != nan AND $this > 10000) ? ($inbound_packets_dropped * 100 / $this) : (0)) units: % @@ -111,7 +111,7 @@ component: Network component: Network os: linux hosts: * - families: !net* !wl* * + families: !wl* * lookup: sum -10m unaligned absolute of sent calc: (($outbound_packets_dropped != nan AND $this > 1000) ? 
($outbound_packets_dropped * 100 / $this) : (0)) units: % diff --git a/health/health.d/nut.conf b/health/health.d/nut.conf new file mode 100644 index 000000000..6231dd97b --- /dev/null +++ b/health/health.d/nut.conf @@ -0,0 +1,47 @@ +# you can disable an alarm notification by setting the 'to' line to: silent + + template: nut_10min_ups_load + on: nut.load + class: Utilization + type: Power Supply +component: UPS + os: * + hosts: * + lookup: average -10m unaligned of load + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (70) : (80)) + crit: $this > (($status == $CRITICAL) ? (85) : (95)) + delay: down 10m multiplier 1.5 max 1h + info: average UPS load over the last 10 minutes + to: sitemgr + + template: nut_ups_charge + on: nut.charge + class: Errors + type: Power Supply +component: UPS + os: * + hosts: * + lookup: average -60s unaligned of battery_charge + units: % + every: 60s + warn: $this < 100 + crit: $this < (($status == $CRITICAL) ? (60) : (50)) + delay: down 10m multiplier 1.5 max 1h + info: average UPS charge over the last minute + to: sitemgr + + template: nut_last_collected_secs + on: nut.load + class: Latency + type: Power Supply +component: UPS device + calc: $now - $last_collected_t + every: 10s + units: seconds ago + warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) + crit: $this > (($status == $CRITICAL) ? 
($update_every) : (60 * $update_every)) + delay: down 5m multiplier 1.5 max 1h + info: number of seconds since the last successful data collection + to: sitemgr diff --git a/health/health.d/stiebeleltron.conf b/health/health.d/stiebeleltron.conf deleted file mode 100644 index 493c8b73a..000000000 --- a/health/health.d/stiebeleltron.conf +++ /dev/null @@ -1,14 +0,0 @@ - template: stiebeleltron_last_collected_secs - families: * - on: stiebeleltron.heating.hc1 - class: Latency - type: Other -component: Sensors - calc: $now - $last_collected_t - every: 10s - units: seconds ago - warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) - delay: down 5m multiplier 1.5 max 1h - info: number of seconds since the last successful data collection - to: sitemgr diff --git a/health/health.d/tcp_resets.conf b/health/health.d/tcp_resets.conf index 190271e47..35cb6366c 100644 --- a/health/health.d/tcp_resets.conf +++ b/health/health.d/tcp_resets.conf @@ -26,7 +26,7 @@ component: Network lookup: average -10s unaligned absolute of OutRsts units: tcp resets/s every: 10s - warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (20))) + warn: $this > ((($1m_ipv4_tcp_resets_sent < 5)?(5):($1m_ipv4_tcp_resets_sent)) * (($status >= $WARNING) ? (1) : (10))) delay: up 20s down 60m multiplier 1.2 max 2h options: no-clear-notification info: average number of sent TCP RESETS over the last 10 seconds. 
\ diff --git a/health/health.d/timex.conf b/health/health.d/timex.conf index ea90c4000..23c18ba10 100644 --- a/health/health.d/timex.conf +++ b/health/health.d/timex.conf @@ -13,5 +13,5 @@ component: Clock every: 10s warn: $system.uptime.uptime > 17 * 60 AND $this == 0 delay: down 5m - info: the system time is not synchronized to a reliable server + info: when set to 0, the system kernel believes the system clock is not properly synchronized to a reliable server to: silent |