summary | refs | log | tree | commit | diff | stats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d')
-rw-r--r-- health/health.d/apcupsd.conf 75
-rw-r--r-- health/health.d/exporting.conf 4
-rw-r--r-- health/health.d/ipmi.conf 2
-rw-r--r-- health/health.d/net.conf 6
-rw-r--r-- health/health.d/nut.conf 50
5 files changed, 81 insertions, 56 deletions
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index fc8f2cd0f..90a72af19 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -48,3 +48,78 @@ component: UPS device
summary: APC UPS last collection
info: APC UPS number of seconds since the last successful data collection
to: sitemgr
+
+# Send out a warning when the SELFTEST code is BT or NG. Code descriptions can be found at:
+#http://www.apcupsd.org/manual/#:~:text=or%20N/A.-,SELFTEST,-The%20results%20of
+ template: apcupsd_selftest_warning
+ on: apcupsd.selftest
+ lookup: max -1s unaligned match-names of BT,NG
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS self-test failed due to insufficient battery capacity or due to overload.
+ to: sitemgr
+
+# Send out a warning when the STATUS code is ONBATT, OVERLOAD, LOWBATT, REPLACEBATT, NOBATT, or COMMLOST
+#https://man.archlinux.org/man/apcaccess.8.en#:~:text=apcupsd%20was%20started-,STATUS,-%3A%20UPS%20status.%20One
+
+ template: apcupsd_status_onbatt
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of ONBATT
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: APC UPS has switched to battery power because the input power has failed
+ to: sitemgr
+
+ template: apcupsd_status_overload
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of OVERLOAD
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS is overloaded and cannot supply enough power to the load
+ to: sitemgr
+
+ template: apcupsd_status_lowbatt
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of LOWBATT
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS battery is low and needs to be recharged
+ to: sitemgr
+
+ template: apcupsd_status_replacebatt
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of REPLACEBATT
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS battery has reached the end of its lifespan and needs to be replaced
+ to: sitemgr
+
+ template: apcupsd_status_nobatt
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of NOBATT
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS has no battery
+ to: sitemgr
+
+ template: apcupsd_status_commlost
+ on: apcupsd.status
+ lookup: max -1s unaligned match-names of COMMLOST
+ units: status
+ every: 10s
+ warn: $this == 1
+ delay: up 0 down 15m multiplier 1.5 max 1h
+ info: APC UPS communication link is lost
+ to: sitemgr
diff --git a/health/health.d/exporting.conf b/health/health.d/exporting.conf
index 37d4fd648..c0320193c 100644
--- a/health/health.d/exporting.conf
+++ b/health/health.d/exporting.conf
@@ -1,6 +1,6 @@
template: exporting_last_buffering
- on: exporting_data_size
+ on: netdata.exporting_data_size
class: Latency
type: Netdata
component: Exporting engine
@@ -15,7 +15,7 @@ component: Exporting engine
to: dba
template: exporting_metrics_sent
- on: exporting_data_size
+ on: netdata.exporting_data_size
class: Workload
type: Netdata
component: Exporting engine
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index 942dc070b..cec2320a9 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -20,7 +20,7 @@ component: IPMI
component: IPMI
calc: $events
units: events
- every: 10s
+ every: 30s
warn: $this > 0
delay: up 5m down 15m multiplier 1.5 max 1h
summary: IPMI entries in System Event Log
diff --git a/health/health.d/net.conf b/health/health.d/net.conf
index ea4954187..2dfe6bbaf 100644
--- a/health/health.d/net.conf
+++ b/health/health.d/net.conf
@@ -11,7 +11,7 @@
component: Network
os: *
hosts: *
- calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max) : ( nan )
+ calc: ( $nic_speed_max > 0 ) ? ( $nic_speed_max / 1000) : ( nan )
units: Mbit
every: 10s
info: Network interface ${label:device} current speed
@@ -24,7 +24,7 @@ component: Network
os: linux
hosts: *
lookup: average -1m unaligned absolute of received
- calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan )
+ calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (85) : (90))
@@ -41,7 +41,7 @@ component: Network
os: linux
hosts: *
lookup: average -1m unaligned absolute of sent
- calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed)) : ( nan )
+ calc: ($interface_speed > 0) ? ($this * 100 / ($interface_speed * 1000)) : ( nan )
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (85) : (90))
diff --git a/health/health.d/nut.conf b/health/health.d/nut.conf
deleted file mode 100644
index 7a74653e9..000000000
--- a/health/health.d/nut.conf
+++ /dev/null
@@ -1,50 +0,0 @@
-# you can disable an alarm notification by setting the 'to' line to: silent
-
- template: nut_10min_ups_load
- on: nut.load
- class: Utilization
- type: Power Supply
-component: UPS
- os: *
- hosts: *
- lookup: average -10m unaligned of load
- units: %
- every: 1m
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
- delay: down 10m multiplier 1.5 max 1h
- summary: UPS load
- info: UPS average load over the last 10 minutes
- to: sitemgr
-
- template: nut_ups_charge
- on: nut.charge
- class: Errors
- type: Power Supply
-component: UPS
- os: *
- hosts: *
- lookup: average -60s unaligned of battery_charge
- units: %
- every: 60s
- warn: $this < 75
- crit: $this < 40
- delay: down 10m multiplier 1.5 max 1h
- summary: UPS battery charge
- info: UPS average battery charge over the last minute
- to: sitemgr
-
- template: nut_last_collected_secs
- on: nut.load
- class: Latency
- type: Power Supply
-component: UPS device
- calc: $now - $last_collected_t
- every: 10s
- units: seconds ago
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- summary: NUT last collected
- info: Number of seconds since the last successful data collection
- to: sitemgr