summary | refs | log | tree | commit | diff | stats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d')
-rw-r--r-- health/health.d/apcupsd.conf 4
-rw-r--r-- health/health.d/bcache.conf 3
-rw-r--r-- health/health.d/beanstalkd.conf 3
-rw-r--r-- health/health.d/bind_rndc.conf 1
-rw-r--r-- health/health.d/boinc.conf 4
-rw-r--r-- health/health.d/cgroups.conf 3
-rw-r--r-- health/health.d/consul.conf 2
-rw-r--r-- health/health.d/cpu.conf 2
-rw-r--r-- health/health.d/dnsmasq_dhcp.conf 1
-rw-r--r-- health/health.d/docker.conf 2
-rw-r--r-- health/health.d/elasticsearch.conf 2
-rw-r--r-- health/health.d/file_descriptors.conf 31
-rw-r--r-- health/health.d/gearman.conf 1
-rw-r--r-- health/health.d/geth.conf 1
-rw-r--r-- health/health.d/ioping.conf 4
-rw-r--r-- health/health.d/ipc.conf 2
-rw-r--r-- health/health.d/ipmi.conf 10
-rw-r--r-- health/health.d/linux_power_supply.conf 1
-rw-r--r-- health/health.d/nut.conf 4
-rw-r--r-- health/health.d/pihole.conf 3
-rw-r--r-- health/health.d/ram.conf 2
-rw-r--r-- health/health.d/scaleio.conf 4
-rw-r--r-- health/health.d/vcsa.conf 3
-rw-r--r-- health/health.d/windows.conf 40
24 files changed, 61 insertions, 72 deletions
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf
index 65f1a69ab..7a0afcd18 100644
--- a/health/health.d/apcupsd.conf
+++ b/health/health.d/apcupsd.conf
@@ -11,7 +11,6 @@ component: UPS
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 10m multiplier 1.5 max 1h
info: average UPS load over the last 10 minutes
to: sitemgr
@@ -29,7 +28,7 @@ component: UPS
units: %
every: 60s
warn: $this < 100
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ crit: $this < 40
delay: down 10m multiplier 1.5 max 1h
info: average UPS charge over the last minute
to: sitemgr
@@ -43,7 +42,6 @@ component: UPS device
every: 10s
units: seconds ago
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: sitemgr
diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf
index 49cb5ad0f..3f92e80df 100644
--- a/health/health.d/bcache.conf
+++ b/health/health.d/bcache.conf
@@ -22,8 +22,7 @@ component: Disk
calc: $dirty + $metadata + $undefined
units: %
every: 1m
- warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) )
- crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
+ warn: $this > 75
delay: up 1m down 1h multiplier 1.5 max 2h
info: percentage of cache space used for dirty data and metadata \
(this usually means your SSD cache is too small)
diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf
index 13ac8c182..4ee8bc0bd 100644
--- a/health/health.d/beanstalkd.conf
+++ b/health/health.d/beanstalkd.conf
@@ -8,8 +8,7 @@ component: Beanstalk
calc: $buried
units: jobs
every: 10s
- warn: $this > 0
- crit: $this > 10
+ warn: $this > 3
delay: up 0 down 5m multiplier 1.2 max 1h
info: number of buried jobs across all tubes. \
You need to manually kick them so they can be processed. \
diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf
index 7c09225ff..b3e75a239 100644
--- a/health/health.d/bind_rndc.conf
+++ b/health/health.d/bind_rndc.conf
@@ -7,6 +7,5 @@ component: BIND
every: 60
calc: $stats_size
warn: $this > 512
- crit: $this > 1024
info: BIND statistics-file size
to: sysadmin
diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf
index 6f37787d7..b7dcbe316 100644
--- a/health/health.d/boinc.conf
+++ b/health/health.d/boinc.conf
@@ -12,7 +12,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this > 0
- crit: $this > 1
delay: up 1m down 5m multiplier 1.5 max 1h
info: average number of compute errors over the last 10 minutes
to: sysadmin
@@ -29,7 +28,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this > 0
- crit: $this > 1
delay: up 1m down 5m multiplier 1.5 max 1h
info: average number of failed uploads over the last 10 minutes
to: sysadmin
@@ -46,7 +44,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this < 1
- crit: $this < 0.1
delay: up 5m down 10m multiplier 1.5 max 1h
info: average number of total tasks over the last 10 minutes
to: sysadmin
@@ -64,7 +61,6 @@ component: BOINC
units: tasks
every: 1m
warn: $this < 1
- crit: $this < 0.1
delay: up 5m down 10m multiplier 1.5 max 1h
info: average number of active tasks over the last 10 minutes
to: sysadmin
diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf
index 08260ff6d..f625e5455 100644
--- a/health/health.d/cgroups.conf
+++ b/health/health.d/cgroups.conf
@@ -64,7 +64,6 @@ component: Network
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status == $CRITICAL)?(5000):(6000))
options: no-clear-notification
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
@@ -83,7 +82,6 @@ component: CPU
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (75) : (85))
- crit: $this > (($status == $CRITICAL) ? (85) : (95))
delay: down 15m multiplier 1.5 max 1h
info: average cgroup CPU utilization over the last 10 minutes
to: sysadmin
@@ -134,7 +132,6 @@ component: Network
every: 10s
units: %
warn: $this > (($status >= $WARNING)?(200):(5000))
- crit: $this > (($status == $CRITICAL)?(5000):(6000))
options: no-clear-notification
info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \
compared to the rate over the last minute
diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf
index dff6d2df3..7edca6563 100644
--- a/health/health.d/consul.conf
+++ b/health/health.d/consul.conf
@@ -10,7 +10,7 @@ component: Consul
units: seconds
warn: $this < 14*24*60*60
crit: $this < 7*24*60*60
- info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter}
+ info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter}
to: sysadmin
template: consul_autopilot_health_status
diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf
index ad6952825..907d6ff8a 100644
--- a/health/health.d/cpu.conf
+++ b/health/health.d/cpu.conf
@@ -28,7 +28,6 @@ component: CPU
units: %
every: 1m
warn: $this > (($status >= $WARNING) ? (20) : (40))
- crit: $this > (($status == $CRITICAL) ? (40) : (50))
delay: down 15m multiplier 1.5 max 1h
info: average CPU iowait time over the last 10 minutes
to: sysadmin
@@ -44,7 +43,6 @@ component: CPU
units: %
every: 5m
warn: $this > (($status >= $WARNING) ? (5) : (10))
- crit: $this > (($status == $CRITICAL) ? (20) : (30))
delay: down 1h multiplier 1.5 max 2h
info: average CPU steal time over the last 20 minutes
to: sysadmin
diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf
index 010b94599..81d37df64 100644
--- a/health/health.d/dnsmasq_dhcp.conf
+++ b/health/health.d/dnsmasq_dhcp.conf
@@ -9,7 +9,6 @@ component: Dnsmasq
units: %
calc: $used
warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) )
- crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) )
delay: down 5m
info: DHCP range utilization
to: sysadmin
diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf
index f17028472..01919dc0d 100644
--- a/health/health.d/docker.conf
+++ b/health/health.d/docker.conf
@@ -6,6 +6,6 @@ component: Docker
units: status
every: 10s
lookup: average -10s of unhealthy
- crit: $this > 0
+ warn: $this > 0
info: ${label:container_name} docker container health status is unhealthy
to: sysadmin
diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf
index 47f8e1eb9..29f1e9b27 100644
--- a/health/health.d/elasticsearch.conf
+++ b/health/health.d/elasticsearch.conf
@@ -10,7 +10,7 @@ component: Elasticsearch
lookup: average -5s unaligned of *ed
every: 10s
units: status
- warn: $this == 1
+ crit: $this == 1
delay: down 5m multiplier 1.5 max 1h
info: cluster health status is red.
to: sysadmin
diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf
new file mode 100644
index 000000000..d136ea517
--- /dev/null
+++ b/health/health.d/file_descriptors.conf
@@ -0,0 +1,31 @@
+ # you can disable an alarm notification by setting the 'to' line to: silent
+
+ template: system_file_descriptors_utilization
+ on: system.file_nr_utilization
+ class: Utilization
+ type: System
+ component: Processes
+ hosts: *
+ lookup: max -1m unaligned
+ units: %
+ every: 1m
+ crit: $this > 90
+ delay: down 15m multiplier 1.5 max 1h
+ info: system-wide utilization of open files
+ to: sysadmin
+
+ template: apps_group_file_descriptors_utilization
+ on: apps.fd_limit
+ class: Utilization
+ type: System
+component: Process
+ os: linux
+ module: !* *
+ hosts: *
+ lookup: max -1m unaligned foreach *
+ units: %
+ every: 1m
+ warn: $this > (($status >= $WARNING) ? (85) : (90))
+ delay: down 15m multiplier 1.5 max 1h
+ info: maximum utilization of open files among all application group PIDs
+ to: sysadmin
diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf
index 14010d445..580d114f8 100644
--- a/health/health.d/gearman.conf
+++ b/health/health.d/gearman.conf
@@ -8,7 +8,6 @@ component: Gearman
units: workers
every: 10s
warn: $this > 30000
- crit: $this > 100000
delay: down 5m multiplier 1.5 max 1h
info: average number of queued jobs over the last 10 minutes
to: sysadmin
diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf
index dd1eb4701..361b6b41f 100644
--- a/health/health.d/geth.conf
+++ b/health/health.d/geth.conf
@@ -8,5 +8,4 @@ component: geth
calc: $chain_head_block - $chain_head_header
units: blocks
warn: $this != 0
- crit: $this > 5
delay: down 1m multiplier 1.5 max 1h
diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf
index 2786cbd62..47ac4453c 100644
--- a/health/health.d/ioping.conf
+++ b/health/health.d/ioping.conf
@@ -6,10 +6,8 @@ component: Disk
lookup: average -10s unaligned of latency
units: microseconds
every: 10s
- green: 5000
- red: 10000
+ green: 10000
warn: $this > $green
- crit: $this > $red
delay: down 30m multiplier 1.5 max 2h
info: average I/O latency over the last 10 seconds
to: sysadmin
diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf
index c178a410a..3d1b46c02 100644
--- a/health/health.d/ipc.conf
+++ b/health/health.d/ipc.conf
@@ -12,7 +12,6 @@ component: IPC
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
info: IPC semaphore utilization
to: sysadmin
@@ -28,7 +27,6 @@ component: IPC
units: %
every: 10s
warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (70) : (90))
delay: down 5m multiplier 1.5 max 1h
info: IPC semaphore arrays utilization
to: sysadmin
diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf
index feadba1b7..4d6478cca 100644
--- a/health/health.d/ipmi.conf
+++ b/health/health.d/ipmi.conf
@@ -1,15 +1,15 @@
- alarm: ipmi_sensors_states
- on: ipmi.sensors_states
+ template: ipmi_sensor_state
+ on: ipmi.sensor_state
class: Errors
type: System
component: IPMI
calc: $warning + $critical
- units: sensors
+ units: state
every: 10s
- warn: $this > 0
+ warn: $warning > 0
crit: $critical > 0
delay: up 5m down 15m multiplier 1.5 max 1h
- info: number of IPMI sensors in non-nominal state
+ info: IPMI sensor ${label:sensor} (${label:component}) state
to: sysadmin
alarm: ipmi_events
diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf
index c0bc6de8a..4562122ca 100644
--- a/health/health.d/linux_power_supply.conf
+++ b/health/health.d/linux_power_supply.conf
@@ -9,7 +9,6 @@ component: Battery
units: %
every: 10s
warn: $this < 10
- crit: $this < 5
delay: up 30s down 5m multiplier 1.2 max 1h
info: percentage of remaining power supply capacity
to: sysadmin
diff --git a/health/health.d/nut.conf b/health/health.d/nut.conf
index 6231dd97b..67843205c 100644
--- a/health/health.d/nut.conf
+++ b/health/health.d/nut.conf
@@ -26,8 +26,8 @@ component: UPS
lookup: average -60s unaligned of battery_charge
units: %
every: 60s
- warn: $this < 100
- crit: $this < (($status == $CRITICAL) ? (60) : (50))
+ warn: $this < 75
+ crit: $this < 40
delay: down 10m multiplier 1.5 max 1h
info: average UPS charge over the last minute
to: sitemgr
diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf
index ee6c57cc5..045930ae5 100644
--- a/health/health.d/pihole.conf
+++ b/health/health.d/pihole.conf
@@ -10,8 +10,7 @@ component: Pi-hole
every: 10s
units: seconds
calc: $ago
- warn: $this > 60 * 60 * 24 * 8
- crit: $this > 60 * 60 * 24 * 8 * 2
+ warn: $this > 60 * 60 * 24 * 30
info: gravity.list (blocklist) file last update time
to: sysadmin
diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf
index ab382c43b..34e5431a8 100644
--- a/health/health.d/ram.conf
+++ b/health/health.d/ram.conf
@@ -28,7 +28,6 @@ component: Memory
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
@@ -74,7 +73,6 @@ component: Memory
units: %
every: 10s
warn: $this < (($status >= $WARNING) ? (15) : (10))
- crit: $this < (($status == $CRITICAL) ? (10) : ( 5))
delay: down 15m multiplier 1.5 max 1h
info: percentage of estimated amount of RAM available for userspace processes, without causing swapping
to: sysadmin
diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf
index ab110bf07..27a857fcd 100644
--- a/health/health.d/scaleio.conf
+++ b/health/health.d/scaleio.conf
@@ -9,8 +9,8 @@ component: ScaleIO
calc: $used
units: %
every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
+ warn: $this > (($status >= $WARNING) ? (80) : (85))
+ crit: $this > (($status == $CRITICAL) ? (85) : (90))
delay: down 15m multiplier 1.5 max 1h
info: storage pool capacity utilization
to: sysadmin
diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf
index a9cc7ceef..bff34cd39 100644
--- a/health/health.d/vcsa.conf
+++ b/health/health.d/vcsa.conf
@@ -133,8 +133,7 @@ component: VMware vCenter
lookup: max -10s unaligned of software_packages
units: status
every: 10s
- warn: $this == 4
- crit: $this == 3
+ warn: ($this == 3) || ($this == 4)
delay: down 1m multiplier 1.5 max 1h
info: software updates availability status \
(-1: unknown, 0: green, 2: orange, 3: red, 4: grey)
diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf
index d4bc7639c..28a886386 100644
--- a/health/health.d/windows.conf
+++ b/health/health.d/windows.conf
@@ -6,7 +6,7 @@
class: Utilization
type: Windows
component: CPU
- os: linux
+ os: *
hosts: *
lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt
units: %
@@ -25,7 +25,7 @@ component: CPU
class: Utilization
type: Windows
component: Memory
- os: linux
+ os: *
hosts: *
calc: ($used) * 100 / ($used + $available)
units: %
@@ -36,31 +36,15 @@ component: Memory
info: memory utilization
to: sysadmin
- template: windows_swap_in_use
- on: windows.memory_swap_utilization
- class: Utilization
- type: Windows
-component: Memory
- os: linux
- hosts: *
- calc: ($used) * 100 / ($used + $available)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (80) : (90))
- crit: $this > (($status == $CRITICAL) ? (90) : (98))
- delay: down 15m multiplier 1.5 max 1h
- info: swap memory utilization
- to: sysadmin
-
## Network
template: windows_inbound_packets_discarded
- on: windows.net_discarded
+ on: windows.net_nic_discarded
class: Errors
type: Windows
component: Network
- os: linux
+ os: *
hosts: *
lookup: sum -10m unaligned absolute match-names of inbound
units: packets
@@ -71,11 +55,11 @@ component: Network
to: sysadmin
template: windows_outbound_packets_discarded
- on: windows.net_discarded
+ on: windows.net_nic_discarded
class: Errors
type: Windows
component: Network
- os: linux
+ os: *
hosts: *
lookup: sum -10m unaligned absolute match-names of outbound
units: packets
@@ -86,11 +70,11 @@ component: Network
to: sysadmin
template: windows_inbound_packets_errors
- on: windows.net_errors
+ on: windows.net_nic_errors
class: Errors
type: Windows
component: Network
- os: linux
+ os: *
hosts: *
lookup: sum -10m unaligned absolute match-names of inbound
units: packets
@@ -101,11 +85,11 @@ component: Network
to: sysadmin
template: windows_outbound_packets_errors
- on: windows.net_errors
+ on: windows.net_nic_errors
class: Errors
type: Windows
component: Network
- os: linux
+ os: *
hosts: *
lookup: sum -10m unaligned absolute match-names of outbound
units: packets
@@ -119,11 +103,11 @@ component: Network
## Disk
template: windows_disk_in_use
- on: windows.logical_disk_utilization
+ on: windows.logical_disk_space_usage
class: Utilization
type: Windows
component: Disk
- os: linux
+ os: *
hosts: *
calc: ($used) * 100 / ($used + $free)
units: %