diff options
Diffstat (limited to 'health/health.d')
-rw-r--r-- | health/health.d/apcupsd.conf | 4 | ||||
-rw-r--r-- | health/health.d/bcache.conf | 3 | ||||
-rw-r--r-- | health/health.d/beanstalkd.conf | 3 | ||||
-rw-r--r-- | health/health.d/bind_rndc.conf | 1 | ||||
-rw-r--r-- | health/health.d/boinc.conf | 4 | ||||
-rw-r--r-- | health/health.d/cgroups.conf | 3 | ||||
-rw-r--r-- | health/health.d/consul.conf | 2 | ||||
-rw-r--r-- | health/health.d/cpu.conf | 2 | ||||
-rw-r--r-- | health/health.d/dnsmasq_dhcp.conf | 1 | ||||
-rw-r--r-- | health/health.d/docker.conf | 2 | ||||
-rw-r--r-- | health/health.d/elasticsearch.conf | 2 | ||||
-rw-r--r-- | health/health.d/file_descriptors.conf | 31 | ||||
-rw-r--r-- | health/health.d/gearman.conf | 1 | ||||
-rw-r--r-- | health/health.d/geth.conf | 1 | ||||
-rw-r--r-- | health/health.d/ioping.conf | 4 | ||||
-rw-r--r-- | health/health.d/ipc.conf | 2 | ||||
-rw-r--r-- | health/health.d/ipmi.conf | 10 | ||||
-rw-r--r-- | health/health.d/linux_power_supply.conf | 1 | ||||
-rw-r--r-- | health/health.d/nut.conf | 4 | ||||
-rw-r--r-- | health/health.d/pihole.conf | 3 | ||||
-rw-r--r-- | health/health.d/ram.conf | 2 | ||||
-rw-r--r-- | health/health.d/scaleio.conf | 4 | ||||
-rw-r--r-- | health/health.d/vcsa.conf | 3 | ||||
-rw-r--r-- | health/health.d/windows.conf | 40 |
24 files changed, 61 insertions, 72 deletions
diff --git a/health/health.d/apcupsd.conf b/health/health.d/apcupsd.conf index 65f1a69ab..7a0afcd18 100644 --- a/health/health.d/apcupsd.conf +++ b/health/health.d/apcupsd.conf @@ -11,7 +11,6 @@ component: UPS units: % every: 1m warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 10m multiplier 1.5 max 1h info: average UPS load over the last 10 minutes to: sitemgr @@ -29,7 +28,7 @@ component: UPS units: % every: 60s warn: $this < 100 - crit: $this < (($status == $CRITICAL) ? (60) : (50)) + crit: $this < 40 delay: down 10m multiplier 1.5 max 1h info: average UPS charge over the last minute to: sitemgr @@ -43,7 +42,6 @@ component: UPS device every: 10s units: seconds ago warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every)) - crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every)) delay: down 5m multiplier 1.5 max 1h info: number of seconds since the last successful data collection to: sitemgr diff --git a/health/health.d/bcache.conf b/health/health.d/bcache.conf index 49cb5ad0f..3f92e80df 100644 --- a/health/health.d/bcache.conf +++ b/health/health.d/bcache.conf @@ -22,8 +22,7 @@ component: Disk calc: $dirty + $metadata + $undefined units: % every: 1m - warn: $this > ( ($status >= $WARNING ) ? ( 70 ) : ( 90 ) ) - crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) + warn: $this > 75 delay: up 1m down 1h multiplier 1.5 max 2h info: percentage of cache space used for dirty data and metadata \ (this usually means your SSD cache is too small) diff --git a/health/health.d/beanstalkd.conf b/health/health.d/beanstalkd.conf index 13ac8c182..4ee8bc0bd 100644 --- a/health/health.d/beanstalkd.conf +++ b/health/health.d/beanstalkd.conf @@ -8,8 +8,7 @@ component: Beanstalk calc: $buried units: jobs every: 10s - warn: $this > 0 - crit: $this > 10 + warn: $this > 3 delay: up 0 down 5m multiplier 1.2 max 1h info: number of buried jobs across all tubes. 
\ You need to manually kick them so they can be processed. \ diff --git a/health/health.d/bind_rndc.conf b/health/health.d/bind_rndc.conf index 7c09225ff..b3e75a239 100644 --- a/health/health.d/bind_rndc.conf +++ b/health/health.d/bind_rndc.conf @@ -7,6 +7,5 @@ component: BIND every: 60 calc: $stats_size warn: $this > 512 - crit: $this > 1024 info: BIND statistics-file size to: sysadmin diff --git a/health/health.d/boinc.conf b/health/health.d/boinc.conf index 6f37787d7..b7dcbe316 100644 --- a/health/health.d/boinc.conf +++ b/health/health.d/boinc.conf @@ -12,7 +12,6 @@ component: BOINC units: tasks every: 1m warn: $this > 0 - crit: $this > 1 delay: up 1m down 5m multiplier 1.5 max 1h info: average number of compute errors over the last 10 minutes to: sysadmin @@ -29,7 +28,6 @@ component: BOINC units: tasks every: 1m warn: $this > 0 - crit: $this > 1 delay: up 1m down 5m multiplier 1.5 max 1h info: average number of failed uploads over the last 10 minutes to: sysadmin @@ -46,7 +44,6 @@ component: BOINC units: tasks every: 1m warn: $this < 1 - crit: $this < 0.1 delay: up 5m down 10m multiplier 1.5 max 1h info: average number of total tasks over the last 10 minutes to: sysadmin @@ -64,7 +61,6 @@ component: BOINC units: tasks every: 1m warn: $this < 1 - crit: $this < 0.1 delay: up 5m down 10m multiplier 1.5 max 1h info: average number of active tasks over the last 10 minutes to: sysadmin diff --git a/health/health.d/cgroups.conf b/health/health.d/cgroups.conf index 08260ff6d..f625e5455 100644 --- a/health/health.d/cgroups.conf +++ b/health/health.d/cgroups.conf @@ -64,7 +64,6 @@ component: Network every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ compared to the rate over the last minute @@ -83,7 +82,6 @@ component: CPU units: % every: 1m warn: 
$this > (($status >= $WARNING) ? (75) : (85)) - crit: $this > (($status == $CRITICAL) ? (85) : (95)) delay: down 15m multiplier 1.5 max 1h info: average cgroup CPU utilization over the last 10 minutes to: sysadmin @@ -134,7 +132,6 @@ component: Network every: 10s units: % warn: $this > (($status >= $WARNING)?(200):(5000)) - crit: $this > (($status == $CRITICAL)?(5000):(6000)) options: no-clear-notification info: ratio of average number of received packets for the network interface ${label:device} over the last 10 seconds, \ compared to the rate over the last minute diff --git a/health/health.d/consul.conf b/health/health.d/consul.conf index dff6d2df3..7edca6563 100644 --- a/health/health.d/consul.conf +++ b/health/health.d/consul.conf @@ -10,7 +10,7 @@ component: Consul units: seconds warn: $this < 14*24*60*60 crit: $this < 7*24*60*60 - info: Consul Enterprise licence expiration time on node ${label:node_name} datacenter ${label:datacenter} + info: Consul Enterprise license expiration time on node ${label:node_name} datacenter ${label:datacenter} to: sysadmin template: consul_autopilot_health_status diff --git a/health/health.d/cpu.conf b/health/health.d/cpu.conf index ad6952825..907d6ff8a 100644 --- a/health/health.d/cpu.conf +++ b/health/health.d/cpu.conf @@ -28,7 +28,6 @@ component: CPU units: % every: 1m warn: $this > (($status >= $WARNING) ? (20) : (40)) - crit: $this > (($status == $CRITICAL) ? (40) : (50)) delay: down 15m multiplier 1.5 max 1h info: average CPU iowait time over the last 10 minutes to: sysadmin @@ -44,7 +43,6 @@ component: CPU units: % every: 5m warn: $this > (($status >= $WARNING) ? (5) : (10)) - crit: $this > (($status == $CRITICAL) ? 
(20) : (30)) delay: down 1h multiplier 1.5 max 2h info: average CPU steal time over the last 20 minutes to: sysadmin diff --git a/health/health.d/dnsmasq_dhcp.conf b/health/health.d/dnsmasq_dhcp.conf index 010b94599..81d37df64 100644 --- a/health/health.d/dnsmasq_dhcp.conf +++ b/health/health.d/dnsmasq_dhcp.conf @@ -9,7 +9,6 @@ component: Dnsmasq units: % calc: $used warn: $this > ( ($status >= $WARNING ) ? ( 80 ) : ( 90 ) ) - crit: $this > ( ($status == $CRITICAL) ? ( 90 ) : ( 95 ) ) delay: down 5m info: DHCP range utilization to: sysadmin diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf index f17028472..01919dc0d 100644 --- a/health/health.d/docker.conf +++ b/health/health.d/docker.conf @@ -6,6 +6,6 @@ component: Docker units: status every: 10s lookup: average -10s of unhealthy - crit: $this > 0 + warn: $this > 0 info: ${label:container_name} docker container health status is unhealthy to: sysadmin diff --git a/health/health.d/elasticsearch.conf b/health/health.d/elasticsearch.conf index 47f8e1eb9..29f1e9b27 100644 --- a/health/health.d/elasticsearch.conf +++ b/health/health.d/elasticsearch.conf @@ -10,7 +10,7 @@ component: Elasticsearch lookup: average -5s unaligned of red every: 10s units: status - warn: $this == 1 + crit: $this == 1 delay: down 5m multiplier 1.5 max 1h info: cluster health status is red. 
to: sysadmin diff --git a/health/health.d/file_descriptors.conf b/health/health.d/file_descriptors.conf new file mode 100644 index 000000000..d136ea517 --- /dev/null +++ b/health/health.d/file_descriptors.conf @@ -0,0 +1,31 @@ + # you can disable an alarm notification by setting the 'to' line to: silent + + template: system_file_descriptors_utilization + on: system.file_nr_utilization + class: Utilization + type: System + component: Processes + hosts: * + lookup: max -1m unaligned + units: % + every: 1m + crit: $this > 90 + delay: down 15m multiplier 1.5 max 1h + info: system-wide utilization of open files + to: sysadmin + + template: apps_group_file_descriptors_utilization + on: apps.fd_limit + class: Utilization + type: System +component: Process + os: linux + module: !* * + hosts: * + lookup: max -1m unaligned foreach * + units: % + every: 1m + warn: $this > (($status >= $WARNING) ? (85) : (90)) + delay: down 15m multiplier 1.5 max 1h + info: maximum utilization of open files among all application group PIDs + to: sysadmin diff --git a/health/health.d/gearman.conf b/health/health.d/gearman.conf index 14010d445..580d114f8 100644 --- a/health/health.d/gearman.conf +++ b/health/health.d/gearman.conf @@ -8,7 +8,6 @@ component: Gearman units: workers every: 10s warn: $this > 30000 - crit: $this > 100000 delay: down 5m multiplier 1.5 max 1h info: average number of queued jobs over the last 10 minutes to: sysadmin diff --git a/health/health.d/geth.conf b/health/health.d/geth.conf index dd1eb4701..361b6b41f 100644 --- a/health/health.d/geth.conf +++ b/health/health.d/geth.conf @@ -8,5 +8,4 @@ component: geth calc: $chain_head_block - $chain_head_header units: blocks warn: $this != 0 - crit: $this > 5 delay: down 1m multiplier 1.5 max 1h diff --git a/health/health.d/ioping.conf b/health/health.d/ioping.conf index 2786cbd62..47ac4453c 100644 --- a/health/health.d/ioping.conf +++ b/health/health.d/ioping.conf @@ -6,10 +6,8 @@ component: Disk lookup: average -10s unaligned 
of latency units: microseconds every: 10s - green: 5000 - red: 10000 + green: 10000 warn: $this > $green - crit: $this > $red delay: down 30m multiplier 1.5 max 2h info: average I/O latency over the last 10 seconds to: sysadmin diff --git a/health/health.d/ipc.conf b/health/health.d/ipc.conf index c178a410a..3d1b46c02 100644 --- a/health/health.d/ipc.conf +++ b/health/health.d/ipc.conf @@ -12,7 +12,6 @@ component: IPC units: % every: 10s warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (70) : (90)) delay: down 5m multiplier 1.5 max 1h info: IPC semaphore utilization to: sysadmin @@ -28,7 +27,6 @@ component: IPC units: % every: 10s warn: $this > (($status >= $WARNING) ? (70) : (80)) - crit: $this > (($status == $CRITICAL) ? (70) : (90)) delay: down 5m multiplier 1.5 max 1h info: IPC semaphore arrays utilization to: sysadmin diff --git a/health/health.d/ipmi.conf b/health/health.d/ipmi.conf index feadba1b7..4d6478cca 100644 --- a/health/health.d/ipmi.conf +++ b/health/health.d/ipmi.conf @@ -1,15 +1,15 @@ - alarm: ipmi_sensors_states - on: ipmi.sensors_states + template: ipmi_sensor_state + on: ipmi.sensor_state class: Errors type: System component: IPMI calc: $warning + $critical - units: sensors + units: state every: 10s - warn: $this > 0 + warn: $warning > 0 crit: $critical > 0 delay: up 5m down 15m multiplier 1.5 max 1h - info: number of IPMI sensors in non-nominal state + info: IPMI sensor ${label:sensor} (${label:component}) state to: sysadmin alarm: ipmi_events diff --git a/health/health.d/linux_power_supply.conf b/health/health.d/linux_power_supply.conf index c0bc6de8a..4562122ca 100644 --- a/health/health.d/linux_power_supply.conf +++ b/health/health.d/linux_power_supply.conf @@ -9,7 +9,6 @@ component: Battery units: % every: 10s warn: $this < 10 - crit: $this < 5 delay: up 30s down 5m multiplier 1.2 max 1h info: percentage of remaining power supply capacity to: sysadmin diff --git a/health/health.d/nut.conf 
b/health/health.d/nut.conf index 6231dd97b..67843205c 100644 --- a/health/health.d/nut.conf +++ b/health/health.d/nut.conf @@ -26,8 +26,8 @@ component: UPS lookup: average -60s unaligned of battery_charge units: % every: 60s - warn: $this < 100 - crit: $this < (($status == $CRITICAL) ? (60) : (50)) + warn: $this < 75 + crit: $this < 40 delay: down 10m multiplier 1.5 max 1h info: average UPS charge over the last minute to: sitemgr diff --git a/health/health.d/pihole.conf b/health/health.d/pihole.conf index ee6c57cc5..045930ae5 100644 --- a/health/health.d/pihole.conf +++ b/health/health.d/pihole.conf @@ -10,8 +10,7 @@ component: Pi-hole every: 10s units: seconds calc: $ago - warn: $this > 60 * 60 * 24 * 8 - crit: $this > 60 * 60 * 24 * 8 * 2 + warn: $this > 60 * 60 * 24 * 30 info: gravity.list (blocklist) file last update time to: sysadmin diff --git a/health/health.d/ram.conf b/health/health.d/ram.conf index ab382c43b..34e5431a8 100644 --- a/health/health.d/ram.conf +++ b/health/health.d/ram.conf @@ -28,7 +28,6 @@ component: Memory units: % every: 10s warn: $this < (($status >= $WARNING) ? (15) : (10)) - crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) delay: down 15m multiplier 1.5 max 1h info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin @@ -74,7 +73,6 @@ component: Memory units: % every: 10s warn: $this < (($status >= $WARNING) ? (15) : (10)) - crit: $this < (($status == $CRITICAL) ? (10) : ( 5)) delay: down 15m multiplier 1.5 max 1h info: percentage of estimated amount of RAM available for userspace processes, without causing swapping to: sysadmin diff --git a/health/health.d/scaleio.conf b/health/health.d/scaleio.conf index ab110bf07..27a857fcd 100644 --- a/health/health.d/scaleio.conf +++ b/health/health.d/scaleio.conf @@ -9,8 +9,8 @@ component: ScaleIO calc: $used units: % every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) + warn: $this > (($status >= $WARNING) ? (80) : (85)) + crit: $this > (($status == $CRITICAL) ? (85) : (90)) delay: down 15m multiplier 1.5 max 1h info: storage pool capacity utilization to: sysadmin diff --git a/health/health.d/vcsa.conf b/health/health.d/vcsa.conf index a9cc7ceef..bff34cd39 100644 --- a/health/health.d/vcsa.conf +++ b/health/health.d/vcsa.conf @@ -133,8 +133,7 @@ component: VMware vCenter lookup: max -10s unaligned of software_packages units: status every: 10s - warn: $this == 4 - crit: $this == 3 + warn: ($this == 3) || ($this == 4) delay: down 1m multiplier 1.5 max 1h info: software updates availability status \ (-1: unknown, 0: green, 2: orange, 3: red, 4: grey) diff --git a/health/health.d/windows.conf b/health/health.d/windows.conf index d4bc7639c..28a886386 100644 --- a/health/health.d/windows.conf +++ b/health/health.d/windows.conf @@ -6,7 +6,7 @@ class: Utilization type: Windows component: CPU - os: linux + os: * hosts: * lookup: average -10m unaligned match-names of dpc,user,privileged,interrupt units: % @@ -25,7 +25,7 @@ component: CPU class: Utilization type: Windows component: Memory - os: linux + os: * hosts: * calc: ($used) * 100 / ($used + $available) units: % @@ -36,31 +36,15 @@ component: Memory info: memory utilization to: sysadmin - template: windows_swap_in_use - on: windows.memory_swap_utilization - class: Utilization - type: Windows -component: Memory - os: linux - hosts: * - calc: ($used) * 100 / ($used + $available) - units: % - every: 10s - warn: $this > (($status >= $WARNING) ? (80) : (90)) - crit: $this > (($status == $CRITICAL) ? 
(90) : (98)) - delay: down 15m multiplier 1.5 max 1h - info: swap memory utilization - to: sysadmin - ## Network template: windows_inbound_packets_discarded - on: windows.net_discarded + on: windows.net_nic_discarded class: Errors type: Windows component: Network - os: linux + os: * hosts: * lookup: sum -10m unaligned absolute match-names of inbound units: packets @@ -71,11 +55,11 @@ component: Network to: sysadmin template: windows_outbound_packets_discarded - on: windows.net_discarded + on: windows.net_nic_discarded class: Errors type: Windows component: Network - os: linux + os: * hosts: * lookup: sum -10m unaligned absolute match-names of outbound units: packets @@ -86,11 +70,11 @@ component: Network to: sysadmin template: windows_inbound_packets_errors - on: windows.net_errors + on: windows.net_nic_errors class: Errors type: Windows component: Network - os: linux + os: * hosts: * lookup: sum -10m unaligned absolute match-names of inbound units: packets @@ -101,11 +85,11 @@ component: Network to: sysadmin template: windows_outbound_packets_errors - on: windows.net_errors + on: windows.net_nic_errors class: Errors type: Windows component: Network - os: linux + os: * hosts: * lookup: sum -10m unaligned absolute match-names of outbound units: packets @@ -119,11 +103,11 @@ component: Network ## Disk template: windows_disk_in_use - on: windows.logical_disk_utilization + on: windows.logical_disk_space_usage class: Utilization type: Windows component: Disk - os: linux + os: * hosts: * calc: ($used) * 100 / ($used + $free) units: % |