diff options
Diffstat (limited to 'health/health.d')
-rw-r--r-- | health/health.d/btrfs.conf | 75 | ||||
-rw-r--r-- | health/health.d/docker.conf | 11 | ||||
-rw-r--r-- | health/health.d/dockerd.conf | 11 | ||||
-rw-r--r-- | health/health.d/windows.conf (renamed from health/health.d/wmi.conf) | 32 |
4 files changed, 102 insertions, 27 deletions
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf index 8d197aa8d..ab63ff28d 100644 --- a/health/health.d/btrfs.conf +++ b/health/health.d/btrfs.conf @@ -66,3 +66,78 @@ component: File system delay: up 1m down 15m multiplier 1.5 max 1h info: utilization of BTRFS system space to: sysadmin + + template: btrfs_device_read_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + families: * + units: errors + lookup: max -10m every 1m of read_errs + warn: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + info: number of encountered BTRFS read errors + to: sysadmin + + template: btrfs_device_write_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + families: * + units: errors + lookup: max -10m every 1m of write_errs + warn: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + info: number of encountered BTRFS write errors + to: sysadmin + + template: btrfs_device_flush_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + families: * + units: errors + lookup: max -10m every 1m of flush_errs + warn: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + info: number of encountered BTRFS flush errors + to: sysadmin + + template: btrfs_device_corruption_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + families: * + units: errors + lookup: max -10m every 1m of corruption_errs + warn: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + info: number of encountered BTRFS corruption errors + to: sysadmin + + template: btrfs_device_generation_errors + on: btrfs.device_errors + class: Errors + type: System +component: File system + os: * + hosts: * + families: * + units: errors + lookup: max -10m every 1m of generation_errs + warn: $this > 0 + delay: up 1m down 15m multiplier 1.5 max 1h + info: number of encountered BTRFS generation errors + to: sysadmin diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf new file mode 100644 index 000000000..f17028472 --- /dev/null +++ b/health/health.d/docker.conf @@ -0,0 +1,11 @@ + template: docker_container_unhealthy + on: docker.container_health_status + class: Errors + type: Containers +component: Docker + units: status + every: 10s + lookup: average -10s of unhealthy + crit: $this > 0 + info: ${label:container_name} docker container health status is unhealthy + to: sysadmin diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf deleted file mode 100644 index 220ddd664..000000000 --- a/health/health.d/dockerd.conf +++ /dev/null @@ -1,11 +0,0 @@ - template: docker_unhealthy_containers - on: docker.unhealthy_containers - class: Errors - type: Containers -component: Docker - units: unhealthy containers - every: 10s - lookup: average -10s - crit: $this > 0 - info: average number of unhealthy docker containers over the last 10 seconds - to: sysadmin diff --git a/health/health.d/wmi.conf b/health/health.d/windows.conf index 90d39ce9d..d678ac3ae 100644 --- a/health/health.d/wmi.conf +++ b/health/health.d/windows.conf @@ -1,8 +1,8 @@ ## CPU - template: wmi_10min_cpu_usage - on: wmi.cpu_utilization_total + template: windows_10min_cpu_usage + on: windows.cpu_utilization_total class: Utilization type: Windows component: CPU @@ -20,8 +20,8 @@ component: CPU ## Memory - template: wmi_ram_in_use - on: wmi.memory_utilization + template: windows_ram_in_use + on: windows.memory_utilization class: Utilization type: Windows component: Memory @@ -36,8 +36,8 @@ component: Memory info: memory utilization to: sysadmin - template: wmi_swap_in_use - on: wmi.memory_swap_utilization + template: windows_swap_in_use + on: windows.memory_swap_utilization class: Utilization type: Windows component: Memory @@ -55,8 +55,8 @@ component: Memory ## Network - template: wmi_inbound_packets_discarded - on: wmi.net_discarded + template: windows_inbound_packets_discarded + on: windows.net_discarded class: Errors type: Windows component: Network @@ -71,8 +71,8 @@ component: Network info: number of inbound discarded packets for the network interface in the last 10 minutes to: sysadmin - template: wmi_outbound_packets_discarded - on: wmi.net_discarded + template: windows_outbound_packets_discarded + on: windows.net_discarded class: Errors type: Windows component: Network @@ -87,8 +87,8 @@ component: Network info: number of outbound discarded packets for the network interface in the last 10 minutes to: sysadmin - template: wmi_inbound_packets_errors - on: wmi.net_errors + template: windows_inbound_packets_errors + on: windows.net_errors class: Errors type: Windows component: Network @@ -103,8 +103,8 @@ component: Network info: number of inbound errors for the network interface in the last 10 minutes to: sysadmin - template: wmi_outbound_packets_errors - on: wmi.net_errors + template: windows_outbound_packets_errors + on: windows.net_errors class: Errors type: Windows component: Network @@ -122,8 +122,8 @@ component: Network ## Disk - template: wmi_disk_in_use - on: wmi.logical_disk_utilization + template: windows_disk_in_use + on: windows.logical_disk_utilization class: Utilization type: Windows component: Disk |