summary refs log tree commit diff stats
path: root/health/health.d
diff options
context:
space:
mode:
Diffstat (limited to 'health/health.d')
-rw-r--r-- health/health.d/btrfs.conf 75
-rw-r--r-- health/health.d/docker.conf 11
-rw-r--r-- health/health.d/dockerd.conf 11
-rw-r--r-- health/health.d/windows.conf (renamed from health/health.d/wmi.conf) 32
4 files changed, 102 insertions, 27 deletions
diff --git a/health/health.d/btrfs.conf b/health/health.d/btrfs.conf
index 8d197aa8d..ab63ff28d 100644
--- a/health/health.d/btrfs.conf
+++ b/health/health.d/btrfs.conf
@@ -66,3 +66,78 @@ component: File system
delay: up 1m down 15m multiplier 1.5 max 1h
info: utilization of BTRFS system space
to: sysadmin
+
+ template: btrfs_device_read_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ families: *
+ units: errors
+ lookup: max -10m every 1m of read_errs
+ warn: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: number of encountered BTRFS read errors
+ to: sysadmin
+
+ template: btrfs_device_write_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ families: *
+ units: errors
+ lookup: max -10m every 1m of write_errs
+ warn: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: number of encountered BTRFS write errors
+ to: sysadmin
+
+ template: btrfs_device_flush_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ families: *
+ units: errors
+ lookup: max -10m every 1m of flush_errs
+ warn: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: number of encountered BTRFS flush errors
+ to: sysadmin
+
+ template: btrfs_device_corruption_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ families: *
+ units: errors
+ lookup: max -10m every 1m of corruption_errs
+ warn: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: number of encountered BTRFS corruption errors
+ to: sysadmin
+
+ template: btrfs_device_generation_errors
+ on: btrfs.device_errors
+ class: Errors
+ type: System
+component: File system
+ os: *
+ hosts: *
+ families: *
+ units: errors
+ lookup: max -10m every 1m of generation_errs
+ warn: $this > 0
+ delay: up 1m down 15m multiplier 1.5 max 1h
+ info: number of encountered BTRFS generation errors
+ to: sysadmin
diff --git a/health/health.d/docker.conf b/health/health.d/docker.conf
new file mode 100644
index 000000000..f17028472
--- /dev/null
+++ b/health/health.d/docker.conf
@@ -0,0 +1,11 @@
+ template: docker_container_unhealthy
+ on: docker.container_health_status
+ class: Errors
+ type: Containers
+component: Docker
+ units: status
+ every: 10s
+ lookup: average -10s of unhealthy
+ crit: $this > 0
+ info: ${label:container_name} docker container health status is unhealthy
+ to: sysadmin
diff --git a/health/health.d/dockerd.conf b/health/health.d/dockerd.conf
deleted file mode 100644
index 220ddd664..000000000
--- a/health/health.d/dockerd.conf
+++ /dev/null
@@ -1,11 +0,0 @@
- template: docker_unhealthy_containers
- on: docker.unhealthy_containers
- class: Errors
- type: Containers
-component: Docker
- units: unhealthy containers
- every: 10s
- lookup: average -10s
- crit: $this > 0
- info: average number of unhealthy docker containers over the last 10 seconds
- to: sysadmin
diff --git a/health/health.d/wmi.conf b/health/health.d/windows.conf
index 90d39ce9d..d678ac3ae 100644
--- a/health/health.d/wmi.conf
+++ b/health/health.d/windows.conf
@@ -1,8 +1,8 @@
## CPU
- template: wmi_10min_cpu_usage
- on: wmi.cpu_utilization_total
+ template: windows_10min_cpu_usage
+ on: windows.cpu_utilization_total
class: Utilization
type: Windows
component: CPU
@@ -20,8 +20,8 @@ component: CPU
## Memory
- template: wmi_ram_in_use
- on: wmi.memory_utilization
+ template: windows_ram_in_use
+ on: windows.memory_utilization
class: Utilization
type: Windows
component: Memory
@@ -36,8 +36,8 @@ component: Memory
info: memory utilization
to: sysadmin
- template: wmi_swap_in_use
- on: wmi.memory_swap_utilization
+ template: windows_swap_in_use
+ on: windows.memory_swap_utilization
class: Utilization
type: Windows
component: Memory
@@ -55,8 +55,8 @@ component: Memory
## Network
- template: wmi_inbound_packets_discarded
- on: wmi.net_discarded
+ template: windows_inbound_packets_discarded
+ on: windows.net_discarded
class: Errors
type: Windows
component: Network
@@ -71,8 +71,8 @@ component: Network
info: number of inbound discarded packets for the network interface in the last 10 minutes
to: sysadmin
- template: wmi_outbound_packets_discarded
- on: wmi.net_discarded
+ template: windows_outbound_packets_discarded
+ on: windows.net_discarded
class: Errors
type: Windows
component: Network
@@ -87,8 +87,8 @@ component: Network
info: number of outbound discarded packets for the network interface in the last 10 minutes
to: sysadmin
- template: wmi_inbound_packets_errors
- on: wmi.net_errors
+ template: windows_inbound_packets_errors
+ on: windows.net_errors
class: Errors
type: Windows
component: Network
@@ -103,8 +103,8 @@ component: Network
info: number of inbound errors for the network interface in the last 10 minutes
to: sysadmin
- template: wmi_outbound_packets_errors
- on: wmi.net_errors
+ template: windows_outbound_packets_errors
+ on: windows.net_errors
class: Errors
type: Windows
component: Network
@@ -122,8 +122,8 @@ component: Network
## Disk
- template: wmi_disk_in_use
- on: wmi.logical_disk_utilization
+ template: windows_disk_in_use
+ on: windows.logical_disk_utilization
class: Utilization
type: Windows
component: Disk