summary refs log tree commit diff stats
path: root/health/health.d/mdstat.conf
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 14:31:17 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-04 14:31:17 +0000
commit 8020f71afd34d7696d7933659df2d763ab05542f (patch)
tree 2fdf1b5447ffd8bdd61e702ca183e814afdcb4fc /health/health.d/mdstat.conf
parent Initial commit. (diff)
download netdata-8020f71afd34d7696d7933659df2d763ab05542f.tar.xz
netdata-8020f71afd34d7696d7933659df2d763ab05542f.zip
Adding upstream version 1.37.1. upstream/1.37.1 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- health/health.d/mdstat.conf 52
1 files changed, 52 insertions, 0 deletions
diff --git a/health/health.d/mdstat.conf b/health/health.d/mdstat.conf
new file mode 100644
index 0000000..cedaa00
--- /dev/null
+++ b/health/health.d/mdstat.conf
@@ -0,0 +1,52 @@
+ template: mdstat_last_collected
+ on: md.disks
+ every: 10s
+ units: seconds ago
+ calc: $now - $last_collected_t
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ class: Latency
+ type: System
+component: RAID
+ to: sysadmin
+ info: number of seconds since the last successful data collection
+
+ template: mdstat_disks
+ on: md.disks
+ every: 10s
+ units: failed devices
+ calc: $down
+ crit: $this > 0
+ class: Errors
+ type: System
+component: RAID
+ to: sysadmin
+ info: number of devices in the down state for the $family array. \
+ Any number > 0 indicates that the array is degraded.
+
+ template: mdstat_mismatch_cnt
+ on: md.mismatch_cnt
+ families: !*(raid1) !*(raid10) *
+ every: 60s
+ units: unsynchronized blocks
+ calc: $count
+ warn: $this > 1024
+ delay: up 30m
+ class: Errors
+ type: System
+component: RAID
+ to: sysadmin
+ info: number of unsynchronized blocks for the $family array
+
+ template: mdstat_nonredundant_last_collected
+ on: md.nonredundant
+ every: 10s
+ units: seconds ago
+ calc: $now - $last_collected_t
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ class: Latency
+ type: System
+component: RAID
+ to: sysadmin
+ info: number of seconds since the last successful data collection