summaryrefslogtreecommitdiffstats
path: root/health/health.d/hdfs.conf
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--health/health.d/hdfs.conf81
1 files changed, 81 insertions, 0 deletions
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
new file mode 100644
index 00000000..566e815a
--- /dev/null
+++ b/health/health.d/hdfs.conf
@@ -0,0 +1,81 @@
+
+# Common
+
+ template: hdfs_capacity_usage
+ on: hdfs.capacity
+ class: Utilization
+ type: Storage
+component: HDFS
+ calc: ($used) * 100 / ($used + $remaining)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS datanodes space utilization
+ info: summary datanodes space capacity utilization
+ to: sysadmin
+
+
+# NameNode
+
+ template: hdfs_missing_blocks
+ on: hdfs.blocks
+ class: Errors
+ type: Storage
+component: HDFS
+ calc: $missing
+ units: missing blocks
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS missing blocks
+ info: number of missing blocks
+ to: sysadmin
+
+
+ template: hdfs_stale_nodes
+ on: hdfs.data_nodes
+ class: Errors
+ type: Storage
+component: HDFS
+ calc: $stale
+ units: dead nodes
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS stale datanodes
+ info: number of datanodes marked stale due to delayed heartbeat
+ to: sysadmin
+
+
+ template: hdfs_dead_nodes
+ on: hdfs.data_nodes
+ class: Errors
+ type: Storage
+component: HDFS
+ calc: $dead
+ units: dead nodes
+ every: 10s
+ crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS dead datanodes
+ info: number of datanodes which are currently dead
+ to: sysadmin
+
+
+# DataNode
+
+ template: hdfs_num_failed_volumes
+ on: hdfs.num_failed_volumes
+ class: Errors
+ type: Storage
+component: HDFS
+ calc: $fsds_num_failed_volumes
+ units: failed volumes
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ summary: HDFS failed volumes
+ info: number of failed volumes
+ to: sysadmin