diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2021-05-19 12:33:27 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2021-05-19 12:33:27 +0000 |
commit | 841395dd16f470e3c051a0a4fff5b91efc983c30 (patch) | |
tree | 4115f6eedcddda75067130b80acaff9e51612f49 /health/health.d/hdfs.conf | |
parent | Adding upstream version 1.30.1. (diff) | |
download | netdata-841395dd16f470e3c051a0a4fff5b91efc983c30.tar.xz netdata-841395dd16f470e3c051a0a4fff5b91efc983c30.zip |
Adding upstream version 1.31.0. (ref: upstream/1.31.0)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'health/health.d/hdfs.conf')
-rw-r--r-- | health/health.d/hdfs.conf | 130 |
1 files changed, 74 insertions, 56 deletions
```diff
diff --git a/health/health.d/hdfs.conf b/health/health.d/hdfs.conf
index 7345df4d..bd8308be 100644
--- a/health/health.d/hdfs.conf
+++ b/health/health.d/hdfs.conf
@@ -1,75 +1,93 @@
 # make sure hdfs is running
-template: hdfs_last_collected_secs
- on: hdfs.heap_memory
- calc: $now - $last_collected_t
- units: seconds ago
- every: 10s
- warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
- crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
- delay: down 5m multiplier 1.5 max 1h
- info: number of seconds since the last successful data collection
- to: webmaster
+ template: hdfs_last_collected_secs
+ on: hdfs.heap_memory
+ class: Storage
+component: HDFS
+ type: Latency
+ calc: $now - $last_collected_t
+ units: seconds ago
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
+ crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
+ delay: down 5m multiplier 1.5 max 1h
+ info: number of seconds since the last successful data collection
+ to: webmaster
 # Common
-template: hdfs_capacity_usage
- on: hdfs.capacity
- calc: ($used) * 100 / ($used + $remaining)
- units: %
- every: 10s
- warn: $this > (($status >= $WARNING) ? (70) : (80))
- crit: $this > (($status == $CRITICAL) ? (80) : (98))
- delay: down 15m multiplier 1.5 max 1h
- info: summary datanodes space capacity utilization
- to: sysadmin
+ template: hdfs_capacity_usage
+ on: hdfs.capacity
+ class: Storage
+component: HDFS
+ type: Utilization
+ calc: ($used) * 100 / ($used + $remaining)
+ units: %
+ every: 10s
+ warn: $this > (($status >= $WARNING) ? (70) : (80))
+ crit: $this > (($status == $CRITICAL) ? (80) : (98))
+ delay: down 15m multiplier 1.5 max 1h
+ info: summary datanodes space capacity utilization
+ to: sysadmin
 # NameNode
-template: hdfs_missing_blocks
- on: hdfs.blocks
- calc: $missing
- units: missing blocks
- every: 10s
- warn: $this > 0
- delay: down 15m multiplier 1.5 max 1h
- info: number of missing blocks
- to: sysadmin
+ template: hdfs_missing_blocks
+ on: hdfs.blocks
+ class: Storage
+component: HDFS
+ type: Errors
+ calc: $missing
+ units: missing blocks
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of missing blocks
+ to: sysadmin
-template: hdfs_stale_nodes
- on: hdfs.data_nodes
- calc: $stale
- units: dead nodes
- every: 10s
- warn: $this > 0
- delay: down 15m multiplier 1.5 max 1h
- info: number of datanodes marked stale due to delayed heartbeat
- to: sysadmin
+ template: hdfs_stale_nodes
+ on: hdfs.data_nodes
+ class: Storage
+component: HDFS
+ type: Errors
+ calc: $stale
+ units: dead nodes
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of datanodes marked stale due to delayed heartbeat
+ to: sysadmin
-template: hdfs_dead_nodes
- on: hdfs.data_nodes
- calc: $dead
- units: dead nodes
- every: 10s
- crit: $this > 0
- delay: down 15m multiplier 1.5 max 1h
- info: number of datanodes which are currently dead
- to: sysadmin
+ template: hdfs_dead_nodes
+ on: hdfs.data_nodes
+ class: Storage
+component: HDFS
+ type: Errors
+ calc: $dead
+ units: dead nodes
+ every: 10s
+ crit: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of datanodes which are currently dead
+ to: sysadmin
 # DataNode
-template: hdfs_num_failed_volumes
- on: hdfs.num_failed_volumes
- calc: $fsds_num_failed_volumes
- units: failed volumes
- every: 10s
- warn: $this > 0
- delay: down 15m multiplier 1.5 max 1h
- info: number of failed volumes
- to: sysadmin
+ template: hdfs_num_failed_volumes
+ on: hdfs.num_failed_volumes
+ class: Storage
+component: HDFS
+ type: Errors
+ calc: $fsds_num_failed_volumes
+ units: failed volumes
+ every: 10s
+ warn: $this > 0
+ delay: down 15m multiplier 1.5 max 1h
+ info: number of failed volumes
+ to: sysadmin
```