summaryrefslogtreecommitdiffstats
path: root/health/health.d/hdfs.conf
blob: bd8308bedc74b83f98432205d6858c91e9eb1095 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
# make sure hdfs is running

 # Staleness check for HDFS data collection, applied to hdfs.heap_memory charts.
 # calc yields the number of seconds elapsed since the chart was last collected.
 # Thresholds are multiples of the chart's $update_every and use hysteresis:
 #   warn - raised above 5 intervals; once WARNING (or worse) it is held until
 #          the value is back within 1 interval.
 #   crit - raised above 60 intervals; once CRITICAL it is held until the value
 #          is back within 1 interval.
 # Unlike the other alarms in this file, notifications go to the webmaster role.
 template: hdfs_last_collected_secs
       on: hdfs.heap_memory
    class: Storage
component: HDFS
     type: Latency
     calc: $now - $last_collected_t
    units: seconds ago
    every: 10s
     warn: $this > (($status >= $WARNING)  ? ($update_every) : ( 5 * $update_every))
     crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
    delay: down 5m multiplier 1.5 max 1h
     info: number of seconds since the last successful data collection
       to: webmaster


# Common

 # Space utilization alarm, applied to hdfs.capacity charts.
 # calc expresses $used as a percentage of ($used + $remaining).
 # Thresholds use hysteresis so the alarm does not flap around a boundary:
 #   warn - raised above 80%; once WARNING (or worse) it is held while above 70%.
 #   crit - raised above 98%; once CRITICAL it is held while above 80%.
 template: hdfs_capacity_usage
       on: hdfs.capacity
    class: Storage
component: HDFS
     type: Utilization
     calc: ($used) * 100 / ($used + $remaining)
    units: %
    every: 10s
     warn: $this > (($status >= $WARNING)  ? (70) : (80))
     crit: $this > (($status == $CRITICAL) ? (80) : (98))
    delay: down 15m multiplier 1.5 max 1h
     info: summary datanodes space capacity utilization
       to: sysadmin


# NameNode

 # Missing-blocks alarm, applied to hdfs.blocks charts.
 # Evaluated every 10s on the chart's $missing dimension: any value above zero
 # raises WARNING. No crit expression is defined, so this alarm never escalates
 # to CRITICAL.
 template: hdfs_missing_blocks
       on: hdfs.blocks
    class: Storage
component: HDFS
     type: Errors
     calc: $missing
    units: missing blocks
    every: 10s
     warn: $this > 0
    delay: down 15m multiplier 1.5 max 1h
     info: number of missing blocks
       to: sysadmin


 # Stale-datanodes alarm, applied to hdfs.data_nodes charts.
 # Evaluated every 10s on the chart's $stale dimension: any value above zero
 # raises WARNING. No crit expression is defined; dead nodes are covered by the
 # separate hdfs_dead_nodes template on the same chart.
 # The units label says "stale nodes" so it matches what calc and info describe.
 template: hdfs_stale_nodes
       on: hdfs.data_nodes
    class: Storage
component: HDFS
     type: Errors
     calc: $stale
    units: stale nodes
    every: 10s
     warn: $this > 0
    delay: down 15m multiplier 1.5 max 1h
     info: number of datanodes marked stale due to delayed heartbeat
       to: sysadmin


 # Dead-datanodes alarm, applied to hdfs.data_nodes charts.
 # Evaluated every 10s on the chart's $dead dimension: any value above zero
 # raises CRITICAL directly. No warn expression is defined, so there is no
 # intermediate WARNING state for this alarm.
 template: hdfs_dead_nodes
       on: hdfs.data_nodes
    class: Storage
component: HDFS
     type: Errors
     calc: $dead
    units: dead nodes
    every: 10s
     crit: $this > 0
    delay: down 15m multiplier 1.5 max 1h
     info: number of datanodes which are currently dead
       to: sysadmin


# DataNode

 # Failed-volumes alarm, applied to hdfs.num_failed_volumes charts.
 # Evaluated every 10s on the chart's $fsds_num_failed_volumes dimension: any
 # value above zero raises WARNING. No crit expression is defined, so this alarm
 # never escalates to CRITICAL.
 template: hdfs_num_failed_volumes
       on: hdfs.num_failed_volumes
    class: Storage
component: HDFS
     type: Errors
     calc: $fsds_num_failed_volumes
    units: failed volumes
    every: 10s
     warn: $this > 0
    delay: down 15m multiplier 1.5 max 1h
     info: number of failed volumes
       to: sysadmin