blob: 678faab4c0f203d6720c6f0f1fbdfdbf26cad919 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
|
# Make sure HDFS data is still being collected.
# $this is the number of seconds since the last successful collection
# ($now - $last_collected_t), evaluated on hdfs.heap_memory every 10s.
template: hdfs_last_collected_secs
on: hdfs.heap_memory
calc: $now - $last_collected_t
units: seconds ago
every: 10s
# Hysteresis: WARNING is raised once the age exceeds 5 update intervals; while the
# alert is already WARNING or worse it is kept as long as the age exceeds 1 interval.
warn: $this > (($status >= $WARNING) ? ($update_every) : ( 5 * $update_every))
# Same pattern for CRITICAL: raised above 60 update intervals, kept while above 1.
crit: $this > (($status == $CRITICAL) ? ($update_every) : (60 * $update_every))
delay: down 5m multiplier 1.5 max 1h
info: number of seconds since the last successful data collection
to: webmaster
# Common
# Percentage of capacity in use on hdfs.capacity: used / (used + remaining) * 100.
template: hdfs_capacity_usage
on: hdfs.capacity
calc: ($used) * 100 / ($used + $remaining)
units: %
every: 10s
# Hysteresis: WARNING is raised above 80% and kept until usage is back at or below 70%.
warn: $this > (($status >= $WARNING) ? (70) : (80))
# CRITICAL is raised above 98% and kept until usage is back at or below 80%.
crit: $this > (($status == $CRITICAL) ? (80) : (98))
delay: down 15m multiplier 1.5 max 1h
info: used capacity
to: sysadmin
# NameNode
# Warn as soon as the $missing value on hdfs.blocks is above zero.
# Only a warning condition is defined; this alert never goes CRITICAL.
template: hdfs_missing_blocks
on: hdfs.blocks
calc: $missing
units: missing blocks
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: missing blocks
to: sysadmin
# Warn as soon as the $stale value on hdfs.data_nodes is above zero.
# Only a warning condition is defined; dead nodes are covered by a separate
# critical alert on the same chart.
template: hdfs_stale_nodes
on: hdfs.data_nodes
calc: $stale
# The value counts stale nodes, so the unit label must say so (was "dead nodes").
units: stale nodes
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: stale data nodes
to: sysadmin
# Go CRITICAL as soon as the $dead value on hdfs.data_nodes is above zero.
# No warning condition is defined; any dead node is treated as critical.
template: hdfs_dead_nodes
on: hdfs.data_nodes
calc: $dead
units: dead nodes
every: 10s
crit: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: dead data nodes
to: sysadmin
# DataNode
# Warn as soon as $fsds_num_failed_volumes on hdfs.num_failed_volumes is above zero.
# Only a warning condition is defined; this alert never goes CRITICAL.
template: hdfs_num_failed_volumes
on: hdfs.num_failed_volumes
calc: $fsds_num_failed_volumes
units: failed volumes
every: 10s
warn: $this > 0
delay: down 15m multiplier 1.5 max 1h
info: failed volumes
to: sysadmin
|