---
plugin_name: go.d.plugin
modules:
- meta:
id: collector-go.d.plugin-hdfs
plugin_name: go.d.plugin
module_name: hdfs
monitored_instance:
name: Hadoop Distributed File System (HDFS)
link: https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html
icon_filename: hadoop.svg
categories:
- data-collection.storage-mount-points-and-filesystems
keywords:
- hdfs
- hadoop
related_resources:
integrations:
list: []
info_provided_to_referring_integrations:
description: ""
most_popular: true
overview:
data_collection:
metrics_description: |
This collector monitors HDFS nodes.
Netdata accesses HDFS metrics over `Java Management Extensions` (JMX) through the web interface of an HDFS daemon.
method_description: ""
supported_platforms:
include: []
exclude: []
multi_instance: true
additional_permissions:
description: ""
default_behavior:
auto_detection:
description: ""
limits:
description: ""
performance_impact:
description: ""
setup:
prerequisites:
list: []
configuration:
file:
name: go.d/hdfs.conf
options:
description: |
The following options can be defined globally: update_every, autodetection_retry.
folding:
title: Config options
enabled: true
list:
- name: update_every
description: Data collection frequency.
default_value: 1
required: false
- name: autodetection_retry
description: Recheck interval in seconds. Zero means no recheck will be scheduled.
default_value: 0
required: false
- name: url
description: Server URL.
default_value: http://127.0.0.1:9870/jmx
required: true
- name: timeout
description: HTTP request timeout.
default_value: 1
required: false
- name: username
description: Username for basic HTTP authentication.
default_value: ""
required: false
- name: password
description: Password for basic HTTP authentication.
default_value: ""
required: false
- name: proxy_url
description: Proxy URL.
default_value: ""
required: false
- name: proxy_username
description: Username for proxy basic HTTP authentication.
default_value: ""
required: false
- name: proxy_password
description: Password for proxy basic HTTP authentication.
default_value: ""
required: false
- name: method
description: HTTP request method.
default_value: "GET"
required: false
- name: body
description: HTTP request body.
default_value: ""
required: false
- name: headers
description: HTTP request headers.
default_value: ""
required: false
- name: not_follow_redirects
description: Redirect handling policy. Controls whether the client follows redirects.
default_value: no
required: false
- name: tls_skip_verify
description: Server certificate chain and hostname validation policy. Controls whether the client performs this check.
default_value: no
required: false
- name: tls_ca
description: Certification authority that the client uses when verifying the server's certificates.
default_value: ""
required: false
- name: tls_cert
description: Client TLS certificate.
default_value: ""
required: false
- name: tls_key
description: Client TLS key.
default_value: ""
required: false
examples:
folding:
title: Config
enabled: true
list:
- name: Basic
folding:
enabled: false
description: A basic example configuration.
config: |
jobs:
- name: local
url: http://127.0.0.1:9870/jmx
- name: HTTP authentication
description: Basic HTTP authentication.
config: |
jobs:
- name: local
url: http://127.0.0.1:9870/jmx
username: username
password: password
- name: HTTPS with self-signed certificate
description: |
Do not validate server certificate chain and hostname.
config: |
jobs:
- name: local
url: https://127.0.0.1:9870/jmx
tls_skip_verify: yes
- name: Multi-instance
description: |
> **Note**: When you define multiple jobs, their names must be unique.
Collecting metrics from local and remote instances.
config: |
jobs:
- name: local
url: http://127.0.0.1:9870/jmx
- name: remote
url: http://192.0.2.1:9870/jmx
troubleshooting:
problems:
list: []
alerts:
- name: hdfs_capacity_usage
metric: hdfs.capacity
info: summary datanodes space capacity utilization
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf
- name: hdfs_missing_blocks
metric: hdfs.blocks
info: number of missing blocks
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf
- name: hdfs_stale_nodes
metric: hdfs.data_nodes
info: number of datanodes marked stale due to delayed heartbeat
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf
- name: hdfs_dead_nodes
metric: hdfs.data_nodes
info: number of datanodes which are currently dead
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf
- name: hdfs_num_failed_volumes
metric: hdfs.num_failed_volumes
info: number of failed volumes
link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf
metrics:
folding:
title: Metrics
enabled: false
description: ""
availability:
- DataNode
- NameNode
scopes:
- name: global
description: These metrics refer to the entire monitored application.
labels: []
metrics:
- name: hdfs.heap_memory
description: Heap Memory
unit: MiB
chart_type: area
dimensions:
- name: committed
- name: used
- name: hdfs.gc_count_total
description: GC Events
unit: events/s
chart_type: line
dimensions:
- name: gc
- name: hdfs.gc_time_total
description: GC Time
unit: ms
chart_type: line
dimensions:
- name: ms
- name: hdfs.gc_threshold
description: Number of Times That the GC Threshold is Exceeded
unit: events/s
chart_type: line
dimensions:
- name: info
- name: warn
- name: hdfs.threads
description: Number of Threads
unit: num
chart_type: stacked
dimensions:
- name: new
- name: runnable
- name: blocked
- name: waiting
- name: timed_waiting
- name: terminated
- name: hdfs.logs_total
description: Number of Logs
unit: logs/s
chart_type: stacked
dimensions:
- name: info
- name: error
- name: warn
- name: fatal
- name: hdfs.rpc_bandwidth
description: RPC Bandwidth
unit: kilobits/s
chart_type: area
dimensions:
- name: received
- name: sent
- name: hdfs.rpc_calls
description: RPC Calls
unit: calls/s
chart_type: line
dimensions:
- name: calls
- name: hdfs.open_connections
description: RPC Open Connections
unit: connections
chart_type: line
dimensions:
- name: open
- name: hdfs.call_queue_length
description: RPC Call Queue Length
unit: num
chart_type: line
dimensions:
- name: length
- name: hdfs.avg_queue_time
description: RPC Avg Queue Time
unit: ms
chart_type: line
dimensions:
- name: time
- name: hdfs.avg_processing_time
description: RPC Avg Processing Time
unit: ms
chart_type: line
dimensions:
- name: time
- name: hdfs.capacity
description: Capacity Across All Datanodes
unit: KiB
chart_type: stacked
availability:
- NameNode
dimensions:
- name: remaining
- name: used
- name: hdfs.used_capacity
description: Used Capacity Across All Datanodes
unit: KiB
chart_type: stacked
availability:
- NameNode
dimensions:
- name: dfs
- name: non_dfs
- name: hdfs.load
description: Number of Concurrent File Accesses (read/write) Across All DataNodes
unit: load
chart_type: line
availability:
- NameNode
dimensions:
- name: load
- name: hdfs.volume_failures_total
description: Number of Volume Failures Across All Datanodes
unit: events/s
chart_type: line
availability:
- NameNode
dimensions:
- name: failures
- name: hdfs.files_total
description: Number of Tracked Files
unit: num
chart_type: line
availability:
- NameNode
dimensions:
- name: files
- name: hdfs.blocks_total
description: Number of Allocated Blocks in the System
unit: num
chart_type: line
availability:
- NameNode
dimensions:
- name: blocks
- name: hdfs.blocks
description: Number of Problem Blocks (can point to an unhealthy cluster)
unit: num
chart_type: line
availability:
- NameNode
dimensions:
- name: corrupt
- name: missing
- name: under_replicated
- name: hdfs.data_nodes
description: Number of Data Nodes By Status
unit: num
chart_type: stacked
availability:
- NameNode
dimensions:
- name: live
- name: dead
- name: stale
- name: hdfs.datanode_capacity
description: Capacity
unit: KiB
chart_type: stacked
availability:
- DataNode
dimensions:
- name: remaining
- name: used
- name: hdfs.datanode_used_capacity
description: Used Capacity
unit: KiB
chart_type: stacked
availability:
- DataNode
dimensions:
- name: dfs
- name: non_dfs
- name: hdfs.datanode_failed_volumes
description: Number of Failed Volumes
unit: num
chart_type: line
availability:
- DataNode
dimensions:
- name: failed volumes
- name: hdfs.datanode_bandwidth
description: Bandwidth
unit: KiB/s
chart_type: area
availability:
- DataNode
dimensions:
- name: reads
- name: writes