1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
|
// SPDX-License-Identifier: GPL-3.0-or-later
package hdfs
// HDFS architecture overview (NameNode and DataNodes):
// https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html#NameNode+and+DataNodes
// Description of the metrics exposed by Hadoop:
// https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Metrics.html
// A good article on monitoring HDFS metrics:
// https://www.datadoghq.com/blog/monitor-hadoop-metrics/#hdfs-metrics
// metrics is the top-level container that groups every metric family declared
// in this file. Each field is a pointer to one family and carries an `stm` tag
// holding that family's short key ("jvm", "rpc", "fsns", "fsds", "dna").
// The trailing comment on each field marks which kind of HDFS node the family
// applies to: namenode, datanode, or both.
// NOTE(review): the pointer fields presumably stay nil for families the
// monitored node does not report — confirm against the collection code.
type metrics struct {
Jvm *jvmMetrics `stm:"jvm"` // both
Rpc *rpcActivityMetrics `stm:"rpc"` // both
FSNameSystem *fsNameSystemMetrics `stm:"fsns"` // namenode
FSDatasetState *fsDatasetStateMetrics `stm:"fsds"` // datanode
DataNodeActivity *dataNodeActivityMetrics `stm:"dna"` // datanode
}
// jvmMetrics holds JVM-level values: heap memory, garbage collection, thread
// state counts and log event counts, plus two string tags identifying the
// process and the host.
//
// Only ProcessName and HostName carry an explicit `json` key; the numeric
// fields have no `json` tag.
// NOTE(review): the untagged fields presumably rely on the JSON decoder
// matching keys against the Go field name — confirm against the decoding code.
//
// The `stm` tag holds the metric name; some tags carry a ",1000,1" suffix.
// NOTE(review): the suffix is presumably a multiplier/divisor pair used to
// keep fractional precision — confirm against the stm package.
//
// Commented-out fields are not part of the struct; they appear to record
// further attributes that are available but currently unused.
type jvmMetrics struct {
// Identification tags (strings, no `stm` tag).
ProcessName string `json:"tag.ProcessName"`
HostName string `json:"tag.Hostname"`
// Memory.
// NOTE(review): the "M" suffix presumably means megabytes — confirm against
// the Hadoop metrics documentation.
//MemNonHeapUsedM float64 `stm:"mem_non_heap_used,1000,1"`
//MemNonHeapCommittedM float64 `stm:"mem_non_heap_committed,1000,1"`
//MemNonHeapMaxM float64 `stm:"mem_non_heap_max"`
MemHeapUsedM float64 `stm:"mem_heap_used,1000,1"`
MemHeapCommittedM float64 `stm:"mem_heap_committed,1000,1"`
MemHeapMaxM float64 `stm:"mem_heap_max"`
//MemMaxM float64 `stm:"mem_max"`
// Garbage collection.
GcCount float64 `stm:"gc_count"`
GcTimeMillis float64 `stm:"gc_time_millis"`
GcNumWarnThresholdExceeded float64 `stm:"gc_num_warn_threshold_exceeded"`
GcNumInfoThresholdExceeded float64 `stm:"gc_num_info_threshold_exceeded"`
GcTotalExtraSleepTime float64 `stm:"gc_total_extra_sleep_time"`
// Thread counts, one field per thread state.
ThreadsNew float64 `stm:"threads_new"`
ThreadsRunnable float64 `stm:"threads_runnable"`
ThreadsBlocked float64 `stm:"threads_blocked"`
ThreadsWaiting float64 `stm:"threads_waiting"`
ThreadsTimedWaiting float64 `stm:"threads_timed_waiting"`
ThreadsTerminated float64 `stm:"threads_terminated"`
// Log event counts, one field per severity level.
LogFatal float64 `stm:"log_fatal"`
LogError float64 `stm:"log_error"`
LogWarn float64 `stm:"log_warn"`
LogInfo float64 `stm:"log_info"`
}
// rpcActivityMetrics holds RPC-level values: bytes received and sent, queue
// and processing time figures, the number of open connections and the call
// queue length.
//
// None of the fields carries a `json` tag.
// NOTE(review): they presumably rely on the JSON decoder matching keys against
// the Go field name — confirm against the decoding code.
//
// The `stm` metric names drop the "Rpc" prefix of the field name (for example
// RpcQueueTimeNumOps is tagged "queue_time_num_ops"). The two average-time
// fields carry a ",1000,1" suffix.
// NOTE(review): the suffix is presumably a multiplier/divisor pair used to
// keep fractional precision — confirm against the stm package.
//
// Commented-out fields are not part of the struct; they appear to record
// further attributes that are available but currently unused.
type rpcActivityMetrics struct {
ReceivedBytes float64 `stm:"received_bytes"`
SentBytes float64 `stm:"sent_bytes"`
RpcQueueTimeNumOps float64 `stm:"queue_time_num_ops"`
RpcQueueTimeAvgTime float64 `stm:"queue_time_avg_time,1000,1"`
//RpcProcessingTimeNumOps float64
RpcProcessingTimeAvgTime float64 `stm:"processing_time_avg_time,1000,1"`
//DeferredRpcProcessingTimeNumOps float64
//DeferredRpcProcessingTimeAvgTime float64
//RpcAuthenticationFailures float64
//RpcAuthenticationSuccesses float64
//RpcAuthorizationFailures float64
//RpcAuthorizationSuccesses float64
//RpcClientBackoff float64
//RpcSlowCalls float64
NumOpenConnections float64 `stm:"num_open_connections"`
CallQueueLength float64 `stm:"call_queue_length"`
//NumDroppedConnections float64
}
// fsNameSystemMetrics holds the file-system-wide values of the "fsns" family,
// which the metrics struct marks as namenode-only: capacity figures, block and
// file counts, block health (missing, under-replicated, corrupt), datanode
// liveness counts and volume failures, plus the host name and HA state tags.
//
// Tag details worth noting:
//   - HostName and HAState are strings bound to the "tag.Hostname" and
//     "tag.HAState" JSON keys and have no `stm` tag.
//   - CapacityDfsUsed is bound to the JSON key "CapacityUsed" but is exported
//     under the metric name "capacity_used_dfs".
//   - CapacityUsed is tagged `json:"-"` and exported as "capacity_used"; it
//     sits under the "custom attributes" comment.
//     NOTE(review): it is presumably computed by the collector rather than
//     decoded — confirm against the collection code.
//
// NOTE(review): fields without a `json` tag presumably rely on the JSON
// decoder matching keys against the Go field name — confirm.
//
// Commented-out fields are not part of the struct; they appear to record
// further attributes that are available but currently unused.
type fsNameSystemMetrics struct {
HostName string `json:"tag.Hostname"`
HAState string `json:"tag.HAState"`
//TotalSyncTimes float64 `json:"tag.tag.TotalSyncTimes"`
MissingBlocks float64 `stm:"missing_blocks"`
//MissingReplOneBlocks float64 `stm:"missing_repl_one_blocks"`
//ExpiredHeartbeats float64 `stm:"expired_heartbeats"`
//TransactionsSinceLastCheckpoint float64 `stm:"transactions_since_last_checkpoint"`
//TransactionsSinceLastLogRoll float64 `stm:"transactions_since_last_log_roll"`
//LastWrittenTransactionId float64 `stm:"last_written_transaction_id"`
//LastCheckpointTime float64 `stm:"last_checkpoint_time"`
CapacityTotal float64 `stm:"capacity_total"`
//CapacityTotalGB float64 `stm:"capacity_total_gb"`
// Decoded from the "CapacityUsed" key, exported as the DFS-used capacity.
CapacityDfsUsed float64 `json:"CapacityUsed" stm:"capacity_used_dfs"`
//CapacityUsedGB float64 `stm:"capacity_used_gb"`
CapacityRemaining float64 `stm:"capacity_remaining"`
//ProvidedCapacityTotal float64 `stm:"provided_capacity_total"`
//CapacityRemainingGB float64 `stm:"capacity_remaining_gb"`
CapacityUsedNonDFS float64 `stm:"capacity_used_non_dfs"`
TotalLoad float64 `stm:"total_load"`
//SnapshottableDirectories float64 `stm:"snapshottable_directories"`
//Snapshots float64 `stm:"snapshots"`
//NumEncryptionZones float64 `stm:"num_encryption_zones"`
//LockQueueLength float64 `stm:"lock_queue_length"`
BlocksTotal float64 `stm:"blocks_total"`
//NumFilesUnderConstruction float64 `stm:"num_files_under_construction"`
//NumActiveClients float64 `stm:"num_active_clients"`
FilesTotal float64 `stm:"files_total"`
//PendingReplicationBlocks float64 `stm:"pending_replication_blocks"`
//PendingReconstructionBlocks float64 `stm:"pending_reconstruction_blocks"`
UnderReplicatedBlocks float64 `stm:"under_replicated_blocks"`
//LowRedundancyBlocks float64 `stm:"low_redundancy_blocks"`
CorruptBlocks float64 `stm:"corrupt_blocks"`
//ScheduledReplicationBlocks float64 `stm:"scheduled_replication_blocks"`
//PendingDeletionBlocks float64 `stm:"pending_deletion_blocks"`
//LowRedundancyReplicatedBlocks float64 `stm:"low_redundancy_replicated_blocks"`
//CorruptReplicatedBlocks float64 `stm:"corrupt_replicated_blocks"`
//MissingReplicatedBlocks float64 `stm:"missing_replicated_blocks"`
//MissingReplicationOneBlocks float64 `stm:"missing_replication_one_blocks"`
//HighestPriorityLowRedundancyReplicatedBlocks float64 `stm:"highest_priority_low_redundancy_replicated_blocks"`
//HighestPriorityLowRedundancyECBlocks float64 `stm:"highest_priority_low_redundancy_ec_blocks"`
//BytesInFutureReplicatedBlocks float64 `stm:"bytes_in_future_replicated_blocks"`
//PendingDeletionReplicatedBlocks float64 `stm:"pending_deletion_replicated_blocks"`
//TotalReplicatedBlocks float64 `stm:"total_replicated_blocks"`
//LowRedundancyECBlockGroups float64 `stm:"low_redundancy_ec_block_groups"`
//CorruptECBlockGroups float64 `stm:"corrupt_ec_block_groups"`
//MissingECBlockGroups float64 `stm:"missing_ec_block_groups"`
//BytesInFutureECBlockGroups float64 `stm:"bytes_in_future_ec_block_groups"`
//PendingDeletionECBlocks float64 `stm:"pending_deletion_ec_blocks"`
//TotalECBlockGroups float64 `stm:"total_ec_block_groups"`
//ExcessBlocks float64 `stm:"excess_blocks"`
//NumTimedOutPendingReconstructions float64 `stm:"num_timed_out_pending_reconstructions"`
//PostponedMisreplicatedBlocks float64 `stm:"postponed_misreplicated_blocks"`
//PendingDataNodeMessageCount float64 `stm:"pending_data_node_message_count"`
//MillisSinceLastLoadedEdits float64 `stm:"millis_since_last_loaded_edits"`
//BlockCapacity float64 `stm:"block_capacity"`
NumLiveDataNodes float64 `stm:"num_live_data_nodes"`
NumDeadDataNodes float64 `stm:"num_dead_data_nodes"`
//NumDecomLiveDataNodes float64 `stm:"num_decom_live_data_nodes"`
//NumDecomDeadDataNodes float64 `stm:"num_decom_dead_data_nodes"`
VolumeFailuresTotal float64 `stm:"volume_failures_total"`
//EstimatedCapacityLostTotal float64 `stm:"estimated_capacity_lost_total"`
//NumDecommissioningDataNodes float64 `stm:"num_decommissioning_data_nodes"`
StaleDataNodes float64 `stm:"stale_data_nodes"`
//NumStaleStorages float64 `stm:"num_stale_storages"`
//TotalSyncCount float64 `stm:"total_sync_count"`
//NumInMaintenanceLiveDataNodes float64 `stm:"num_in_maintenance_live_data_nodes"`
//NumInMaintenanceDeadDataNodes float64 `stm:"num_in_maintenance_dead_data_nodes"`
//NumEnteringMaintenanceDataNodes float64 `stm:"num_entering_maintenance_data_nodes"`
// custom attributes
// Excluded from JSON via `json:"-"`; exported under "capacity_used".
CapacityUsed float64 `json:"-" stm:"capacity_used"`
}
// fsDatasetStateMetrics holds the storage values of the "fsds" family, which
// the metrics struct marks as datanode-only: total, DFS-used and remaining
// capacity, and the number of failed volumes, plus the host name tag.
//
// The `stm` names of Capacity, DfsUsed and Remaining ("capacity_total",
// "capacity_used_dfs", "capacity_remaining") match the names used for the
// corresponding fields of fsNameSystemMetrics.
//
// CapacityUsedNonDFS and CapacityUsed sit under the "custom attributes"
// comment.
// NOTE(review): they are presumably computed by the collector rather than
// decoded — confirm against the collection code.
// NOTE(review): unlike fsNameSystemMetrics.CapacityUsed, these two fields are
// not tagged `json:"-"`, so a JSON key matching their names would populate
// them — confirm whether that is intended.
//
// Commented-out fields are not part of the struct; they appear to record
// further attributes that are available but currently unused.
type fsDatasetStateMetrics struct {
HostName string `json:"tag.Hostname"`
Capacity float64 `stm:"capacity_total"`
DfsUsed float64 `stm:"capacity_used_dfs"`
Remaining float64 `stm:"capacity_remaining"`
NumFailedVolumes float64 `stm:"num_failed_volumes"`
//LastVolumeFailureDate float64 `stm:"LastVolumeFailureDate"`
//EstimatedCapacityLostTotal float64 `stm:"EstimatedCapacityLostTotal"`
//CacheUsed float64 `stm:"CacheUsed"`
//CacheCapacity float64 `stm:"CacheCapacity"`
//NumBlocksCached float64 `stm:"NumBlocksCached"`
//NumBlocksFailedToCache float64 `stm:"NumBlocksFailedToCache"`
//NumBlocksFailedToUnCache float64 `stm:"NumBlocksFailedToUnCache"`
// custom attributes
CapacityUsedNonDFS float64 `stm:"capacity_used_non_dfs"`
CapacityUsed float64 `stm:"capacity_used"`
}
// dataNodeActivityMetrics holds the I/O activity values of the "dna" family,
// which the metrics struct marks as datanode-only. Only three fields are
// active: the host name tag and the bytes written / bytes read values,
// exported as "bytes_written" and "bytes_read".
//
// NOTE(review): BytesWritten and BytesRead have no `json` tag and presumably
// rely on the JSON decoder matching keys against the Go field name — confirm
// against the decoding code.
//
// Every other line is a commented-out field and is not part of the struct;
// they appear to record further attributes that are available but currently
// unused. None of them has an `stm` tag.
type dataNodeActivityMetrics struct {
HostName string `json:"tag.Hostname"`
BytesWritten float64 `stm:"bytes_written"`
//TotalWriteTime float64
BytesRead float64 `stm:"bytes_read"`
//TotalReadTime float64
//BlocksWritten float64
//BlocksRead float64
//BlocksReplicated float64
//BlocksRemoved float64
//BlocksVerified float64
//BlockVerificationFailures float64
//BlocksCached float64
//BlocksUncached float64
//ReadsFromLocalClient float64
//ReadsFromRemoteClient float64
//WritesFromLocalClient float64
//WritesFromRemoteClient float64
//BlocksGetLocalPathInfo float64
//RemoteBytesRead float64
//RemoteBytesWritten float64
//RamDiskBlocksWrite float64
//RamDiskBlocksWriteFallback float64
//RamDiskBytesWrite float64
//RamDiskBlocksReadHits float64
//RamDiskBlocksEvicted float64
//RamDiskBlocksEvictedWithoutRead float64
//RamDiskBlocksEvictionWindowMsNumOps float64
//RamDiskBlocksEvictionWindowMsAvgTime float64
//RamDiskBlocksLazyPersisted float64
//RamDiskBlocksDeletedBeforeLazyPersisted float64
//RamDiskBytesLazyPersisted float64
//RamDiskBlocksLazyPersistWindowMsNumOps float64
//RamDiskBlocksLazyPersistWindowMsAvgTime float64
//FsyncCount float64
//VolumeFailures float64
//DatanodeNetworkErrors float64
//DataNodeActiveXceiversCount float64
//ReadBlockOpNumOps float64
//ReadBlockOpAvgTime float64
//WriteBlockOpNumOps float64
//WriteBlockOpAvgTime float64
//BlockChecksumOpNumOps float64
//BlockChecksumOpAvgTime float64
//CopyBlockOpNumOps float64
//CopyBlockOpAvgTime float64
//ReplaceBlockOpNumOps float64
//ReplaceBlockOpAvgTime float64
//HeartbeatsNumOps float64
//HeartbeatsAvgTime float64
//HeartbeatsTotalNumOps float64
//HeartbeatsTotalAvgTime float64
//LifelinesNumOps float64
//LifelinesAvgTime float64
//BlockReportsNumOps float64
//BlockReportsAvgTime float64
//IncrementalBlockReportsNumOps float64
//IncrementalBlockReportsAvgTime float64
//CacheReportsNumOps float64
//CacheReportsAvgTime float64
//PacketAckRoundTripTimeNanosNumOps float64
//PacketAckRoundTripTimeNanosAvgTime float64
//FlushNanosNumOps float64
//FlushNanosAvgTime float64
//FsyncNanosNumOps float64
//FsyncNanosAvgTime float64
//SendDataPacketBlockedOnNetworkNanosNumOps float64
//SendDataPacketBlockedOnNetworkNanosAvgTime float64
//SendDataPacketTransferNanosNumOps float64
//SendDataPacketTransferNanosAvgTime float64
//BlocksInPendingIBR float64
//BlocksReceivingInPendingIBR float64
//BlocksReceivedInPendingIBR float64
//BlocksDeletedInPendingIBR float64
//EcReconstructionTasks float64
//EcFailedReconstructionTasks float64
//EcDecodingTimeNanos float64
//EcReconstructionBytesRead float64
//EcReconstructionBytesWritten float64
//EcReconstructionRemoteBytesRead float64
//EcReconstructionReadTimeMillis float64
//EcReconstructionDecodingTimeMillis float64
//EcReconstructionWriteTimeMillis float64
}
|