summaryrefslogtreecommitdiffstats
path: root/src/go/collectors/go.d.plugin/modules/hdfs/metrics.go
blob: 972436a5d260c42f445700ddeeaeb2fc9e6f88a3 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
// SPDX-License-Identifier: GPL-3.0-or-later

package hdfs

// HDFS Architecture
// https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html#NameNode+and+DataNodes

// Metrics description
// https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Metrics.html

// Good article
// https://www.datadoghq.com/blog/monitor-hadoop-metrics/#hdfs-metrics

// metrics bundles the metric families declared in this file, one pointer
// field per family, each labelled with its own stm tag.
type metrics struct {
	// Families that apply to both NameNodes and DataNodes.
	Jvm *jvmMetrics         `stm:"jvm"`
	Rpc *rpcActivityMetrics `stm:"rpc"`

	// NameNode-only family.
	FSNameSystem *fsNameSystemMetrics `stm:"fsns"`

	// DataNode-only families.
	FSDatasetState   *fsDatasetStateMetrics   `stm:"fsds"`
	DataNodeActivity *dataNodeActivityMetrics `stm:"dna"`
}

// jvmMetrics describes the JVM section of a node's metrics. String fields are
// bound to the JSON keys named in their json tags; float64 fields carry an
// stm tag.
type jvmMetrics struct {
	// Identification tags (JSON only, no stm tag).
	ProcessName string `json:"tag.ProcessName"`
	HostName    string `json:"tag.Hostname"`

	// Heap memory.
	// NOTE(review): the ",1000,1" tag suffix is presumably a multiplier and
	// divisor interpreted by the stm encoder; confirm against that package.
	MemHeapUsedM      float64 `stm:"mem_heap_used,1000,1"`
	MemHeapCommittedM float64 `stm:"mem_heap_committed,1000,1"`
	MemHeapMaxM       float64 `stm:"mem_heap_max"`

	// Garbage-collection fields.
	GcCount                    float64 `stm:"gc_count"`
	GcTimeMillis               float64 `stm:"gc_time_millis"`
	GcNumWarnThresholdExceeded float64 `stm:"gc_num_warn_threshold_exceeded"`
	GcNumInfoThresholdExceeded float64 `stm:"gc_num_info_threshold_exceeded"`
	GcTotalExtraSleepTime      float64 `stm:"gc_total_extra_sleep_time"`

	// Thread-state fields.
	ThreadsNew          float64 `stm:"threads_new"`
	ThreadsRunnable     float64 `stm:"threads_runnable"`
	ThreadsBlocked      float64 `stm:"threads_blocked"`
	ThreadsWaiting      float64 `stm:"threads_waiting"`
	ThreadsTimedWaiting float64 `stm:"threads_timed_waiting"`
	ThreadsTerminated   float64 `stm:"threads_terminated"`

	// Log-level fields.
	LogFatal float64 `stm:"log_fatal"`
	LogError float64 `stm:"log_error"`
	LogWarn  float64 `stm:"log_warn"`
	LogInfo  float64 `stm:"log_info"`

	// Disabled declarations, kept for reference:
	//MemNonHeapUsedM      float64 `stm:"mem_non_heap_used,1000,1"`
	//MemNonHeapCommittedM float64 `stm:"mem_non_heap_committed,1000,1"`
	//MemNonHeapMaxM       float64 `stm:"mem_non_heap_max"`
	//MemMaxM              float64 `stm:"mem_max"`
}

// rpcActivityMetrics describes the RPC section of a node's metrics. Every
// active field is a float64 carrying an stm tag.
type rpcActivityMetrics struct {
	// Traffic volume.
	ReceivedBytes float64 `stm:"received_bytes"`
	SentBytes     float64 `stm:"sent_bytes"`

	// Queue and processing timings.
	// NOTE(review): the ",1000,1" tag suffix is presumably a multiplier and
	// divisor interpreted by the stm encoder; confirm against that package.
	RpcQueueTimeNumOps       float64 `stm:"queue_time_num_ops"`
	RpcQueueTimeAvgTime      float64 `stm:"queue_time_avg_time,1000,1"`
	RpcProcessingTimeAvgTime float64 `stm:"processing_time_avg_time,1000,1"`

	// Connection and call-queue fields.
	NumOpenConnections float64 `stm:"num_open_connections"`
	CallQueueLength    float64 `stm:"call_queue_length"`

	// Disabled declarations, kept for reference (none carries a tag):
	//RpcProcessingTimeNumOps          float64
	//DeferredRpcProcessingTimeNumOps  float64
	//DeferredRpcProcessingTimeAvgTime float64
	//RpcAuthenticationFailures        float64
	//RpcAuthenticationSuccesses       float64
	//RpcAuthorizationFailures         float64
	//RpcAuthorizationSuccesses        float64
	//RpcClientBackoff                 float64
	//RpcSlowCalls                     float64
	//NumDroppedConnections            float64
}

// fsNameSystemMetrics describes the NameNode filesystem section of the
// metrics. String fields are bound to the JSON keys named in their json tags;
// float64 fields carry an stm tag.
type fsNameSystemMetrics struct {
	// Identification tags (JSON only, no stm tag).
	HostName string `json:"tag.Hostname"`
	HAState  string `json:"tag.HAState"`

	MissingBlocks float64 `stm:"missing_blocks"`

	// Capacity. The JSON key "CapacityUsed" is bound to CapacityDfsUsed.
	CapacityTotal      float64 `stm:"capacity_total"`
	CapacityDfsUsed    float64 `json:"CapacityUsed" stm:"capacity_used_dfs"`
	CapacityRemaining  float64 `stm:"capacity_remaining"`
	CapacityUsedNonDFS float64 `stm:"capacity_used_non_dfs"`

	TotalLoad float64 `stm:"total_load"`

	// Block and file fields.
	BlocksTotal           float64 `stm:"blocks_total"`
	FilesTotal            float64 `stm:"files_total"`
	UnderReplicatedBlocks float64 `stm:"under_replicated_blocks"`
	CorruptBlocks         float64 `stm:"corrupt_blocks"`

	// DataNode and volume fields.
	NumLiveDataNodes    float64 `stm:"num_live_data_nodes"`
	NumDeadDataNodes    float64 `stm:"num_dead_data_nodes"`
	VolumeFailuresTotal float64 `stm:"volume_failures_total"`
	StaleDataNodes      float64 `stm:"stale_data_nodes"`

	// custom attributes: excluded from JSON via json:"-".
	CapacityUsed float64 `json:"-" stm:"capacity_used"`

	// Disabled declarations, kept for reference:
	//TotalSyncTimes                               float64 `json:"tag.tag.TotalSyncTimes"`
	//MissingReplOneBlocks                         float64 `stm:"missing_repl_one_blocks"`
	//ExpiredHeartbeats                            float64 `stm:"expired_heartbeats"`
	//TransactionsSinceLastCheckpoint              float64 `stm:"transactions_since_last_checkpoint"`
	//TransactionsSinceLastLogRoll                 float64 `stm:"transactions_since_last_log_roll"`
	//LastWrittenTransactionId                     float64 `stm:"last_written_transaction_id"`
	//LastCheckpointTime                           float64 `stm:"last_checkpoint_time"`
	//CapacityTotalGB                              float64 `stm:"capacity_total_gb"`
	//CapacityUsedGB                               float64 `stm:"capacity_used_gb"`
	//ProvidedCapacityTotal                        float64 `stm:"provided_capacity_total"`
	//CapacityRemainingGB                          float64 `stm:"capacity_remaining_gb"`
	//SnapshottableDirectories                     float64 `stm:"snapshottable_directories"`
	//Snapshots                                    float64 `stm:"snapshots"`
	//NumEncryptionZones                           float64 `stm:"num_encryption_zones"`
	//LockQueueLength                              float64 `stm:"lock_queue_length"`
	//NumFilesUnderConstruction                    float64 `stm:"num_files_under_construction"`
	//NumActiveClients                             float64 `stm:"num_active_clients"`
	//PendingReplicationBlocks                     float64 `stm:"pending_replication_blocks"`
	//PendingReconstructionBlocks                  float64 `stm:"pending_reconstruction_blocks"`
	//LowRedundancyBlocks                          float64 `stm:"low_redundancy_blocks"`
	//ScheduledReplicationBlocks                   float64 `stm:"scheduled_replication_blocks"`
	//PendingDeletionBlocks                        float64 `stm:"pending_deletion_blocks"`
	//LowRedundancyReplicatedBlocks                float64 `stm:"low_redundancy_replicated_blocks"`
	//CorruptReplicatedBlocks                      float64 `stm:"corrupt_replicated_blocks"`
	//MissingReplicatedBlocks                      float64 `stm:"missing_replicated_blocks"`
	//MissingReplicationOneBlocks                  float64 `stm:"missing_replication_one_blocks"`
	//HighestPriorityLowRedundancyReplicatedBlocks float64 `stm:"highest_priority_low_redundancy_replicated_blocks"`
	//HighestPriorityLowRedundancyECBlocks         float64 `stm:"highest_priority_low_redundancy_ec_blocks"`
	//BytesInFutureReplicatedBlocks                float64 `stm:"bytes_in_future_replicated_blocks"`
	//PendingDeletionReplicatedBlocks              float64 `stm:"pending_deletion_replicated_blocks"`
	//TotalReplicatedBlocks                        float64 `stm:"total_replicated_blocks"`
	//LowRedundancyECBlockGroups                   float64 `stm:"low_redundancy_ec_block_groups"`
	//CorruptECBlockGroups                         float64 `stm:"corrupt_ec_block_groups"`
	//MissingECBlockGroups                         float64 `stm:"missing_ec_block_groups"`
	//BytesInFutureECBlockGroups                   float64 `stm:"bytes_in_future_ec_block_groups"`
	//PendingDeletionECBlocks                      float64 `stm:"pending_deletion_ec_blocks"`
	//TotalECBlockGroups                           float64 `stm:"total_ec_block_groups"`
	//ExcessBlocks                                 float64 `stm:"excess_blocks"`
	//NumTimedOutPendingReconstructions            float64 `stm:"num_timed_out_pending_reconstructions"`
	//PostponedMisreplicatedBlocks                 float64 `stm:"postponed_misreplicated_blocks"`
	//PendingDataNodeMessageCount                  float64 `stm:"pending_data_node_message_count"`
	//MillisSinceLastLoadedEdits                   float64 `stm:"millis_since_last_loaded_edits"`
	//BlockCapacity                                float64 `stm:"block_capacity"`
	//NumDecomLiveDataNodes                        float64 `stm:"num_decom_live_data_nodes"`
	//NumDecomDeadDataNodes                        float64 `stm:"num_decom_dead_data_nodes"`
	//EstimatedCapacityLostTotal                   float64 `stm:"estimated_capacity_lost_total"`
	//NumDecommissioningDataNodes                  float64 `stm:"num_decommissioning_data_nodes"`
	//NumStaleStorages                             float64 `stm:"num_stale_storages"`
	//TotalSyncCount                               float64 `stm:"total_sync_count"`
	//NumInMaintenanceLiveDataNodes                float64 `stm:"num_in_maintenance_live_data_nodes"`
	//NumInMaintenanceDeadDataNodes                float64 `stm:"num_in_maintenance_dead_data_nodes"`
	//NumEnteringMaintenanceDataNodes              float64 `stm:"num_entering_maintenance_data_nodes"`
}

// fsDatasetStateMetrics describes the dataset-state section of a DataNode's
// metrics. HostName is bound to the JSON key "tag.Hostname"; the float64
// fields carry an stm tag.
//
// The fields under "custom attributes" are tagged json:"-" so that a
// same-named key in the decoded payload can never populate them, matching the
// custom attribute in fsNameSystemMetrics.
type fsDatasetStateMetrics struct {
	HostName         string  `json:"tag.Hostname"`
	Capacity         float64 `stm:"capacity_total"`
	DfsUsed          float64 `stm:"capacity_used_dfs"`
	Remaining        float64 `stm:"capacity_remaining"`
	NumFailedVolumes float64 `stm:"num_failed_volumes"`
	//LastVolumeFailureDate      float64 `stm:"LastVolumeFailureDate"`
	//EstimatedCapacityLostTotal float64 `stm:"EstimatedCapacityLostTotal"`
	//CacheUsed                  float64 `stm:"CacheUsed"`
	//CacheCapacity              float64 `stm:"CacheCapacity"`
	//NumBlocksCached            float64 `stm:"NumBlocksCached"`
	//NumBlocksFailedToCache     float64 `stm:"NumBlocksFailedToCache"`
	//NumBlocksFailedToUnCache   float64 `stm:"NumBlocksFailedToUnCache"`

	// custom attributes: excluded from JSON decoding and encoding.
	// NOTE(review): presumably computed by the collector after decoding; confirm.
	CapacityUsedNonDFS float64 `json:"-" stm:"capacity_used_non_dfs"`
	CapacityUsed       float64 `json:"-" stm:"capacity_used"`
}

// dataNodeActivityMetrics describes the DataNode activity section of the
// metrics. Only three fields are active: HostName, bound to the JSON key
// "tag.Hostname", and two float64 fields carrying stm tags.
type dataNodeActivityMetrics struct {
	HostName     string  `json:"tag.Hostname"`
	BytesWritten float64 `stm:"bytes_written"`
	BytesRead    float64 `stm:"bytes_read"`

	// Disabled declarations, kept for reference (none carries a tag):
	//TotalWriteTime                             float64
	//TotalReadTime                              float64
	//BlocksWritten                              float64
	//BlocksRead                                 float64
	//BlocksReplicated                           float64
	//BlocksRemoved                              float64
	//BlocksVerified                             float64
	//BlockVerificationFailures                  float64
	//BlocksCached                               float64
	//BlocksUncached                             float64
	//ReadsFromLocalClient                       float64
	//ReadsFromRemoteClient                      float64
	//WritesFromLocalClient                      float64
	//WritesFromRemoteClient                     float64
	//BlocksGetLocalPathInfo                     float64
	//RemoteBytesRead                            float64
	//RemoteBytesWritten                         float64
	//RamDiskBlocksWrite                         float64
	//RamDiskBlocksWriteFallback                 float64
	//RamDiskBytesWrite                          float64
	//RamDiskBlocksReadHits                      float64
	//RamDiskBlocksEvicted                       float64
	//RamDiskBlocksEvictedWithoutRead            float64
	//RamDiskBlocksEvictionWindowMsNumOps        float64
	//RamDiskBlocksEvictionWindowMsAvgTime       float64
	//RamDiskBlocksLazyPersisted                 float64
	//RamDiskBlocksDeletedBeforeLazyPersisted    float64
	//RamDiskBytesLazyPersisted                  float64
	//RamDiskBlocksLazyPersistWindowMsNumOps     float64
	//RamDiskBlocksLazyPersistWindowMsAvgTime    float64
	//FsyncCount                                 float64
	//VolumeFailures                             float64
	//DatanodeNetworkErrors                      float64
	//DataNodeActiveXceiversCount                float64
	//ReadBlockOpNumOps                          float64
	//ReadBlockOpAvgTime                         float64
	//WriteBlockOpNumOps                         float64
	//WriteBlockOpAvgTime                        float64
	//BlockChecksumOpNumOps                      float64
	//BlockChecksumOpAvgTime                     float64
	//CopyBlockOpNumOps                          float64
	//CopyBlockOpAvgTime                         float64
	//ReplaceBlockOpNumOps                       float64
	//ReplaceBlockOpAvgTime                      float64
	//HeartbeatsNumOps                           float64
	//HeartbeatsAvgTime                          float64
	//HeartbeatsTotalNumOps                      float64
	//HeartbeatsTotalAvgTime                     float64
	//LifelinesNumOps                            float64
	//LifelinesAvgTime                           float64
	//BlockReportsNumOps                         float64
	//BlockReportsAvgTime                        float64
	//IncrementalBlockReportsNumOps              float64
	//IncrementalBlockReportsAvgTime             float64
	//CacheReportsNumOps                         float64
	//CacheReportsAvgTime                        float64
	//PacketAckRoundTripTimeNanosNumOps          float64
	//PacketAckRoundTripTimeNanosAvgTime         float64
	//FlushNanosNumOps                           float64
	//FlushNanosAvgTime                          float64
	//FsyncNanosNumOps                           float64
	//FsyncNanosAvgTime                          float64
	//SendDataPacketBlockedOnNetworkNanosNumOps  float64
	//SendDataPacketBlockedOnNetworkNanosAvgTime float64
	//SendDataPacketTransferNanosNumOps          float64
	//SendDataPacketTransferNanosAvgTime         float64
	//BlocksInPendingIBR                         float64
	//BlocksReceivingInPendingIBR                float64
	//BlocksReceivedInPendingIBR                 float64
	//BlocksDeletedInPendingIBR                  float64
	//EcReconstructionTasks                      float64
	//EcFailedReconstructionTasks                float64
	//EcDecodingTimeNanos                        float64
	//EcReconstructionBytesRead                  float64
	//EcReconstructionBytesWritten               float64
	//EcReconstructionRemoteBytesRead            float64
	//EcReconstructionReadTimeMillis             float64
	//EcReconstructionDecodingTimeMillis         float64
	//EcReconstructionWriteTimeMillis            float64
}