diff options
Diffstat (limited to '')
-rw-r--r-- | src/go/collectors/go.d.plugin/modules/hdfs/metrics.go | 245 |
1 files changed, 245 insertions, 0 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/hdfs/metrics.go b/src/go/collectors/go.d.plugin/modules/hdfs/metrics.go new file mode 100644 index 000000000..972436a5d --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/hdfs/metrics.go @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package hdfs + +// HDFS Architecture +// https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html#NameNode+and+DataNodes + +// Metrics description +// https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Metrics.html + +// Good article +// https://www.datadoghq.com/blog/monitor-hadoop-metrics/#hdfs-metrics + +type metrics struct { + Jvm *jvmMetrics `stm:"jvm"` // both + Rpc *rpcActivityMetrics `stm:"rpc"` // both + FSNameSystem *fsNameSystemMetrics `stm:"fsns"` // namenode + FSDatasetState *fsDatasetStateMetrics `stm:"fsds"` // datanode + DataNodeActivity *dataNodeActivityMetrics `stm:"dna"` // datanode +} + +type jvmMetrics struct { + ProcessName string `json:"tag.ProcessName"` + HostName string `json:"tag.Hostname"` + //MemNonHeapUsedM float64 `stm:"mem_non_heap_used,1000,1"` + //MemNonHeapCommittedM float64 `stm:"mem_non_heap_committed,1000,1"` + //MemNonHeapMaxM float64 `stm:"mem_non_heap_max"` + MemHeapUsedM float64 `stm:"mem_heap_used,1000,1"` + MemHeapCommittedM float64 `stm:"mem_heap_committed,1000,1"` + MemHeapMaxM float64 `stm:"mem_heap_max"` + //MemMaxM float64 `stm:"mem_max"` + GcCount float64 `stm:"gc_count"` + GcTimeMillis float64 `stm:"gc_time_millis"` + GcNumWarnThresholdExceeded float64 `stm:"gc_num_warn_threshold_exceeded"` + GcNumInfoThresholdExceeded float64 `stm:"gc_num_info_threshold_exceeded"` + GcTotalExtraSleepTime float64 `stm:"gc_total_extra_sleep_time"` + ThreadsNew float64 `stm:"threads_new"` + ThreadsRunnable float64 `stm:"threads_runnable"` + ThreadsBlocked float64 `stm:"threads_blocked"` + ThreadsWaiting float64 `stm:"threads_waiting"` + ThreadsTimedWaiting float64 `stm:"threads_timed_waiting"` + ThreadsTerminated float64 `stm:"threads_terminated"` + LogFatal float64 `stm:"log_fatal"` + LogError float64 `stm:"log_error"` + LogWarn float64 `stm:"log_warn"` + LogInfo float64 `stm:"log_info"` +} + +type rpcActivityMetrics struct { + ReceivedBytes float64 `stm:"received_bytes"` + SentBytes float64 `stm:"sent_bytes"` + RpcQueueTimeNumOps float64 `stm:"queue_time_num_ops"` + RpcQueueTimeAvgTime float64 `stm:"queue_time_avg_time,1000,1"` + //RpcProcessingTimeNumOps float64 + RpcProcessingTimeAvgTime float64 `stm:"processing_time_avg_time,1000,1"` + //DeferredRpcProcessingTimeNumOps float64 + //DeferredRpcProcessingTimeAvgTime float64 + //RpcAuthenticationFailures float64 + //RpcAuthenticationSuccesses float64 + //RpcAuthorizationFailures float64 + //RpcAuthorizationSuccesses float64 + //RpcClientBackoff float64 + //RpcSlowCalls float64 + NumOpenConnections float64 `stm:"num_open_connections"` + CallQueueLength float64 `stm:"call_queue_length"` + //NumDroppedConnections float64 +} + +type fsNameSystemMetrics struct { + HostName string `json:"tag.Hostname"` + HAState string `json:"tag.HAState"` + //TotalSyncTimes float64 `json:"tag.tag.TotalSyncTimes"` + MissingBlocks float64 `stm:"missing_blocks"` + //MissingReplOneBlocks float64 `stm:"missing_repl_one_blocks"` + //ExpiredHeartbeats float64 `stm:"expired_heartbeats"` + //TransactionsSinceLastCheckpoint float64 `stm:"transactions_since_last_checkpoint"` + //TransactionsSinceLastLogRoll float64 `stm:"transactions_since_last_log_roll"` + //LastWrittenTransactionId float64 `stm:"last_written_transaction_id"` + //LastCheckpointTime float64 `stm:"last_checkpoint_time"` + CapacityTotal float64 `stm:"capacity_total"` + //CapacityTotalGB float64 `stm:"capacity_total_gb"` + CapacityDfsUsed float64 `json:"CapacityUsed" stm:"capacity_used_dfs"` + //CapacityUsedGB float64 `stm:"capacity_used_gb"` + CapacityRemaining float64 `stm:"capacity_remaining"` + //ProvidedCapacityTotal float64 `stm:"provided_capacity_total"` + //CapacityRemainingGB float64 `stm:"capacity_remaining_gb"` + CapacityUsedNonDFS float64 `stm:"capacity_used_non_dfs"` + TotalLoad float64 `stm:"total_load"` + //SnapshottableDirectories float64 `stm:"snapshottable_directories"` + //Snapshots float64 `stm:"snapshots"` + //NumEncryptionZones float64 `stm:"num_encryption_zones"` + //LockQueueLength float64 `stm:"lock_queue_length"` + BlocksTotal float64 `stm:"blocks_total"` + //NumFilesUnderConstruction float64 `stm:"num_files_under_construction"` + //NumActiveClients float64 `stm:"num_active_clients"` + FilesTotal float64 `stm:"files_total"` + //PendingReplicationBlocks float64 `stm:"pending_replication_blocks"` + //PendingReconstructionBlocks float64 `stm:"pending_reconstruction_blocks"` + UnderReplicatedBlocks float64 `stm:"under_replicated_blocks"` + //LowRedundancyBlocks float64 `stm:"low_redundancy_blocks"` + CorruptBlocks float64 `stm:"corrupt_blocks"` + //ScheduledReplicationBlocks float64 `stm:"scheduled_replication_blocks"` + //PendingDeletionBlocks float64 `stm:"pending_deletion_blocks"` + //LowRedundancyReplicatedBlocks float64 `stm:"low_redundancy_replicated_blocks"` + //CorruptReplicatedBlocks float64 `stm:"corrupt_replicated_blocks"` + //MissingReplicatedBlocks float64 `stm:"missing_replicated_blocks"` + //MissingReplicationOneBlocks float64 `stm:"missing_replication_one_blocks"` + //HighestPriorityLowRedundancyReplicatedBlocks float64 `stm:"highest_priority_low_redundancy_replicated_blocks"` + //HighestPriorityLowRedundancyECBlocks float64 `stm:"highest_priority_low_redundancy_ec_blocks"` + //BytesInFutureReplicatedBlocks float64 `stm:"bytes_in_future_replicated_blocks"` + //PendingDeletionReplicatedBlocks float64 `stm:"pending_deletion_replicated_blocks"` + //TotalReplicatedBlocks float64 `stm:"total_replicated_blocks"` + //LowRedundancyECBlockGroups float64 `stm:"low_redundancy_ec_block_groups"` + //CorruptECBlockGroups float64 `stm:"corrupt_ec_block_groups"` + //MissingECBlockGroups float64 `stm:"missing_ec_block_groups"` + //BytesInFutureECBlockGroups float64 `stm:"bytes_in_future_ec_block_groups"` + //PendingDeletionECBlocks float64 `stm:"pending_deletion_ec_blocks"` + //TotalECBlockGroups float64 `stm:"total_ec_block_groups"` + //ExcessBlocks float64 `stm:"excess_blocks"` + //NumTimedOutPendingReconstructions float64 `stm:"num_timed_out_pending_reconstructions"` + //PostponedMisreplicatedBlocks float64 `stm:"postponed_misreplicated_blocks"` + //PendingDataNodeMessageCount float64 `stm:"pending_data_node_message_count"` + //MillisSinceLastLoadedEdits float64 `stm:"millis_since_last_loaded_edits"` + //BlockCapacity float64 `stm:"block_capacity"` + NumLiveDataNodes float64 `stm:"num_live_data_nodes"` + NumDeadDataNodes float64 `stm:"num_dead_data_nodes"` + //NumDecomLiveDataNodes float64 `stm:"num_decom_live_data_nodes"` + //NumDecomDeadDataNodes float64 `stm:"num_decom_dead_data_nodes"` + VolumeFailuresTotal float64 `stm:"volume_failures_total"` + //EstimatedCapacityLostTotal float64 `stm:"estimated_capacity_lost_total"` + //NumDecommissioningDataNodes float64 `stm:"num_decommissioning_data_nodes"` + StaleDataNodes float64 `stm:"stale_data_nodes"` + //NumStaleStorages float64 `stm:"num_stale_storages"` + //TotalSyncCount float64 `stm:"total_sync_count"` + //NumInMaintenanceLiveDataNodes float64 `stm:"num_in_maintenance_live_data_nodes"` + //NumInMaintenanceDeadDataNodes float64 `stm:"num_in_maintenance_dead_data_nodes"` + //NumEnteringMaintenanceDataNodes float64 `stm:"num_entering_maintenance_data_nodes"` + + // custom attributes + CapacityUsed float64 `json:"-" stm:"capacity_used"` +} + +type fsDatasetStateMetrics struct { + HostName string `json:"tag.Hostname"` + Capacity float64 `stm:"capacity_total"` + DfsUsed float64 `stm:"capacity_used_dfs"` + Remaining float64 `stm:"capacity_remaining"` + NumFailedVolumes float64 `stm:"num_failed_volumes"` + //LastVolumeFailureDate float64 `stm:"LastVolumeFailureDate"` + //EstimatedCapacityLostTotal float64 `stm:"EstimatedCapacityLostTotal"` + //CacheUsed float64 `stm:"CacheUsed"` + //CacheCapacity float64 `stm:"CacheCapacity"` + //NumBlocksCached float64 `stm:"NumBlocksCached"` + //NumBlocksFailedToCache float64 `stm:"NumBlocksFailedToCache"` + //NumBlocksFailedToUnCache float64 `stm:"NumBlocksFailedToUnCache"` + + // custom attributes + CapacityUsedNonDFS float64 `stm:"capacity_used_non_dfs"` + CapacityUsed float64 `stm:"capacity_used"` +} + +type dataNodeActivityMetrics struct { + HostName string `json:"tag.Hostname"` + BytesWritten float64 `stm:"bytes_written"` + //TotalWriteTime float64 + BytesRead float64 `stm:"bytes_read"` + //TotalReadTime float64 + //BlocksWritten float64 + //BlocksRead float64 + //BlocksReplicated float64 + //BlocksRemoved float64 + //BlocksVerified float64 + //BlockVerificationFailures float64 + //BlocksCached float64 + //BlocksUncached float64 + //ReadsFromLocalClient float64 + //ReadsFromRemoteClient float64 + //WritesFromLocalClient float64 + //WritesFromRemoteClient float64 + //BlocksGetLocalPathInfo float64 + //RemoteBytesRead float64 + //RemoteBytesWritten float64 + //RamDiskBlocksWrite float64 + //RamDiskBlocksWriteFallback float64 + //RamDiskBytesWrite float64 + //RamDiskBlocksReadHits float64 + //RamDiskBlocksEvicted float64 + //RamDiskBlocksEvictedWithoutRead float64 + //RamDiskBlocksEvictionWindowMsNumOps float64 + //RamDiskBlocksEvictionWindowMsAvgTime float64 + //RamDiskBlocksLazyPersisted float64 + //RamDiskBlocksDeletedBeforeLazyPersisted float64 + //RamDiskBytesLazyPersisted float64 + //RamDiskBlocksLazyPersistWindowMsNumOps float64 + //RamDiskBlocksLazyPersistWindowMsAvgTime float64 + //FsyncCount float64 + //VolumeFailures float64 + //DatanodeNetworkErrors float64 + //DataNodeActiveXceiversCount float64 + //ReadBlockOpNumOps float64 + //ReadBlockOpAvgTime float64 + //WriteBlockOpNumOps float64 + //WriteBlockOpAvgTime float64 + //BlockChecksumOpNumOps float64 + //BlockChecksumOpAvgTime float64 + //CopyBlockOpNumOps float64 + //CopyBlockOpAvgTime float64 + //ReplaceBlockOpNumOps float64 + //ReplaceBlockOpAvgTime float64 + //HeartbeatsNumOps float64 + //HeartbeatsAvgTime float64 + //HeartbeatsTotalNumOps float64 + //HeartbeatsTotalAvgTime float64 + //LifelinesNumOps float64 + //LifelinesAvgTime float64 + //BlockReportsNumOps float64 + //BlockReportsAvgTime float64 + //IncrementalBlockReportsNumOps float64 + //IncrementalBlockReportsAvgTime float64 + //CacheReportsNumOps float64 + //CacheReportsAvgTime float64 + //PacketAckRoundTripTimeNanosNumOps float64 + //PacketAckRoundTripTimeNanosAvgTime float64 + //FlushNanosNumOps float64 + //FlushNanosAvgTime float64 + //FsyncNanosNumOps float64 + //FsyncNanosAvgTime float64 + //SendDataPacketBlockedOnNetworkNanosNumOps float64 + //SendDataPacketBlockedOnNetworkNanosAvgTime float64 + //SendDataPacketTransferNanosNumOps float64 + //SendDataPacketTransferNanosAvgTime float64 + //BlocksInPendingIBR float64 + //BlocksReceivingInPendingIBR float64 + //BlocksReceivedInPendingIBR float64 + //BlocksDeletedInPendingIBR float64 + //EcReconstructionTasks float64 + //EcFailedReconstructionTasks float64 + //EcDecodingTimeNanos float64 + //EcReconstructionBytesRead float64 + //EcReconstructionBytesWritten float64 + //EcReconstructionRemoteBytesRead float64 + //EcReconstructionReadTimeMillis float64 + //EcReconstructionDecodingTimeMillis float64 + //EcReconstructionWriteTimeMillis float64 +} |