Diffstat (limited to 'src/go/plugin/go.d/modules/hdfs')
17 files changed, 2596 insertions, 0 deletions
diff --git a/src/go/plugin/go.d/modules/hdfs/README.md b/src/go/plugin/go.d/modules/hdfs/README.md new file mode 120000 index 000000000..38f428a06 --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/README.md @@ -0,0 +1 @@ +integrations/hadoop_distributed_file_system_hdfs.md
\ No newline at end of file diff --git a/src/go/plugin/go.d/modules/hdfs/charts.go b/src/go/plugin/go.d/modules/hdfs/charts.go new file mode 100644 index 000000000..5b264c64c --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/charts.go @@ -0,0 +1,328 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package hdfs + +import "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module" + +type ( + Charts = module.Charts + Dims = module.Dims + Vars = module.Vars +) + +var jvmCharts = Charts{ + { + ID: "jvm_heap_memory", + Title: "Heap Memory", + Units: "MiB", + Fam: "jvm", + Ctx: "hdfs.heap_memory", + Type: module.Area, + Dims: Dims{ + {ID: "jvm_mem_heap_committed", Name: "committed", Div: 1000}, + {ID: "jvm_mem_heap_used", Name: "used", Div: 1000}, + }, + Vars: Vars{ + {ID: "jvm_mem_heap_max"}, + }, + }, + { + ID: "jvm_gc_count_total", + Title: "GC Events", + Units: "events/s", + Fam: "jvm", + Ctx: "hdfs.gc_count_total", + Dims: Dims{ + {ID: "jvm_gc_count", Name: "gc", Algo: module.Incremental}, + }, + }, + { + ID: "jvm_gc_time_total", + Title: "GC Time", + Units: "ms", + Fam: "jvm", + Ctx: "hdfs.gc_time_total", + Dims: Dims{ + {ID: "jvm_gc_time_millis", Name: "time", Algo: module.Incremental}, + }, + }, + { + ID: "jvm_gc_threshold", + Title: "Number of Times That the GC Threshold is Exceeded", + Units: "events/s", + Fam: "jvm", + Ctx: "hdfs.gc_threshold", + Dims: Dims{ + {ID: "jvm_gc_num_info_threshold_exceeded", Name: "info", Algo: module.Incremental}, + {ID: "jvm_gc_num_warn_threshold_exceeded", Name: "warn", Algo: module.Incremental}, + }, + }, + { + ID: "jvm_threads", + Title: "Number of Threads", + Units: "num", + Fam: "jvm", + Ctx: "hdfs.threads", + Type: module.Stacked, + Dims: Dims{ + {ID: "jvm_threads_new", Name: "new"}, + {ID: "jvm_threads_runnable", Name: "runnable"}, + {ID: "jvm_threads_blocked", Name: "blocked"}, + {ID: "jvm_threads_waiting", Name: "waiting"}, + {ID: "jvm_threads_timed_waiting", Name: "timed_waiting"}, + {ID: "jvm_threads_terminated", Name: "terminated"}, + }, + }, + { + ID: "jvm_logs_total", + Title: "Number of Logs", + Units: "logs/s", + Fam: "jvm", + Ctx: "hdfs.logs_total", + Type: module.Stacked, + Dims: Dims{ + {ID: "jvm_log_info", Name: "info", Algo: module.Incremental}, + {ID: "jvm_log_error", Name: "error", Algo: module.Incremental}, + {ID: "jvm_log_warn", Name: "warn", Algo: module.Incremental}, + {ID: "jvm_log_fatal", Name: "fatal", Algo: module.Incremental}, + }, + }, +} + +var rpcActivityCharts = Charts{ + { + ID: "rpc_bandwidth", + Title: "RPC Bandwidth", + Units: "kilobits/s", + Fam: "rpc", + Ctx: "hdfs.rpc_bandwidth", + Type: module.Area, + Dims: Dims{ + {ID: "rpc_received_bytes", Name: "received", Div: 1000, Algo: module.Incremental}, + {ID: "rpc_sent_bytes", Name: "sent", Div: -1000, Algo: module.Incremental}, + }, + }, + { + ID: "rpc_calls", + Title: "RPC Calls", + Units: "calls/s", + Fam: "rpc", + Ctx: "hdfs.rpc_calls", + Dims: Dims{ + {ID: "rpc_queue_time_num_ops", Name: "calls", Algo: module.Incremental}, + }, + }, + { + ID: "rpc_open_connections", + Title: "RPC Open Connections", + Units: "connections", + Fam: "rpc", + Ctx: "hdfs.open_connections", + Dims: Dims{ + {ID: "rpc_num_open_connections", Name: "open"}, + }, + }, + { + ID: "rpc_call_queue_length", + Title: "RPC Call Queue Length", + Units: "num", + Fam: "rpc", + Ctx: "hdfs.call_queue_length", + Dims: Dims{ + {ID: "rpc_call_queue_length", Name: "length"}, + }, + }, + { + ID: "rpc_avg_queue_time", + Title: "RPC Avg Queue Time", + Units: "ms", + Fam: "rpc", + Ctx: 
"hdfs.avg_queue_time", + Dims: Dims{ + {ID: "rpc_queue_time_avg_time", Name: "time", Div: 1000}, + }, + }, + { + ID: "rpc_avg_processing_time", + Title: "RPC Avg Processing Time", + Units: "ms", + Fam: "rpc", + Ctx: "hdfs.avg_processing_time", + Dims: Dims{ + {ID: "rpc_processing_time_avg_time", Name: "time", Div: 1000}, + }, + }, +} + +var fsNameSystemCharts = Charts{ + { + ID: "fs_name_system_capacity", + Title: "Capacity Across All Datanodes", + Units: "KiB", + Fam: "fs name system", + Ctx: "hdfs.capacity", + Type: module.Stacked, + Dims: Dims{ + {ID: "fsns_capacity_remaining", Name: "remaining", Div: 1024}, + {ID: "fsns_capacity_used", Name: "used", Div: 1024}, + }, + Vars: Vars{ + {ID: "fsns_capacity_total"}, + }, + }, + { + ID: "fs_name_system_used_capacity", + Title: "Used Capacity Across All Datanodes", + Units: "KiB", + Fam: "fs name system", + Ctx: "hdfs.used_capacity", + Type: module.Stacked, + Dims: Dims{ + {ID: "fsns_capacity_used_dfs", Name: "dfs", Div: 1024}, + {ID: "fsns_capacity_used_non_dfs", Name: "non_dfs", Div: 1024}, + }, + }, + { + ID: "fs_name_system_load", + Title: "Number of Concurrent File Accesses (read/write) Across All DataNodes", + Units: "load", + Fam: "fs name system", + Ctx: "hdfs.load", + Dims: Dims{ + {ID: "fsns_total_load", Name: "load"}, + }, + }, + { + ID: "fs_name_system_volume_failures_total", + Title: "Number of Volume Failures Across All Datanodes", + Units: "events/s", + Fam: "fs name system", + Ctx: "hdfs.volume_failures_total", + Dims: Dims{ + {ID: "fsns_volume_failures_total", Name: "failures", Algo: module.Incremental}, + }, + }, + { + ID: "fs_files_total", + Title: "Number of Tracked Files", + Units: "num", + Fam: "fs name system", + Ctx: "hdfs.files_total", + Dims: Dims{ + {ID: "fsns_files_total", Name: "files"}, + }, + }, + { + ID: "fs_blocks_total", + Title: "Number of Allocated Blocks in the System", + Units: "num", + Fam: "fs name system", + Ctx: "hdfs.blocks_total", + Dims: Dims{ + {ID: "fsns_blocks_total", Name: "blocks"}, + }, + }, + { + ID: "fs_problem_blocks", + Title: "Number of Problem Blocks (can point to an unhealthy cluster)", + Units: "num", + Fam: "fs name system", + Ctx: "hdfs.blocks", + Dims: Dims{ + {ID: "fsns_corrupt_blocks", Name: "corrupt"}, + {ID: "fsns_missing_blocks", Name: "missing"}, + {ID: "fsns_under_replicated_blocks", Name: "under_replicated"}, + }, + }, + { + ID: "fs_name_system_data_nodes", + Title: "Number of Data Nodes By Status", + Units: "num", + Fam: "fs name system", + Ctx: "hdfs.data_nodes", + Type: module.Stacked, + Dims: Dims{ + {ID: "fsns_num_live_data_nodes", Name: "live"}, + {ID: "fsns_num_dead_data_nodes", Name: "dead"}, + {ID: "fsns_stale_data_nodes", Name: "stale"}, + }, + }, +} + +var fsDatasetStateCharts = Charts{ + { + ID: "fs_dataset_state_capacity", + Title: "Capacity", + Units: "KiB", + Fam: "fs dataset", + Ctx: "hdfs.datanode_capacity", + Type: module.Stacked, + Dims: Dims{ + {ID: "fsds_capacity_remaining", Name: "remaining", Div: 1024}, + {ID: "fsds_capacity_used", Name: "used", Div: 1024}, + }, + Vars: Vars{ + {ID: "fsds_capacity_total"}, + }, + }, + { + ID: "fs_dataset_state_used_capacity", + Title: "Used Capacity", + Units: "KiB", + Fam: "fs dataset", + Ctx: "hdfs.datanode_used_capacity", + Type: module.Stacked, + Dims: Dims{ + {ID: "fsds_capacity_used_dfs", Name: "dfs", Div: 1024}, + {ID: "fsds_capacity_used_non_dfs", Name: "non_dfs", Div: 1024}, + }, + }, + { + ID: "fs_dataset_state_num_failed_volumes", + Title: "Number of Failed Volumes", + Units: "num", + Fam: "fs dataset", + 
Ctx: "hdfs.datanode_failed_volumes", + Dims: Dims{ + {ID: "fsds_num_failed_volumes", Name: "failed volumes"}, + }, + }, +} + +var fsDataNodeActivityCharts = Charts{ + { + ID: "dna_bandwidth", + Title: "Bandwidth", + Units: "KiB/s", + Fam: "activity", + Ctx: "hdfs.datanode_bandwidth", + Type: module.Area, + Dims: Dims{ + {ID: "dna_bytes_read", Name: "reads", Div: 1024, Algo: module.Incremental}, + {ID: "dna_bytes_written", Name: "writes", Div: -1024, Algo: module.Incremental}, + }, + }, +} + +func dataNodeCharts() *Charts { + charts := Charts{} + panicIfError(charts.Add(*jvmCharts.Copy()...)) + panicIfError(charts.Add(*rpcActivityCharts.Copy()...)) + panicIfError(charts.Add(*fsDatasetStateCharts.Copy()...)) + panicIfError(charts.Add(*fsDataNodeActivityCharts.Copy()...)) + return &charts +} + +func nameNodeCharts() *Charts { + charts := Charts{} + panicIfError(charts.Add(*jvmCharts.Copy()...)) + panicIfError(charts.Add(*rpcActivityCharts.Copy()...)) + panicIfError(charts.Add(*fsNameSystemCharts.Copy()...)) + return &charts +} + +func panicIfError(err error) { + if err != nil { + panic(err) + } +} diff --git a/src/go/plugin/go.d/modules/hdfs/client.go b/src/go/plugin/go.d/modules/hdfs/client.go new file mode 100644 index 000000000..3c43348be --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/client.go @@ -0,0 +1,69 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package hdfs + +import ( + "encoding/json" + "fmt" + "io" + "net/http" + + "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/web" +) + +func newClient(httpClient *http.Client, request web.Request) *client { + return &client{ + httpClient: httpClient, + request: request, + } +} + +type client struct { + httpClient *http.Client + request web.Request +} + +func (c *client) do() (*http.Response, error) { + req, err := web.NewHTTPRequest(c.request) + if err != nil { + return nil, fmt.Errorf("error on creating http request to %s : %v", c.request.URL, err) + } + + // req.Header.Add("Accept-Encoding", "gzip") + // req.Header.Set("User-Agent", "netdata/go.d.plugin") + + return c.httpClient.Do(req) +} + +func (c *client) doOK() (*http.Response, error) { + resp, err := c.do() + if err != nil { + return nil, err + } + + if resp.StatusCode != http.StatusOK { + return resp, fmt.Errorf("%s returned %d", c.request.URL, resp.StatusCode) + } + return resp, nil +} + +func (c *client) doOKWithDecodeJSON(dst interface{}) error { + resp, err := c.doOK() + defer closeBody(resp) + if err != nil { + return err + } + + err = json.NewDecoder(resp.Body).Decode(dst) + if err != nil { + return fmt.Errorf("error on decoding response from %s : %v", c.request.URL, err) + } + return nil +} + +func closeBody(resp *http.Response) { + if resp != nil && resp.Body != nil { + _, _ = io.Copy(io.Discard, resp.Body) + _ = resp.Body.Close() + } +} diff --git a/src/go/plugin/go.d/modules/hdfs/collect.go b/src/go/plugin/go.d/modules/hdfs/collect.go new file mode 100644 index 000000000..6ac022b87 --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/collect.go @@ -0,0 +1,201 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package hdfs + +import ( + "encoding/json" + "errors" + "fmt" + "strings" + + "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/stm" +) + +func (h *HDFS) collect() (map[string]int64, error) { + var raw rawJMX + err := h.client.doOKWithDecodeJSON(&raw) + if err != nil { + return nil, err + } + + if raw.isEmpty() { + return nil, errors.New("empty response") + } + + mx := h.collectRawJMX(raw) + + return stm.ToMap(mx), nil +} + +func (h *HDFS) 
determineNodeType() (nodeType, error) { + var raw rawJMX + err := h.client.doOKWithDecodeJSON(&raw) + if err != nil { + return "", err + } + + if raw.isEmpty() { + return "", errors.New("empty response") + } + + jvm := raw.findJvm() + if jvm == nil { + return "", errors.New("couldn't find jvm in response") + } + + v, ok := jvm["tag.ProcessName"] + if !ok { + return "", errors.New("couldn't find process name in JvmMetrics") + } + + t := nodeType(strings.Trim(string(v), "\"")) + if t == nameNodeType || t == dataNodeType { + return t, nil + } + return "", errors.New("unknown node type") +} + +func (h *HDFS) collectRawJMX(raw rawJMX) *metrics { + var mx metrics + switch h.nodeType { + default: + panic(fmt.Sprintf("unsupported node type : '%s'", h.nodeType)) + case nameNodeType: + h.collectNameNode(&mx, raw) + case dataNodeType: + h.collectDataNode(&mx, raw) + } + return &mx +} + +func (h *HDFS) collectNameNode(mx *metrics, raw rawJMX) { + err := h.collectJVM(mx, raw) + if err != nil { + h.Debugf("error on collecting jvm : %v", err) + } + + err = h.collectRPCActivity(mx, raw) + if err != nil { + h.Debugf("error on collecting rpc activity : %v", err) + } + + err = h.collectFSNameSystem(mx, raw) + if err != nil { + h.Debugf("error on collecting fs name system : %v", err) + } +} + +func (h *HDFS) collectDataNode(mx *metrics, raw rawJMX) { + err := h.collectJVM(mx, raw) + if err != nil { + h.Debugf("error on collecting jvm : %v", err) + } + + err = h.collectRPCActivity(mx, raw) + if err != nil { + h.Debugf("error on collecting rpc activity : %v", err) + } + + err = h.collectFSDatasetState(mx, raw) + if err != nil { + h.Debugf("error on collecting fs dataset state : %v", err) + } + + err = h.collectDataNodeActivity(mx, raw) + if err != nil { + h.Debugf("error on collecting datanode activity state : %v", err) + } +} + +func (h *HDFS) collectJVM(mx *metrics, raw rawJMX) error { + v := raw.findJvm() + if v == nil { + return nil + } + + var jvm jvmMetrics + err := writeJSONTo(&jvm, v) + if err != nil { + return err + } + + mx.Jvm = &jvm + return nil +} + +func (h *HDFS) collectRPCActivity(mx *metrics, raw rawJMX) error { + v := raw.findRPCActivity() + if v == nil { + return nil + } + + var rpc rpcActivityMetrics + err := writeJSONTo(&rpc, v) + if err != nil { + return err + } + + mx.Rpc = &rpc + return nil +} + +func (h *HDFS) collectFSNameSystem(mx *metrics, raw rawJMX) error { + v := raw.findFSNameSystem() + if v == nil { + return nil + } + + var fs fsNameSystemMetrics + err := writeJSONTo(&fs, v) + if err != nil { + return err + } + + fs.CapacityUsed = fs.CapacityDfsUsed + fs.CapacityUsedNonDFS + + mx.FSNameSystem = &fs + return nil +} + +func (h *HDFS) collectFSDatasetState(mx *metrics, raw rawJMX) error { + v := raw.findFSDatasetState() + if v == nil { + return nil + } + + var fs fsDatasetStateMetrics + err := writeJSONTo(&fs, v) + if err != nil { + return err + } + + fs.CapacityUsed = fs.Capacity - fs.Remaining + fs.CapacityUsedNonDFS = fs.CapacityUsed - fs.DfsUsed + + mx.FSDatasetState = &fs + return nil +} + +func (h *HDFS) collectDataNodeActivity(mx *metrics, raw rawJMX) error { + v := raw.findDataNodeActivity() + if v == nil { + return nil + } + + var dna dataNodeActivityMetrics + err := writeJSONTo(&dna, v) + if err != nil { + return err + } + + mx.DataNodeActivity = &dna + return nil +} + +func writeJSONTo(dst interface{}, src interface{}) error { + b, err := json.Marshal(src) + if err != nil { + return err + } + return json.Unmarshal(b, dst) +} diff --git 
a/src/go/plugin/go.d/modules/hdfs/config_schema.json b/src/go/plugin/go.d/modules/hdfs/config_schema.json new file mode 100644 index 000000000..528cc4dbf --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/config_schema.json @@ -0,0 +1,186 @@ +{ + "jsonSchema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "HDFS collector configuration.", + "type": "object", + "properties": { + "update_every": { + "title": "Update every", + "description": "Data collection interval, measured in seconds.", + "type": "integer", + "minimum": 1, + "default": 1 + }, + "url": { + "title": "URL", + "description": "The URL of the HDFS DataNode or NameNode JMX endpoint.", + "type": "string", + "default": "http://127.0.0.1:9870/jmx", + "format": "uri" + }, + "timeout": { + "title": "Timeout", + "description": "The timeout in seconds for the HTTP request.", + "type": "number", + "minimum": 0.5, + "default": 1 + }, + "not_follow_redirects": { + "title": "Not follow redirects", + "description": "If set, the client will not follow HTTP redirects automatically.", + "type": "boolean" + }, + "username": { + "title": "Username", + "description": "The username for basic authentication.", + "type": "string", + "sensitive": true + }, + "password": { + "title": "Password", + "description": "The password for basic authentication.", + "type": "string", + "sensitive": true + }, + "proxy_url": { + "title": "Proxy URL", + "description": "The URL of the proxy server.", + "type": "string" + }, + "proxy_username": { + "title": "Proxy username", + "description": "The username for proxy authentication.", + "type": "string", + "sensitive": true + }, + "proxy_password": { + "title": "Proxy password", + "description": "The password for proxy authentication.", + "type": "string", + "sensitive": true + }, + "headers": { + "title": "Headers", + "description": "Additional HTTP headers to include in the request.", + "type": [ + "object", + "null" + ], + "additionalProperties": { + "type": "string" + } + }, + "tls_skip_verify": { + "title": "Skip TLS verification", + "description": "If set, TLS certificate verification will be skipped.", + "type": "boolean" + }, + "tls_ca": { + "title": "TLS CA", + "description": "The path to the CA certificate file for TLS verification.", + "type": "string", + "pattern": "^$|^/" + }, + "tls_cert": { + "title": "TLS certificate", + "description": "The path to the client certificate file for TLS authentication.", + "type": "string", + "pattern": "^$|^/" + }, + "tls_key": { + "title": "TLS key", + "description": "The path to the client key file for TLS authentication.", + "type": "string", + "pattern": "^$|^/" + }, + "body": { + "title": "Body", + "type": "string" + }, + "method": { + "title": "Method", + "type": "string" + } + }, + "required": [ + "url" + ], + "additionalProperties": false, + "patternProperties": { + "^name$": {} + } + }, + "uiSchema": { + "ui:flavour": "tabs", + "ui:options": { + "tabs": [ + { + "title": "Base", + "fields": [ + "update_every", + "url", + "timeout", + "not_follow_redirects" + ] + }, + { + "title": "Auth", + "fields": [ + "username", + "password" + ] + }, + { + "title": "TLS", + "fields": [ + "tls_skip_verify", + "tls_ca", + "tls_cert", + "tls_key" + ] + }, + { + "title": "Proxy", + "fields": [ + "proxy_url", + "proxy_username", + "proxy_password" + ] + }, + { + "title": "Headers", + "fields": [ + "headers" + ] + } + ] + }, + "uiOptions": { + "fullPage": true + }, + "body": { + "ui:widget": "hidden" + }, + "method": { + "ui:widget": "hidden" + }, + "url": { + 
"ui:help": "By default, the DataNode's port is 9864, and the NameNode's port is 9870, as specified in the [HDFS configuration](https://hadoop.apache.org/docs/r3.1.3/hadoop-project-dist/hadoop-hdfs/hdfs-default.xml)." + }, + "timeout": { + "ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)." + }, + "username": { + "ui:widget": "password" + }, + "proxy_username": { + "ui:widget": "password" + }, + "password": { + "ui:widget": "password" + }, + "proxy_password": { + "ui:widget": "password" + } + } +} diff --git a/src/go/plugin/go.d/modules/hdfs/hdfs.go b/src/go/plugin/go.d/modules/hdfs/hdfs.go new file mode 100644 index 000000000..44b5840bb --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/hdfs.go @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package hdfs + +import ( + _ "embed" + "errors" + "time" + + "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module" + "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/web" +) + +//go:embed "config_schema.json" +var configSchema string + +func init() { + module.Register("hdfs", module.Creator{ + JobConfigSchema: configSchema, + Create: func() module.Module { return New() }, + Config: func() any { return &Config{} }, + }) +} + +func New() *HDFS { + config := Config{ + HTTP: web.HTTP{ + Request: web.Request{ + URL: "http://127.0.0.1:9870/jmx", + }, + Client: web.Client{ + Timeout: web.Duration(time.Second), + }, + }, + } + + return &HDFS{ + Config: config, + } +} + +type Config struct { + web.HTTP `yaml:",inline" json:""` + UpdateEvery int `yaml:"update_every" json:"update_every"` +} + +type ( + HDFS struct { + module.Base + Config `yaml:",inline" json:""` + + client *client + + nodeType + } + nodeType string +) + +const ( + dataNodeType nodeType = "DataNode" + nameNodeType nodeType = "NameNode" +) + +func (h *HDFS) Configuration() any { + return h.Config +} + +func (h *HDFS) Init() error { + if err := h.validateConfig(); err != nil { + h.Errorf("config validation: %v", err) + return err + } + + cl, err := h.createClient() + if err != nil { + h.Errorf("error on creating client : %v", err) + return err + } + h.client = cl + + return nil +} + +func (h *HDFS) Check() error { + typ, err := h.determineNodeType() + if err != nil { + h.Errorf("error on node type determination : %v", err) + return err + } + h.nodeType = typ + + mx, err := h.collect() + if err != nil { + h.Error(err) + return err + } + if len(mx) == 0 { + return errors.New("no metrics collected") + } + return nil +} + +func (h *HDFS) Charts() *Charts { + switch h.nodeType { + default: + return nil + case nameNodeType: + return nameNodeCharts() + case dataNodeType: + return dataNodeCharts() + } +} + +func (h *HDFS) Collect() map[string]int64 { + mx, err := h.collect() + + if err != nil { + h.Error(err) + } + + if len(mx) == 0 { + return nil + } + + return mx +} + +func (h *HDFS) Cleanup() { + if h.client != nil && h.client.httpClient != nil { + h.client.httpClient.CloseIdleConnections() + } +} diff --git a/src/go/plugin/go.d/modules/hdfs/hdfs_test.go b/src/go/plugin/go.d/modules/hdfs/hdfs_test.go new file mode 100644 index 000000000..d24e50bb6 --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/hdfs_test.go @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package hdfs + +import ( + "net/http" + "net/http/httptest" + "os" + "testing" + + "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var ( + dataConfigJSON, _ = 
os.ReadFile("testdata/config.json") + dataConfigYAML, _ = os.ReadFile("testdata/config.yaml") + + dataUnknownNodeMetrics, _ = os.ReadFile("testdata/unknownnode.json") + dataDataNodeMetrics, _ = os.ReadFile("testdata/datanode.json") + dataNameNodeMetrics, _ = os.ReadFile("testdata/namenode.json") +) + +func Test_testDataIsValid(t *testing.T) { + for name, data := range map[string][]byte{ + "dataConfigJSON": dataConfigJSON, + "dataConfigYAML": dataConfigYAML, + "dataUnknownNodeMetrics": dataUnknownNodeMetrics, + "dataDataNodeMetrics": dataDataNodeMetrics, + "dataNameNodeMetrics": dataNameNodeMetrics, + } { + require.NotNil(t, data, name) + } +} + +func TestHDFS_ConfigurationSerialize(t *testing.T) { + module.TestConfigurationSerialize(t, &HDFS{}, dataConfigJSON, dataConfigYAML) +} + +func TestHDFS_Init(t *testing.T) { + job := New() + + assert.NoError(t, job.Init()) +} + +func TestHDFS_InitErrorOnCreatingClientWrongTLSCA(t *testing.T) { + job := New() + job.Client.TLSConfig.TLSCA = "testdata/tls" + + assert.Error(t, job.Init()) +} + +func TestHDFS_Check(t *testing.T) { + ts := httptest.NewServer( + http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write(dataNameNodeMetrics) + })) + defer ts.Close() + + job := New() + job.URL = ts.URL + require.NoError(t, job.Init()) + + assert.NoError(t, job.Check()) + assert.NotZero(t, job.nodeType) +} + +func TestHDFS_CheckDataNode(t *testing.T) { + ts := httptest.NewServer( + http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write(dataDataNodeMetrics) + })) + defer ts.Close() + + job := New() + job.URL = ts.URL + require.NoError(t, job.Init()) + + assert.NoError(t, job.Check()) + assert.Equal(t, dataNodeType, job.nodeType) +} + +func TestHDFS_CheckNameNode(t *testing.T) { + ts := httptest.NewServer( + http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write(dataNameNodeMetrics) + })) + defer ts.Close() + + job := New() + job.URL = ts.URL + require.NoError(t, job.Init()) + + assert.NoError(t, job.Check()) + assert.Equal(t, nameNodeType, job.nodeType) +} + +func TestHDFS_CheckErrorOnNodeTypeDetermination(t *testing.T) { + ts := httptest.NewServer( + http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write(dataUnknownNodeMetrics) + })) + defer ts.Close() + + job := New() + job.URL = ts.URL + require.NoError(t, job.Init()) + + assert.Error(t, job.Check()) +} + +func TestHDFS_CheckNoResponse(t *testing.T) { + job := New() + job.URL = "http://127.0.0.1:38001/jmx" + require.NoError(t, job.Init()) + + assert.Error(t, job.Check()) +} + +func TestHDFS_Charts(t *testing.T) { + assert.Nil(t, New().Charts()) +} + +func TestHDFS_ChartsUnknownNode(t *testing.T) { + job := New() + + assert.Nil(t, job.Charts()) +} + +func TestHDFS_ChartsDataNode(t *testing.T) { + job := New() + job.nodeType = dataNodeType + + assert.Equal(t, dataNodeCharts(), job.Charts()) +} + +func TestHDFS_ChartsNameNode(t *testing.T) { + job := New() + job.nodeType = nameNodeType + + assert.Equal(t, nameNodeCharts(), job.Charts()) +} + +func TestHDFS_Cleanup(t *testing.T) { + New().Cleanup() +} + +func TestHDFS_CollectDataNode(t *testing.T) { + ts := httptest.NewServer( + http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write(dataDataNodeMetrics) + })) + defer ts.Close() + + job := New() + job.URL = ts.URL + require.NoError(t, job.Init()) + require.NoError(t, job.Check()) + + expected := map[string]int64{ + "dna_bytes_read": 80689178, + "dna_bytes_written": 
500960407, + "fsds_capacity_remaining": 32920760320, + "fsds_capacity_total": 53675536384, + "fsds_capacity_used": 20754776064, + "fsds_capacity_used_dfs": 1186058240, + "fsds_capacity_used_non_dfs": 19568717824, + "fsds_num_failed_volumes": 0, + "jvm_gc_count": 155, + "jvm_gc_num_info_threshold_exceeded": 0, + "jvm_gc_num_warn_threshold_exceeded": 0, + "jvm_gc_time_millis": 672, + "jvm_gc_total_extra_sleep_time": 8783, + "jvm_log_error": 1, + "jvm_log_fatal": 0, + "jvm_log_info": 257, + "jvm_log_warn": 2, + "jvm_mem_heap_committed": 60500, + "jvm_mem_heap_max": 843, + "jvm_mem_heap_used": 18885, + "jvm_threads_blocked": 0, + "jvm_threads_new": 0, + "jvm_threads_runnable": 11, + "jvm_threads_terminated": 0, + "jvm_threads_timed_waiting": 25, + "jvm_threads_waiting": 11, + "rpc_call_queue_length": 0, + "rpc_num_open_connections": 0, + "rpc_processing_time_avg_time": 0, + "rpc_queue_time_avg_time": 0, + "rpc_queue_time_num_ops": 0, + "rpc_received_bytes": 7, + "rpc_sent_bytes": 187, + } + + assert.Equal(t, expected, job.Collect()) +} + +func TestHDFS_CollectNameNode(t *testing.T) { + ts := httptest.NewServer( + http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write(dataNameNodeMetrics) + })) + defer ts.Close() + + job := New() + job.URL = ts.URL + require.NoError(t, job.Init()) + require.NoError(t, job.Check()) + + expected := map[string]int64{ + "fsns_blocks_total": 15, + "fsns_capacity_remaining": 65861697536, + "fsns_capacity_total": 107351072768, + "fsns_capacity_used": 41489375232, + "fsns_capacity_used_dfs": 2372116480, + "fsns_capacity_used_non_dfs": 39117258752, + "fsns_corrupt_blocks": 0, + "fsns_files_total": 12, + "fsns_missing_blocks": 0, + "fsns_num_dead_data_nodes": 0, + "fsns_num_live_data_nodes": 2, + "fsns_stale_data_nodes": 0, + "fsns_total_load": 2, + "fsns_under_replicated_blocks": 0, + "fsns_volume_failures_total": 0, + "jvm_gc_count": 1699, + "jvm_gc_num_info_threshold_exceeded": 0, + "jvm_gc_num_warn_threshold_exceeded": 0, + "jvm_gc_time_millis": 3483, + "jvm_gc_total_extra_sleep_time": 1944, + "jvm_log_error": 0, + "jvm_log_fatal": 0, + "jvm_log_info": 3382077, + "jvm_log_warn": 3378983, + "jvm_mem_heap_committed": 67000, + "jvm_mem_heap_max": 843, + "jvm_mem_heap_used": 26603, + "jvm_threads_blocked": 0, + "jvm_threads_new": 0, + "jvm_threads_runnable": 7, + "jvm_threads_terminated": 0, + "jvm_threads_timed_waiting": 34, + "jvm_threads_waiting": 6, + "rpc_call_queue_length": 0, + "rpc_num_open_connections": 2, + "rpc_processing_time_avg_time": 0, + "rpc_queue_time_avg_time": 58, + "rpc_queue_time_num_ops": 585402, + "rpc_received_bytes": 240431351, + "rpc_sent_bytes": 25067414, + } + + assert.Equal(t, expected, job.Collect()) +} + +func TestHDFS_CollectUnknownNode(t *testing.T) { + ts := httptest.NewServer( + http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write(dataUnknownNodeMetrics) + })) + defer ts.Close() + + job := New() + job.URL = ts.URL + require.NoError(t, job.Init()) + + assert.Panics(t, func() { _ = job.Collect() }) +} + +func TestHDFS_CollectNoResponse(t *testing.T) { + job := New() + job.URL = "http://127.0.0.1:38001/jmx" + require.NoError(t, job.Init()) + + assert.Nil(t, job.Collect()) +} + +func TestHDFS_CollectReceiveInvalidResponse(t *testing.T) { + ts := httptest.NewServer( + http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + _, _ = w.Write([]byte("hello and\ngoodbye!\n")) + })) + defer ts.Close() + + job := New() + job.URL = ts.URL + require.NoError(t, job.Init()) + + 
assert.Nil(t, job.Collect()) +} + +func TestHDFS_CollectReceive404(t *testing.T) { + ts := httptest.NewServer( + http.HandlerFunc( + func(w http.ResponseWriter, r *http.Request) { + w.WriteHeader(http.StatusNotFound) + })) + defer ts.Close() + + job := New() + job.URL = ts.URL + require.NoError(t, job.Init()) + + assert.Nil(t, job.Collect()) +} diff --git a/src/go/plugin/go.d/modules/hdfs/init.go b/src/go/plugin/go.d/modules/hdfs/init.go new file mode 100644 index 000000000..1159ab73b --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/init.go @@ -0,0 +1,25 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package hdfs + +import ( + "errors" + + "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/web" +) + +func (h *HDFS) validateConfig() error { + if h.URL == "" { + return errors.New("url not set") + } + return nil +} + +func (h *HDFS) createClient() (*client, error) { + httpClient, err := web.NewHTTPClient(h.Client) + if err != nil { + return nil, err + } + + return newClient(httpClient, h.Request), nil +} diff --git a/src/go/plugin/go.d/modules/hdfs/integrations/hadoop_distributed_file_system_hdfs.md b/src/go/plugin/go.d/modules/hdfs/integrations/hadoop_distributed_file_system_hdfs.md new file mode 100644 index 000000000..e37ccde0c --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/integrations/hadoop_distributed_file_system_hdfs.md @@ -0,0 +1,286 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/go/plugin/go.d/modules/hdfs/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/go/plugin/go.d/modules/hdfs/metadata.yaml" +sidebar_label: "Hadoop Distributed File System (HDFS)" +learn_status: "Published" +learn_rel_path: "Collecting Metrics/Storage, Mount Points and Filesystems" +most_popular: True +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" +endmeta--> + +# Hadoop Distributed File System (HDFS) + + +<img src="https://netdata.cloud/img/hadoop.svg" width="150"/> + + +Plugin: go.d.plugin +Module: hfs + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Overview + +This collector monitors HDFS nodes. + +Netdata accesses HDFS metrics over `Java Management Extensions` (JMX) through the web interface of an HDFS daemon. + + + + +This collector is supported on all platforms. + +This collector supports collecting metrics from multiple instances of this integration, including remote instances. + + +### Default Behavior + +#### Auto-Detection + +This integration doesn't support auto-detection. + +#### Limits + +The default configuration for this integration does not impose any limits on data collection. + +#### Performance Impact + +The default configuration for this integration is not expected to impose a significant performance impact on the system. + + +## Metrics + +Metrics grouped by *scope*. + +The scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels. + + + +### Per Hadoop Distributed File System (HDFS) instance + +These metrics refer to the entire monitored application. + +This scope has no labels. 
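All of the metrics below are parsed from the `beans` array returned by the daemon's `/jmx` endpoint. A quick way to inspect the raw JSON the collector consumes is shown in the sketch below; it assumes a NameNode listening on the default port 9870 (a DataNode uses 9864 by default), and any other host or port is just an illustration:

```bash
# Dump the first few lines of the JMX JSON that the collector parses.
# Replace the host/port to match your NameNode (9870) or DataNode (9864).
curl -s http://127.0.0.1:9870/jmx | head -n 20
```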
+ +Metrics: + +| Metric | Dimensions | Unit | DataNode | NameNode | +|:------|:----------|:----|:---:|:---:| +| hdfs.heap_memory | committed, used | MiB | • | • | +| hdfs.gc_count_total | gc | events/s | • | • | +| hdfs.gc_time_total | ms | ms | • | • | +| hdfs.gc_threshold | info, warn | events/s | • | • | +| hdfs.threads | new, runnable, blocked, waiting, timed_waiting, terminated | num | • | • | +| hdfs.logs_total | info, error, warn, fatal | logs/s | • | • | +| hdfs.rpc_bandwidth | received, sent | kilobits/s | • | • | +| hdfs.rpc_calls | calls | calls/s | • | • | +| hdfs.open_connections | open | connections | • | • | +| hdfs.call_queue_length | length | num | • | • | +| hdfs.avg_queue_time | time | ms | • | • | +| hdfs.avg_processing_time | time | ms | • | • | +| hdfs.capacity | remaining, used | KiB | | • | +| hdfs.used_capacity | dfs, non_dfs | KiB | | • | +| hdfs.load | load | load | | • | +| hdfs.volume_failures_total | failures | events/s | | • | +| hdfs.files_total | files | num | | • | +| hdfs.blocks_total | blocks | num | | • | +| hdfs.blocks | corrupt, missing, under_replicated | num | | • | +| hdfs.data_nodes | live, dead, stale | num | | • | +| hdfs.datanode_capacity | remaining, used | KiB | • | | +| hdfs.datanode_used_capacity | dfs, non_dfs | KiB | • | | +| hdfs.datanode_failed_volumes | failed volumes | num | • | | +| hdfs.datanode_bandwidth | reads, writes | KiB/s | • | | + + + +## Alerts + + +The following alerts are available: + +| Alert name | On metric | Description | +|:------------|:----------|:------------| +| [ hdfs_capacity_usage ](https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf) | hdfs.capacity | summary datanodes space capacity utilization | +| [ hdfs_missing_blocks ](https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf) | hdfs.blocks | number of missing blocks | +| [ hdfs_stale_nodes ](https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf) | hdfs.data_nodes | number of datanodes marked stale due to delayed heartbeat | +| [ hdfs_dead_nodes ](https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf) | hdfs.data_nodes | number of datanodes which are currently dead | +| [ hdfs_num_failed_volumes ](https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf) | hdfs.num_failed_volumes | number of failed volumes | + + +## Setup + +### Prerequisites + +No action required. + +### Configuration + +#### File + +The configuration file name for this integration is `go.d/hdfs.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config go.d/hdfs.conf +``` +#### Options + +The following options can be defined globally: update_every, autodetection_retry. + + +<details open><summary>Config options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| update_every | Data collection frequency. | 1 | no | +| autodetection_retry | Recheck interval in seconds. Zero means no recheck will be scheduled. | 0 | no | +| url | Server URL. | http://127.0.0.1:9870/jmx | yes | +| timeout | HTTP request timeout. | 1 | no | +| username | Username for basic HTTP authentication. | | no | +| password | Password for basic HTTP authentication. | | no | +| proxy_url | Proxy URL. 
| | no | +| proxy_username | Username for proxy basic HTTP authentication. | | no | +| proxy_password | Password for proxy basic HTTP authentication. | | no | +| method | HTTP request method. | GET | no | +| body | HTTP request body. | | no | +| headers | HTTP request headers. | | no | +| not_follow_redirects | Redirect handling policy. Controls whether the client follows redirects. | no | no | +| tls_skip_verify | Server certificate chain and hostname validation policy. Controls whether the client performs this check. | no | no | +| tls_ca | Certification authority that the client uses when verifying the server's certificates. | | no | +| tls_cert | Client TLS certificate. | | no | +| tls_key | Client TLS key. | | no | + +</details> + +#### Examples + +##### Basic + +A basic example configuration. + +```yaml +jobs: + - name: local + url: http://127.0.0.1:9870/jmx + +``` +##### HTTP authentication + +Basic HTTP authentication. + +<details open><summary>Config</summary> + +```yaml +jobs: + - name: local + url: http://127.0.0.1:9870/jmx + username: username + password: password + +``` +</details> + +##### HTTPS with self-signed certificate + +Do not validate server certificate chain and hostname. + + +<details open><summary>Config</summary> + +```yaml +jobs: + - name: local + url: https://127.0.0.1:9870/jmx + tls_skip_verify: yes + +``` +</details> + +##### Multi-instance + +> **Note**: When you define multiple jobs, their names must be unique. + +Collecting metrics from local and remote instances. + + +<details open><summary>Config</summary> + +```yaml +jobs: + - name: local + url: http://127.0.0.1:9870/jmx + + - name: remote + url: http://192.0.2.1:9870/jmx + +``` +</details> + + + +## Troubleshooting + +### Debug Mode + +**Important**: Debug mode is not supported for data collection jobs created via the UI using the Dyncfg feature. + +To troubleshoot issues with the `hfs` collector, run the `go.d.plugin` with the debug option enabled. The output +should give you clues as to why the collector isn't working. + +- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on + your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`. + + ```bash + cd /usr/libexec/netdata/plugins.d/ + ``` + +- Switch to the `netdata` user. + + ```bash + sudo -u netdata -s + ``` + +- Run the `go.d.plugin` to debug the collector: + + ```bash + ./go.d.plugin -d -m hfs + ``` + +### Getting Logs + +If you're encountering problems with the `hfs` collector, follow these steps to retrieve logs and identify potential issues: + +- **Run the command** specific to your system (systemd, non-systemd, or Docker container). +- **Examine the output** for any warnings or error messages that might indicate issues. These messages should provide clues about the root cause of the problem. + +#### System with systemd + +Use the following command to view logs generated since the last Netdata service restart: + +```bash +journalctl _SYSTEMD_INVOCATION_ID="$(systemctl show --value --property=InvocationID netdata)" --namespace=netdata --grep hfs +``` + +#### System without systemd + +Locate the collector log file, typically at `/var/log/netdata/collector.log`, and use `grep` to filter for collector's name: + +```bash +grep hfs /var/log/netdata/collector.log +``` + +**Note**: This method shows logs from all restarts. Focus on the **latest entries** for troubleshooting current issues. 
+ +#### Docker Container + +If your Netdata runs in a Docker container named "netdata" (replace if different), use this command: + +```bash +docker logs netdata 2>&1 | grep hfs +``` + + diff --git a/src/go/plugin/go.d/modules/hdfs/metadata.yaml b/src/go/plugin/go.d/modules/hdfs/metadata.yaml new file mode 100644 index 000000000..694868e01 --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/metadata.yaml @@ -0,0 +1,388 @@ +plugin_name: go.d.plugin +modules: + - meta: + id: collector-go.d.plugin-hfs + plugin_name: go.d.plugin + module_name: hfs + monitored_instance: + name: Hadoop Distributed File System (HDFS) + link: https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html + icon_filename: hadoop.svg + categories: + - data-collection.storage-mount-points-and-filesystems + keywords: + - hdfs + - hadoop + related_resources: + integrations: + list: [] + info_provided_to_referring_integrations: + description: "" + most_popular: true + overview: + data_collection: + metrics_description: | + This collector monitors HDFS nodes. + + Netdata accesses HDFS metrics over `Java Management Extensions` (JMX) through the web interface of an HDFS daemon. + method_description: "" + supported_platforms: + include: [] + exclude: [] + multi_instance: true + additional_permissions: + description: "" + default_behavior: + auto_detection: + description: "" + limits: + description: "" + performance_impact: + description: "" + setup: + prerequisites: + list: [] + configuration: + file: + name: go.d/hdfs.conf + options: + description: | + The following options can be defined globally: update_every, autodetection_retry. + folding: + title: Config options + enabled: true + list: + - name: update_every + description: Data collection frequency. + default_value: 1 + required: false + - name: autodetection_retry + description: Recheck interval in seconds. Zero means no recheck will be scheduled. + default_value: 0 + required: false + - name: url + description: Server URL. + default_value: http://127.0.0.1:9870/jmx + required: true + - name: timeout + description: HTTP request timeout. + default_value: 1 + required: false + - name: username + description: Username for basic HTTP authentication. + default_value: "" + required: false + - name: password + description: Password for basic HTTP authentication. + default_value: "" + required: false + - name: proxy_url + description: Proxy URL. + default_value: "" + required: false + - name: proxy_username + description: Username for proxy basic HTTP authentication. + default_value: "" + required: false + - name: proxy_password + description: Password for proxy basic HTTP authentication. + default_value: "" + required: false + - name: method + description: HTTP request method. + default_value: "GET" + required: false + - name: body + description: HTTP request body. + default_value: "" + required: false + - name: headers + description: HTTP request headers. + default_value: "" + required: false + - name: not_follow_redirects + description: Redirect handling policy. Controls whether the client follows redirects. + default_value: no + required: false + - name: tls_skip_verify + description: Server certificate chain and hostname validation policy. Controls whether the client performs this check. + default_value: no + required: false + - name: tls_ca + description: Certification authority that the client uses when verifying the server's certificates. + default_value: "" + required: false + - name: tls_cert + description: Client TLS certificate. 
+ default_value: "" + required: false + - name: tls_key + description: Client TLS key. + default_value: "" + required: false + examples: + folding: + title: Config + enabled: true + list: + - name: Basic + folding: + enabled: false + description: A basic example configuration. + config: | + jobs: + - name: local + url: http://127.0.0.1:9870/jmx + - name: HTTP authentication + description: Basic HTTP authentication. + config: | + jobs: + - name: local + url: http://127.0.0.1:9870/jmx + username: username + password: password + - name: HTTPS with self-signed certificate + description: | + Do not validate server certificate chain and hostname. + config: | + jobs: + - name: local + url: https://127.0.0.1:9870/jmx + tls_skip_verify: yes + - name: Multi-instance + description: | + > **Note**: When you define multiple jobs, their names must be unique. + + Collecting metrics from local and remote instances. + config: | + jobs: + - name: local + url: http://127.0.0.1:9870/jmx + + - name: remote + url: http://192.0.2.1:9870/jmx + troubleshooting: + problems: + list: [] + alerts: + - name: hdfs_capacity_usage + metric: hdfs.capacity + info: summary datanodes space capacity utilization + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf + - name: hdfs_missing_blocks + metric: hdfs.blocks + info: number of missing blocks + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf + - name: hdfs_stale_nodes + metric: hdfs.data_nodes + info: number of datanodes marked stale due to delayed heartbeat + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf + - name: hdfs_dead_nodes + metric: hdfs.data_nodes + info: number of datanodes which are currently dead + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf + - name: hdfs_num_failed_volumes + metric: hdfs.num_failed_volumes + info: number of failed volumes + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/hdfs.conf + metrics: + folding: + title: Metrics + enabled: false + description: "" + availability: + - DataNode + - NameNode + scopes: + - name: global + description: These metrics refer to the entire monitored application. 
+ labels: [] + metrics: + - name: hdfs.heap_memory + description: Heap Memory + unit: MiB + chart_type: area + dimensions: + - name: committed + - name: used + - name: hdfs.gc_count_total + description: GC Events + unit: events/s + chart_type: line + dimensions: + - name: gc + - name: hdfs.gc_time_total + description: GC Time + unit: ms + chart_type: line + dimensions: + - name: ms + - name: hdfs.gc_threshold + description: Number of Times That the GC Threshold is Exceeded + unit: events/s + chart_type: line + dimensions: + - name: info + - name: warn + - name: hdfs.threads + description: Number of Threads + unit: num + chart_type: stacked + dimensions: + - name: new + - name: runnable + - name: blocked + - name: waiting + - name: timed_waiting + - name: terminated + - name: hdfs.logs_total + description: Number of Logs + unit: logs/s + chart_type: stacked + dimensions: + - name: info + - name: error + - name: warn + - name: fatal + - name: hdfs.rpc_bandwidth + description: RPC Bandwidth + unit: kilobits/s + chart_type: area + dimensions: + - name: received + - name: sent + - name: hdfs.rpc_calls + description: RPC Calls + unit: calls/s + chart_type: line + dimensions: + - name: calls + - name: hdfs.open_connections + description: RPC Open Connections + unit: connections + chart_type: line + dimensions: + - name: open + - name: hdfs.call_queue_length + description: RPC Call Queue Length + unit: num + chart_type: line + dimensions: + - name: length + - name: hdfs.avg_queue_time + description: RPC Avg Queue Time + unit: ms + chart_type: line + dimensions: + - name: time + - name: hdfs.avg_processing_time + description: RPC Avg Processing Time + unit: ms + chart_type: line + dimensions: + - name: time + - name: hdfs.capacity + description: Capacity Across All Datanodes + unit: KiB + chart_type: stacked + availability: + - NameNode + dimensions: + - name: remaining + - name: used + - name: hdfs.used_capacity + description: Used Capacity Across All Datanodes + unit: KiB + chart_type: stacked + availability: + - NameNode + dimensions: + - name: dfs + - name: non_dfs + - name: hdfs.load + description: Number of Concurrent File Accesses (read/write) Across All DataNodes + unit: load + chart_type: line + availability: + - NameNode + dimensions: + - name: load + - name: hdfs.volume_failures_total + description: Number of Volume Failures Across All Datanodes + unit: events/s + chart_type: line + availability: + - NameNode + dimensions: + - name: failures + - name: hdfs.files_total + description: Number of Tracked Files + unit: num + chart_type: line + availability: + - NameNode + dimensions: + - name: files + - name: hdfs.blocks_total + description: Number of Allocated Blocks in the System + unit: num + chart_type: line + availability: + - NameNode + dimensions: + - name: blocks + - name: hdfs.blocks + description: Number of Problem Blocks (can point to an unhealthy cluster) + unit: num + chart_type: line + availability: + - NameNode + dimensions: + - name: corrupt + - name: missing + - name: under_replicated + - name: hdfs.data_nodes + description: Number of Data Nodes By Status + unit: num + chart_type: stacked + availability: + - NameNode + dimensions: + - name: live + - name: dead + - name: stale + - name: hdfs.datanode_capacity + description: Capacity + unit: KiB + chart_type: stacked + availability: + - DataNode + dimensions: + - name: remaining + - name: used + - name: hdfs.datanode_used_capacity + description: Used Capacity + unit: KiB + chart_type: stacked + availability: + - DataNode + 
dimensions: + - name: dfs + - name: non_dfs + - name: hdfs.datanode_failed_volumes + description: Number of Failed Volumes + unit: num + chart_type: line + availability: + - DataNode + dimensions: + - name: failed volumes + - name: hdfs.datanode_bandwidth + description: Bandwidth + unit: KiB/s + chart_type: area + availability: + - DataNode + dimensions: + - name: reads + - name: writes diff --git a/src/go/plugin/go.d/modules/hdfs/metrics.go b/src/go/plugin/go.d/modules/hdfs/metrics.go new file mode 100644 index 000000000..972436a5d --- /dev/null +++ b/src/go/plugin/go.d/modules/hdfs/metrics.go @@ -0,0 +1,245 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package hdfs + +// HDFS Architecture +// https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html#NameNode+and+DataNodes + +// Metrics description +// https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-common/Metrics.html + +// Good article +// https://www.datadoghq.com/blog/monitor-hadoop-metrics/#hdfs-metrics + +type metrics struct { + Jvm *jvmMetrics `stm:"jvm"` // both + Rpc *rpcActivityMetrics `stm:"rpc"` // both + FSNameSystem *fsNameSystemMetrics `stm:"fsns"` // namenode + FSDatasetState *fsDatasetStateMetrics `stm:"fsds"` // datanode + DataNodeActivity *dataNodeActivityMetrics `stm:"dna"` // datanode +} + +type jvmMetrics struct { + ProcessName string `json:"tag.ProcessName"` + HostName string `json:"tag.Hostname"` + //MemNonHeapUsedM float64 `stm:"mem_non_heap_used,1000,1"` + //MemNonHeapCommittedM float64 `stm:"mem_non_heap_committed,1000,1"` + //MemNonHeapMaxM float64 `stm:"mem_non_heap_max"` + MemHeapUsedM float64 `stm:"mem_heap_used,1000,1"` + MemHeapCommittedM float64 `stm:"mem_heap_committed,1000,1"` + MemHeapMaxM float64 `stm:"mem_heap_max"` + //MemMaxM float64 `stm:"mem_max"` + GcCount float64 `stm:"gc_count"` + GcTimeMillis float64 `stm:"gc_time_millis"` + GcNumWarnThresholdExceeded float64 `stm:"gc_num_warn_threshold_exceeded"` + GcNumInfoThresholdExceeded float64 `stm:"gc_num_info_threshold_exceeded"` + GcTotalExtraSleepTime float64 `stm:"gc_total_extra_sleep_time"` + ThreadsNew float64 `stm:"threads_new"` + ThreadsRunnable float64 `stm:"threads_runnable"` + ThreadsBlocked float64 `stm:"threads_blocked"` + ThreadsWaiting float64 `stm:"threads_waiting"` + ThreadsTimedWaiting float64 `stm:"threads_timed_waiting"` + ThreadsTerminated float64 `stm:"threads_terminated"` + LogFatal float64 `stm:"log_fatal"` + LogError float64 `stm:"log_error"` + LogWarn float64 `stm:"log_warn"` + LogInfo float64 `stm:"log_info"` +} + +type rpcActivityMetrics struct { + ReceivedBytes float64 `stm:"received_bytes"` + SentBytes float64 `stm:"sent_bytes"` + RpcQueueTimeNumOps float64 `stm:"queue_time_num_ops"` + RpcQueueTimeAvgTime float64 `stm:"queue_time_avg_time,1000,1"` + //RpcProcessingTimeNumOps float64 + RpcProcessingTimeAvgTime float64 `stm:"processing_time_avg_time,1000,1"` + //DeferredRpcProcessingTimeNumOps float64 + //DeferredRpcProcessingTimeAvgTime float64 + //RpcAuthenticationFailures float64 + //RpcAuthenticationSuccesses float64 + //RpcAuthorizationFailures float64 + //RpcAuthorizationSuccesses float64 + //RpcClientBackoff float64 + //RpcSlowCalls float64 + NumOpenConnections float64 `stm:"num_open_connections"` + CallQueueLength float64 `stm:"call_queue_length"` + //NumDroppedConnections float64 +} + +type fsNameSystemMetrics struct { + HostName string `json:"tag.Hostname"` + HAState string `json:"tag.HAState"` + //TotalSyncTimes float64 `json:"tag.tag.TotalSyncTimes"` + MissingBlocks float64 
`stm:"missing_blocks"`
+ //MissingReplOneBlocks float64 `stm:"missing_repl_one_blocks"`
+ //ExpiredHeartbeats float64 `stm:"expired_heartbeats"`
+ //TransactionsSinceLastCheckpoint float64 `stm:"transactions_since_last_checkpoint"`
+ //TransactionsSinceLastLogRoll float64 `stm:"transactions_since_last_log_roll"`
+ //LastWrittenTransactionId float64 `stm:"last_written_transaction_id"`
+ //LastCheckpointTime float64 `stm:"last_checkpoint_time"`
+ CapacityTotal float64 `stm:"capacity_total"`
+ //CapacityTotalGB float64 `stm:"capacity_total_gb"`
+ CapacityDfsUsed float64 `json:"CapacityUsed" stm:"capacity_used_dfs"`
+ //CapacityUsedGB float64 `stm:"capacity_used_gb"`
+ CapacityRemaining float64 `stm:"capacity_remaining"`
+ //ProvidedCapacityTotal float64 `stm:"provided_capacity_total"`
+ //CapacityRemainingGB float64 `stm:"capacity_remaining_gb"`
+ CapacityUsedNonDFS float64 `stm:"capacity_used_non_dfs"`
+ TotalLoad float64 `stm:"total_load"`
+ //SnapshottableDirectories float64 `stm:"snapshottable_directories"`
+ //Snapshots float64 `stm:"snapshots"`
+ //NumEncryptionZones float64 `stm:"num_encryption_zones"`
+ //LockQueueLength float64 `stm:"lock_queue_length"`
+ BlocksTotal float64 `stm:"blocks_total"`
+ //NumFilesUnderConstruction float64 `stm:"num_files_under_construction"`
+ //NumActiveClients float64 `stm:"num_active_clients"`
+ FilesTotal float64 `stm:"files_total"`
+ //PendingReplicationBlocks float64 `stm:"pending_replication_blocks"`
+ //PendingReconstructionBlocks float64 `stm:"pending_reconstruction_blocks"`
+ UnderReplicatedBlocks float64 `stm:"under_replicated_blocks"`
+ //LowRedundancyBlocks float64 `stm:"low_redundancy_blocks"`
+ CorruptBlocks float64 `stm:"corrupt_blocks"`
+ //ScheduledReplicationBlocks float64 `stm:"scheduled_replication_blocks"`
+ //PendingDeletionBlocks float64 `stm:"pending_deletion_blocks"`
+ //LowRedundancyReplicatedBlocks float64 `stm:"low_redundancy_replicated_blocks"`
+ //CorruptReplicatedBlocks float64 `stm:"corrupt_replicated_blocks"`
+ //MissingReplicatedBlocks float64 `stm:"missing_replicated_blocks"`
+ //MissingReplicationOneBlocks float64 `stm:"missing_replication_one_blocks"`
+ //HighestPriorityLowRedundancyReplicatedBlocks float64 `stm:"highest_priority_low_redundancy_replicated_blocks"`
+ //HighestPriorityLowRedundancyECBlocks float64 `stm:"highest_priority_low_redundancy_ec_blocks"`
+ //BytesInFutureReplicatedBlocks float64 `stm:"bytes_in_future_replicated_blocks"`
+ //PendingDeletionReplicatedBlocks float64 `stm:"pending_deletion_replicated_blocks"`
+ //TotalReplicatedBlocks float64 `stm:"total_replicated_blocks"`
+ //LowRedundancyECBlockGroups float64 `stm:"low_redundancy_ec_block_groups"`
+ //CorruptECBlockGroups float64 `stm:"corrupt_ec_block_groups"`
+ //MissingECBlockGroups float64 `stm:"missing_ec_block_groups"`
+ //BytesInFutureECBlockGroups float64 `stm:"bytes_in_future_ec_block_groups"`
+ //PendingDeletionECBlocks float64 `stm:"pending_deletion_ec_blocks"`
+ //TotalECBlockGroups float64 `stm:"total_ec_block_groups"`
+ //ExcessBlocks float64 `stm:"excess_blocks"`
+ //NumTimedOutPendingReconstructions float64 `stm:"num_timed_out_pending_reconstructions"`
+ //PostponedMisreplicatedBlocks float64 `stm:"postponed_misreplicated_blocks"`
+ //PendingDataNodeMessageCount float64 `stm:"pending_data_node_message_count"`
+ //MillisSinceLastLoadedEdits float64 `stm:"millis_since_last_loaded_edits"`
+ //BlockCapacity float64 `stm:"block_capacity"`
+ NumLiveDataNodes float64 `stm:"num_live_data_nodes"`
+ NumDeadDataNodes float64 `stm:"num_dead_data_nodes"`
+ //NumDecomLiveDataNodes float64 `stm:"num_decom_live_data_nodes"`
+ //NumDecomDeadDataNodes float64 `stm:"num_decom_dead_data_nodes"`
+ VolumeFailuresTotal float64 `stm:"volume_failures_total"`
+ //EstimatedCapacityLostTotal float64 `stm:"estimated_capacity_lost_total"`
+ //NumDecommissioningDataNodes float64 `stm:"num_decommissioning_data_nodes"`
+ StaleDataNodes float64 `stm:"stale_data_nodes"`
+ //NumStaleStorages float64 `stm:"num_stale_storages"`
+ //TotalSyncCount float64 `stm:"total_sync_count"`
+ //NumInMaintenanceLiveDataNodes float64 `stm:"num_in_maintenance_live_data_nodes"`
+ //NumInMaintenanceDeadDataNodes float64 `stm:"num_in_maintenance_dead_data_nodes"`
+ //NumEnteringMaintenanceDataNodes float64 `stm:"num_entering_maintenance_data_nodes"`
+
+ // custom attributes
+ CapacityUsed float64 `json:"-" stm:"capacity_used"`
+}
+
+type fsDatasetStateMetrics struct {
+ HostName string `json:"tag.Hostname"`
+ Capacity float64 `stm:"capacity_total"`
+ DfsUsed float64 `stm:"capacity_used_dfs"`
+ Remaining float64 `stm:"capacity_remaining"`
+ NumFailedVolumes float64 `stm:"num_failed_volumes"`
+ //LastVolumeFailureDate float64 `stm:"LastVolumeFailureDate"`
+ //EstimatedCapacityLostTotal float64 `stm:"EstimatedCapacityLostTotal"`
+ //CacheUsed float64 `stm:"CacheUsed"`
+ //CacheCapacity float64 `stm:"CacheCapacity"`
+ //NumBlocksCached float64 `stm:"NumBlocksCached"`
+ //NumBlocksFailedToCache float64 `stm:"NumBlocksFailedToCache"`
+ //NumBlocksFailedToUnCache float64 `stm:"NumBlocksFailedToUnCache"`
+
+ // custom attributes
+ CapacityUsedNonDFS float64 `stm:"capacity_used_non_dfs"`
+ CapacityUsed float64 `stm:"capacity_used"`
+}
+
+type dataNodeActivityMetrics struct {
+ HostName string `json:"tag.Hostname"`
+ BytesWritten float64 `stm:"bytes_written"`
+ //TotalWriteTime float64
+ BytesRead float64 `stm:"bytes_read"`
+ //TotalReadTime float64
+ //BlocksWritten float64
+ //BlocksRead float64
+ //BlocksReplicated float64
+ //BlocksRemoved float64
+ //BlocksVerified float64
+ //BlockVerificationFailures float64
+ //BlocksCached float64
+ //BlocksUncached float64
+ //ReadsFromLocalClient float64
+ //ReadsFromRemoteClient float64
+ //WritesFromLocalClient float64
+ //WritesFromRemoteClient float64
+ //BlocksGetLocalPathInfo float64
+ //RemoteBytesRead float64
+ //RemoteBytesWritten float64
+ //RamDiskBlocksWrite float64
+ //RamDiskBlocksWriteFallback float64
+ //RamDiskBytesWrite float64
+ //RamDiskBlocksReadHits float64
+ //RamDiskBlocksEvicted float64
+ //RamDiskBlocksEvictedWithoutRead float64
+ //RamDiskBlocksEvictionWindowMsNumOps float64
+ //RamDiskBlocksEvictionWindowMsAvgTime float64
+ //RamDiskBlocksLazyPersisted float64
+ //RamDiskBlocksDeletedBeforeLazyPersisted float64
+ //RamDiskBytesLazyPersisted float64
+ //RamDiskBlocksLazyPersistWindowMsNumOps float64
+ //RamDiskBlocksLazyPersistWindowMsAvgTime float64
+ //FsyncCount float64
+ //VolumeFailures float64
+ //DatanodeNetworkErrors float64
+ //DataNodeActiveXceiversCount float64
+ //ReadBlockOpNumOps float64
+ //ReadBlockOpAvgTime float64
+ //WriteBlockOpNumOps float64
+ //WriteBlockOpAvgTime float64
+ //BlockChecksumOpNumOps float64
+ //BlockChecksumOpAvgTime float64
+ //CopyBlockOpNumOps float64
+ //CopyBlockOpAvgTime float64
+ //ReplaceBlockOpNumOps float64
+ //ReplaceBlockOpAvgTime float64
+ //HeartbeatsNumOps float64
+ //HeartbeatsAvgTime float64
+ //HeartbeatsTotalNumOps float64
+ //HeartbeatsTotalAvgTime float64
+ //LifelinesNumOps float64
+ //LifelinesAvgTime float64
+ //BlockReportsNumOps float64
+ //BlockReportsAvgTime float64
+ //IncrementalBlockReportsNumOps float64
+ //IncrementalBlockReportsAvgTime float64
+ //CacheReportsNumOps float64
+ //CacheReportsAvgTime float64
+ //PacketAckRoundTripTimeNanosNumOps float64
+ //PacketAckRoundTripTimeNanosAvgTime float64
+ //FlushNanosNumOps float64
+ //FlushNanosAvgTime float64
+ //FsyncNanosNumOps float64
+ //FsyncNanosAvgTime float64
+ //SendDataPacketBlockedOnNetworkNanosNumOps float64
+ //SendDataPacketBlockedOnNetworkNanosAvgTime float64
+ //SendDataPacketTransferNanosNumOps float64
+ //SendDataPacketTransferNanosAvgTime float64
+ //BlocksInPendingIBR float64
+ //BlocksReceivingInPendingIBR float64
+ //BlocksReceivedInPendingIBR float64
+ //BlocksDeletedInPendingIBR float64
+ //EcReconstructionTasks float64
+ //EcFailedReconstructionTasks float64
+ //EcDecodingTimeNanos float64
+ //EcReconstructionBytesRead float64
+ //EcReconstructionBytesWritten float64
+ //EcReconstructionRemoteBytesRead float64
+ //EcReconstructionReadTimeMillis float64
+ //EcReconstructionDecodingTimeMillis float64
+ //EcReconstructionWriteTimeMillis float64
+}
diff --git a/src/go/plugin/go.d/modules/hdfs/raw_data.go b/src/go/plugin/go.d/modules/hdfs/raw_data.go
new file mode 100644
index 000000000..ab434ae17
--- /dev/null
+++ b/src/go/plugin/go.d/modules/hdfs/raw_data.go
@@ -0,0 +1,51 @@
+package hdfs
+
+import (
+ "encoding/json"
+ "strings"
+)
+
+type (
+ rawData map[string]json.RawMessage
+ rawJMX struct {
+ Beans []rawData
+ }
+)
+
+func (r rawJMX) isEmpty() bool {
+ return len(r.Beans) == 0
+}
+
+func (r rawJMX) find(f func(rawData) bool) rawData {
+ for _, v := range r.Beans {
+ if f(v) {
+ return v
+ }
+ }
+ return nil
+}
+
+func (r rawJMX) findJvm() rawData {
+ f := func(data rawData) bool { return string(data["modelerType"]) == "\"JvmMetrics\"" }
+ return r.find(f)
+}
+
+func (r rawJMX) findRPCActivity() rawData {
+ f := func(data rawData) bool { return strings.HasPrefix(string(data["modelerType"]), "\"RpcActivityForPort") }
+ return r.find(f)
+}
+
+func (r rawJMX) findFSNameSystem() rawData {
+ f := func(data rawData) bool { return string(data["modelerType"]) == "\"FSNamesystem\"" }
+ return r.find(f)
+}
+
+func (r rawJMX) findFSDatasetState() rawData {
+ f := func(data rawData) bool { return string(data["modelerType"]) == "\"FSDatasetState\"" }
+ return r.find(f)
+}
+
+func (r rawJMX) findDataNodeActivity() rawData {
+ f := func(data rawData) bool { return strings.HasPrefix(string(data["modelerType"]), "\"DataNodeActivity") }
+ return r.find(f)
+}
diff --git a/src/go/plugin/go.d/modules/hdfs/testdata/config.json b/src/go/plugin/go.d/modules/hdfs/testdata/config.json
new file mode 100644
index 000000000..984c3ed6e
--- /dev/null
+++ b/src/go/plugin/go.d/modules/hdfs/testdata/config.json
@@ -0,0 +1,20 @@
+{
+ "update_every": 123,
+ "url": "ok",
+ "body": "ok",
+ "method": "ok",
+ "headers": {
+ "ok": "ok"
+ },
+ "username": "ok",
+ "password": "ok",
+ "proxy_url": "ok",
+ "proxy_username": "ok",
+ "proxy_password": "ok",
+ "timeout": 123.123,
+ "not_follow_redirects": true,
+ "tls_ca": "ok",
+ "tls_cert": "ok",
+ "tls_key": "ok",
+ "tls_skip_verify": true
+}
diff --git a/src/go/plugin/go.d/modules/hdfs/testdata/config.yaml b/src/go/plugin/go.d/modules/hdfs/testdata/config.yaml
new file mode 100644
index 000000000..8558b61cc
--- /dev/null
+++ b/src/go/plugin/go.d/modules/hdfs/testdata/config.yaml
@@ -0,0 +1,17 @@
+update_every: 123
+url: "ok"
+body: "ok"
+method: "ok"
+headers:
+ ok: "ok"
+username: "ok"
+password: "ok"
+proxy_url: "ok"
+proxy_username: "ok"
+proxy_password: "ok"
+timeout: 123.123
+not_follow_redirects: yes
+tls_ca: "ok"
+tls_cert: "ok"
+tls_key: "ok"
+tls_skip_verify: yes
diff --git a/src/go/plugin/go.d/modules/hdfs/testdata/datanode.json b/src/go/plugin/go.d/modules/hdfs/testdata/datanode.json
new file mode 100644
index 000000000..0f657d560
--- /dev/null
+++ b/src/go/plugin/go.d/modules/hdfs/testdata/datanode.json
@@ -0,0 +1,165 @@
+{
+ "beans":[
+ {
+ "name":"Hadoop:service=DataNode,name=JvmMetrics",
+ "modelerType":"JvmMetrics",
+ "tag.Context":"jvm",
+ "tag.ProcessName":"DataNode",
+ "tag.SessionId":null,
+ "tag.Hostname":"dev-slave-01.dev.local",
+ "MemNonHeapUsedM":53.67546,
+ "MemNonHeapCommittedM":54.9375,
+ "MemNonHeapMaxM":-1.0,
+ "MemHeapUsedM":18.885735,
+ "MemHeapCommittedM":60.5,
+ "MemHeapMaxM":843.0,
+ "MemMaxM":843.0,
+ "GcCount":155,
+ "GcTimeMillis":672,
+ "GcNumWarnThresholdExceeded":0,
+ "GcNumInfoThresholdExceeded":0,
+ "GcTotalExtraSleepTime":8783,
+ "ThreadsNew":0,
+ "ThreadsRunnable":11,
+ "ThreadsBlocked":0,
+ "ThreadsWaiting":11,
+ "ThreadsTimedWaiting":25,
+ "ThreadsTerminated":0,
+ "LogFatal":0,
+ "LogError":1,
+ "LogWarn":2,
+ "LogInfo":257
+ },
+ {
+ "name":"Hadoop:service=DataNode,name=FSDatasetState",
+ "modelerType":"FSDatasetState",
+ "tag.Context":"FSDatasetState",
+ "tag.StorageInfo":"FSDataset{dirpath='[/data/hdfs/data]'}",
+ "tag.Hostname":"dev-slave-01.dev.local",
+ "Capacity":53675536384,
+ "DfsUsed":1186058240,
+ "Remaining":32920760320,
+ "NumFailedVolumes":0,
+ "LastVolumeFailureDate":0,
+ "EstimatedCapacityLostTotal":0,
+ "CacheUsed":0,
+ "CacheCapacity":0,
+ "NumBlocksCached":0,
+ "NumBlocksFailedToCache":0,
+ "NumBlocksFailedToUnCache":4
+ },
+ {
+ "name":"Hadoop:service=DataNode,name=DataNodeActivity-dev-slave-01.dev.local-9866",
+ "modelerType":"DataNodeActivity-dev-slave-01.dev.local-9866",
+ "tag.SessionId":null,
+ "tag.Context":"dfs",
+ "tag.Hostname":"dev-slave-01.dev.local",
+ "BytesWritten":500960407,
+ "TotalWriteTime":463,
+ "BytesRead":80689178,
+ "TotalReadTime":41203,
+ "BlocksWritten":16,
+ "BlocksRead":16,
+ "BlocksReplicated":4,
+ "BlocksRemoved":4,
+ "BlocksVerified":0,
+ "BlockVerificationFailures":0,
+ "BlocksCached":0,
+ "BlocksUncached":0,
+ "ReadsFromLocalClient":0,
+ "ReadsFromRemoteClient":16,
+ "WritesFromLocalClient":0,
+ "WritesFromRemoteClient":12,
+ "BlocksGetLocalPathInfo":0,
+ "RemoteBytesRead":80689178,
+ "RemoteBytesWritten":97283223,
+ "RamDiskBlocksWrite":0,
+ "RamDiskBlocksWriteFallback":0,
+ "RamDiskBytesWrite":0,
+ "RamDiskBlocksReadHits":0,
+ "RamDiskBlocksEvicted":0,
+ "RamDiskBlocksEvictedWithoutRead":0,
+ "RamDiskBlocksEvictionWindowMsNumOps":0,
+ "RamDiskBlocksEvictionWindowMsAvgTime":0.0,
+ "RamDiskBlocksLazyPersisted":0,
+ "RamDiskBlocksDeletedBeforeLazyPersisted":0,
+ "RamDiskBytesLazyPersisted":0,
+ "RamDiskBlocksLazyPersistWindowMsNumOps":0,
+ "RamDiskBlocksLazyPersistWindowMsAvgTime":0.0,
+ "FsyncCount":0,
+ "VolumeFailures":0,
+ "DatanodeNetworkErrors":7,
+ "DataNodeActiveXceiversCount":0,
+ "ReadBlockOpNumOps":16,
+ "ReadBlockOpAvgTime":2258.2,
+ "WriteBlockOpNumOps":12,
+ "WriteBlockOpAvgTime":12640.666666666666,
+ "BlockChecksumOpNumOps":0,
+ "BlockChecksumOpAvgTime":0.0,
+ "CopyBlockOpNumOps":0,
+ "CopyBlockOpAvgTime":0.0,
+ "ReplaceBlockOpNumOps":0,
+ "ReplaceBlockOpAvgTime":0.0,
+ "HeartbeatsNumOps":285073,
+ "HeartbeatsAvgTime":1.2035398230088497,
+ "HeartbeatsTotalNumOps":285073,
+ "HeartbeatsTotalAvgTime":1.2035398230088497,
+ "LifelinesNumOps":0,
+ "LifelinesAvgTime":0.0,
+ "BlockReportsNumOps":41,
+ "BlockReportsAvgTime":2.0,
+ "IncrementalBlockReportsNumOps":20,
+ "IncrementalBlockReportsAvgTime":1.2,
+ "CacheReportsNumOps":0,
+ "CacheReportsAvgTime":0.0,
+ "PacketAckRoundTripTimeNanosNumOps":603,
+ "PacketAckRoundTripTimeNanosAvgTime":1733672.0,
+ "FlushNanosNumOps":7660,
+ "FlushNanosAvgTime":3988.858108108108,
+ "FsyncNanosNumOps":0,
+ "FsyncNanosAvgTime":0.0,
+ "SendDataPacketBlockedOnNetworkNanosNumOps":7091,
+ "SendDataPacketBlockedOnNetworkNanosAvgTime":2.4469053762711864E7,
+ "SendDataPacketTransferNanosNumOps":7091,
+ "SendDataPacketTransferNanosAvgTime":37130.05084745763,
+ "BlocksInPendingIBR":0,
+ "BlocksReceivingInPendingIBR":0,
+ "BlocksReceivedInPendingIBR":0,
+ "BlocksDeletedInPendingIBR":0,
+ "EcReconstructionTasks":0,
+ "EcFailedReconstructionTasks":0,
+ "EcDecodingTimeNanos":0,
+ "EcReconstructionBytesRead":0,
+ "EcReconstructionBytesWritten":0,
+ "EcReconstructionRemoteBytesRead":0,
+ "EcReconstructionReadTimeMillis":0,
+ "EcReconstructionDecodingTimeMillis":0,
+ "EcReconstructionWriteTimeMillis":0
+ },
+ {
+ "name":"Hadoop:service=DataNode,name=RpcActivityForPort9867",
+ "modelerType":"RpcActivityForPort9867",
+ "tag.port":"9867",
+ "tag.Context":"rpc",
+ "tag.NumOpenConnectionsPerUser":"{}",
+ "tag.Hostname":"dev-slave-01.dev.local",
+ "ReceivedBytes":7,
+ "SentBytes":187,
+ "RpcQueueTimeNumOps":0,
+ "RpcQueueTimeAvgTime":0.0,
+ "RpcProcessingTimeNumOps":0,
+ "RpcProcessingTimeAvgTime":0.0,
+ "DeferredRpcProcessingTimeNumOps":0,
+ "DeferredRpcProcessingTimeAvgTime":0.0,
+ "RpcAuthenticationFailures":0,
+ "RpcAuthenticationSuccesses":0,
+ "RpcAuthorizationFailures":0,
+ "RpcAuthorizationSuccesses":0,
+ "RpcClientBackoff":0,
+ "RpcSlowCalls":0,
+ "NumOpenConnections":0,
+ "CallQueueLength":0,
+ "NumDroppedConnections":0
+ }
+ ]
+}
\ No newline at end of file
diff --git a/src/go/plugin/go.d/modules/hdfs/testdata/namenode.json b/src/go/plugin/go.d/modules/hdfs/testdata/namenode.json
new file mode 100644
index 000000000..2d33d32f3
--- /dev/null
+++ b/src/go/plugin/go.d/modules/hdfs/testdata/namenode.json
@@ -0,0 +1,132 @@
+{
+ "beans":[
+ {
+ "name":"Hadoop:service=NameNode,name=JvmMetrics",
+ "modelerType":"JvmMetrics",
+ "tag.Context":"jvm",
+ "tag.ProcessName":"NameNode",
+ "tag.SessionId":null,
+ "tag.Hostname":"dev-master-02.dev.local",
+ "MemNonHeapUsedM":66.170395,
+ "MemNonHeapCommittedM":67.75,
+ "MemNonHeapMaxM":-1.0,
+ "MemHeapUsedM":26.603287,
+ "MemHeapCommittedM":67.0,
+ "MemHeapMaxM":843.0,
+ "MemMaxM":843.0,
+ "GcCount":1699,
+ "GcTimeMillis":3483,
+ "GcNumWarnThresholdExceeded":0,
+ "GcNumInfoThresholdExceeded":0,
+ "GcTotalExtraSleepTime":1944,
+ "ThreadsNew":0,
+ "ThreadsRunnable":7,
+ "ThreadsBlocked":0,
+ "ThreadsWaiting":6,
+ "ThreadsTimedWaiting":34,
+ "ThreadsTerminated":0,
+ "LogFatal":0,
+ "LogError":0,
+ "LogWarn":3378983,
+ "LogInfo":3382077
+ },
+ {
+ "name":"Hadoop:service=NameNode,name=FSNamesystem",
+ "modelerType":"FSNamesystem",
+ "tag.Context":"dfs",
+ "tag.HAState":"active",
+ "tag.TotalSyncTimes":"98 ",
+ "tag.Hostname":"dev-master-02.dev.local",
+ "MissingBlocks":0,
+ "MissingReplOneBlocks":0,
+ "ExpiredHeartbeats":0,
+ "TransactionsSinceLastCheckpoint":1,
+ "TransactionsSinceLastLogRoll":1,
+ "LastWrittenTransactionId":624,
+ "LastCheckpointTime":1566814983890,
+ "CapacityTotal":107351072768,
+ "CapacityTotalGB":100.0,
+ "CapacityUsed":2372116480,
+ "CapacityUsedGB":2.0,
+ "CapacityRemaining":65861697536,
+ "ProvidedCapacityTotal":0,
+ "CapacityRemainingGB":61.0,
+ "CapacityUsedNonDFS":39117258752,
+ "TotalLoad":2,
+ "SnapshottableDirectories":0,
+ "Snapshots":0,
+ "NumEncryptionZones":0,
+ "LockQueueLength":0,
+ "BlocksTotal":15,
+ "NumFilesUnderConstruction":0,
+ "NumActiveClients":0,
+ "FilesTotal":12,
+ "PendingReplicationBlocks":0,
+ "PendingReconstructionBlocks":0,
+ "UnderReplicatedBlocks":0,
+ "LowRedundancyBlocks":0,
+ "CorruptBlocks":0,
+ "ScheduledReplicationBlocks":0,
+ "PendingDeletionBlocks":0,
+ "LowRedundancyReplicatedBlocks":0,
+ "CorruptReplicatedBlocks":0,
+ "MissingReplicatedBlocks":0,
+ "MissingReplicationOneBlocks":0,
+ "HighestPriorityLowRedundancyReplicatedBlocks":0,
+ "HighestPriorityLowRedundancyECBlocks":0,
+ "BytesInFutureReplicatedBlocks":0,
+ "PendingDeletionReplicatedBlocks":0,
+ "TotalReplicatedBlocks":15,
+ "LowRedundancyECBlockGroups":0,
+ "CorruptECBlockGroups":0,
+ "MissingECBlockGroups":0,
+ "BytesInFutureECBlockGroups":0,
+ "PendingDeletionECBlocks":0,
+ "TotalECBlockGroups":0,
+ "ExcessBlocks":0,
+ "NumTimedOutPendingReconstructions":0,
+ "PostponedMisreplicatedBlocks":0,
+ "PendingDataNodeMessageCount":0,
+ "MillisSinceLastLoadedEdits":0,
+ "BlockCapacity":2097152,
+ "NumLiveDataNodes":2,
+ "NumDeadDataNodes":0,
+ "NumDecomLiveDataNodes":0,
+ "NumDecomDeadDataNodes":0,
+ "VolumeFailuresTotal":0,
+ "EstimatedCapacityLostTotal":0,
+ "NumDecommissioningDataNodes":0,
+ "StaleDataNodes":0,
+ "NumStaleStorages":0,
+ "TotalSyncCount":2,
+ "NumInMaintenanceLiveDataNodes":0,
+ "NumInMaintenanceDeadDataNodes":0,
+ "NumEnteringMaintenanceDataNodes":0
+ },
+ {
+ "name":"Hadoop:service=NameNode,name=RpcActivityForPort9000",
+ "modelerType":"RpcActivityForPort9000",
+ "tag.port":"9000",
+ "tag.Context":"rpc",
+ "tag.NumOpenConnectionsPerUser":"{\"hadoop\":2}",
+ "tag.Hostname":"dev-master-02.dev.local",
+ "ReceivedBytes":240431351,
+ "SentBytes":25067414,
+ "RpcQueueTimeNumOps":585402,
+ "RpcQueueTimeAvgTime":0.05813953488372093,
+ "RpcProcessingTimeNumOps":585402,
+ "RpcProcessingTimeAvgTime":0.0,
+ "DeferredRpcProcessingTimeNumOps":0,
+ "DeferredRpcProcessingTimeAvgTime":0.0,
+ "RpcAuthenticationFailures":0,
+ "RpcAuthenticationSuccesses":0,
+ "RpcAuthorizationFailures":0,
+ "RpcAuthorizationSuccesses":14327,
+ "RpcClientBackoff":0,
+ "RpcSlowCalls":0,
+ "NumOpenConnections":2,
+ "CallQueueLength":0,
+ "NumDroppedConnections":0
+ }
+ ]
+}
\ No newline at end of file
diff --git a/src/go/plugin/go.d/modules/hdfs/testdata/unknownnode.json b/src/go/plugin/go.d/modules/hdfs/testdata/unknownnode.json
new file mode 100644
index 000000000..7370a7a37
--- /dev/null
+++ b/src/go/plugin/go.d/modules/hdfs/testdata/unknownnode.json
@@ -0,0 +1,34 @@
+{
+ "beans":[
+ {
+ "name":"Hadoop:service=UnknownNode,name=JvmMetrics",
+ "modelerType":"JvmMetrics",
+ "tag.Context":"jvm",
+ "tag.ProcessName":"UnknownNode",
+ "tag.SessionId":null,
+ "tag.Hostname":"dev-slave-01.dev.local",
+ "MemNonHeapUsedM":53.67546,
+ "MemNonHeapCommittedM":54.9375,
+ "MemNonHeapMaxM":-1.0,
+ "MemHeapUsedM":18.885735,
+ "MemHeapCommittedM":60.5,
+ "MemHeapMaxM":843.0,
+ "MemMaxM":843.0,
+ "GcCount":155,
+ "GcTimeMillis":672,
+ "GcNumWarnThresholdExceeded":0,
+ "GcNumInfoThresholdExceeded":0,
+ "GcTotalExtraSleepTime":8783,
+ "ThreadsNew":1,
+ "ThreadsRunnable":2,
+ "ThreadsBlocked":3,
+ "ThreadsWaiting":4,
+ "ThreadsTimedWaiting":5,
+ "ThreadsTerminated":6,
+ "LogFatal":10,
+ "LogError":11,
+ "LogWarn":12,
+ "LogInfo":13
+ }
+ ]
+}
\ No newline at end of file
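For reference, below is a minimal standalone sketch of how the bean probing shown in raw_data.go can be exercised against a Hadoop /jmx endpoint, or equivalently against the namenode.json, datanode.json, and unknownnode.json fixtures above. The rawData/rawJMX types and the modelerType checks mirror raw_data.go; the URL, the port, and the fetch-and-switch wiring in main are illustrative assumptions only (the collector itself drives this through its configured HTTP client, not a bare http.Get).

// Sketch: decode a /jmx response and detect the node type by looking at
// which MBeans ("modelerType") are present, the same idea raw_data.go uses.
package main

import (
    "encoding/json"
    "fmt"
    "net/http"
    "strings"
)

// rawData/rawJMX mirror raw_data.go: each bean is kept as raw JSON and is
// identified by its "modelerType" value, which still carries the JSON quotes,
// hence the escaped string comparisons below.
type (
    rawData map[string]json.RawMessage
    rawJMX  struct {
        Beans []rawData
    }
)

func (r rawJMX) find(f func(rawData) bool) rawData {
    for _, v := range r.Beans {
        if f(v) {
            return v
        }
    }
    return nil
}

func (r rawJMX) findFSNameSystem() rawData {
    return r.find(func(d rawData) bool { return string(d["modelerType"]) == "\"FSNamesystem\"" })
}

func (r rawJMX) findFSDatasetState() rawData {
    return r.find(func(d rawData) bool { return string(d["modelerType"]) == "\"FSDatasetState\"" })
}

func (r rawJMX) findDataNodeActivity() rawData {
    return r.find(func(d rawData) bool {
        return strings.HasPrefix(string(d["modelerType"]), "\"DataNodeActivity")
    })
}

func main() {
    // Placeholder endpoint: 9870 is the usual NameNode web port, a DataNode
    // would typically answer on :9864/jmx. The testdata files hold the same payload shape.
    resp, err := http.Get("http://127.0.0.1:9870/jmx")
    if err != nil {
        panic(err)
    }
    defer resp.Body.Close()

    var jmx rawJMX
    if err := json.NewDecoder(resp.Body).Decode(&jmx); err != nil {
        panic(err)
    }

    // Which beans are present tells us what kind of HDFS node answered.
    switch {
    case jmx.findFSNameSystem() != nil:
        fmt.Println("NameNode (FSNamesystem bean present)")
    case jmx.findFSDatasetState() != nil && jmx.findDataNodeActivity() != nil:
        fmt.Println("DataNode (FSDatasetState and DataNodeActivity beans present)")
    default:
        fmt.Println("unknown node type (only JvmMetrics, as in unknownnode.json)")
    }
}

Against the fixtures, namenode.json resolves to the FSNamesystem branch, datanode.json to the FSDatasetState/DataNodeActivity branch, and unknownnode.json (JvmMetrics only) falls through to the default case.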