diff options
Diffstat (limited to '')
-rw-r--r-- | src/go/plugin/go.d/modules/consul/collect_metrics.go | 205 |
1 files changed, 205 insertions, 0 deletions
diff --git a/src/go/plugin/go.d/modules/consul/collect_metrics.go b/src/go/plugin/go.d/modules/consul/collect_metrics.go new file mode 100644 index 000000000..557ecf64c --- /dev/null +++ b/src/go/plugin/go.d/modules/consul/collect_metrics.go @@ -0,0 +1,205 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package consul + +import ( + "fmt" + "math" + "strconv" + "strings" + + "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/prometheus" +) + +func (c *Consul) collectMetricsPrometheus(mx map[string]int64) error { + mfs, err := c.prom.Scrape() + if err != nil { + return err + } + + // Key Metrics (https://developer.hashicorp.com/consul/docs/agent/telemetry#key-metrics) + + // prometheus metrics are messy: + // - if 'disable_hostname' is false (default): + // - consul_autopilot_failure_tolerance => consul_hostname_autopilot_failure_tolerance + // - both are exposed + // - only the one with the hostname has the correct value + // - 1.14.3 (it probably has something to do with cloud management version): + // - runtime_sys_bytes => runtime_sys_bytes_sys_bytes; consul_autopilot_healthy => consul_autopilot_healthy_healthy + // - both are exposed + // - only the one with the double name has the correct value + + if c.isServer() { + c.collectSummary(mx, mfs, "raft_thread_main_saturation") + c.collectSummary(mx, mfs, "raft_thread_fsm_saturation") + c.collectSummary(mx, mfs, "raft_boltdb_logsPerBatch") + c.collectSummary(mx, mfs, "kvs_apply") + c.collectSummary(mx, mfs, "txn_apply") + c.collectSummary(mx, mfs, "raft_boltdb_storeLogs") + c.collectSummary(mx, mfs, "raft_rpc_installSnapshot") // make sense for followers only + c.collectSummary(mx, mfs, "raft_commitTime") // make sense for leader only + c.collectSummary(mx, mfs, "raft_leader_lastContact") // make sense for leader only + + c.collectCounter(mx, mfs, "raft_apply", precision) // make sense for leader only + c.collectCounter(mx, mfs, "raft_state_candidate", 1) + c.collectCounter(mx, mfs, "raft_state_leader", 1) + + c.collectGaugeBool(mx, mfs, "autopilot_healthy", "autopilot_healthy_healthy") + c.collectGaugeBool(mx, mfs, "server_isLeader", "server_isLeader_isLeader") + c.collectGauge(mx, mfs, "autopilot_failure_tolerance", 1, "autopilot_failure_tolerance_failure_tolerance") + c.collectGauge(mx, mfs, "raft_fsm_lastRestoreDuration", 1) + c.collectGauge(mx, mfs, "raft_leader_oldestLogAge", 1, "raft_leader_oldestLogAge_oldestLogAge") + c.collectGauge(mx, mfs, "raft_boltdb_freelistBytes", 1, "raft_boltdb_freelistBytes_freelistBytes") + + if isLeader, ok := c.isLeader(mfs); ok { + if isLeader && !c.hasLeaderCharts { + c.addLeaderCharts() + c.hasLeaderCharts = true + } + if !isLeader && c.hasLeaderCharts { + c.removeLeaderCharts() + c.hasLeaderCharts = false + } + if !isLeader && !c.hasFollowerCharts { + c.addFollowerCharts() + c.hasFollowerCharts = true + } + if isLeader && c.hasFollowerCharts { + c.removeFollowerCharts() + c.hasFollowerCharts = false + } + } + } + + c.collectGauge(mx, mfs, "system_licenseExpiration", 3600, "system_licenseExpiration_licenseExpiration") + + c.collectCounter(mx, mfs, "client_rpc", 1) + c.collectCounter(mx, mfs, "client_rpc_exceeded", 1) + c.collectCounter(mx, mfs, "client_rpc_failed", 1) + + c.collectGauge(mx, mfs, "runtime_alloc_bytes", 1, "runtime_alloc_bytes_alloc_bytes") + c.collectGauge(mx, mfs, "runtime_sys_bytes", 1, "runtime_sys_bytes_sys_bytes") + c.collectGauge(mx, mfs, "runtime_total_gc_pause_ns", 1, "runtime_total_gc_pause_ns_total_gc_pause_ns") + + return nil +} + +func (c *Consul) isLeader(mfs prometheus.MetricFamilies) (bool, bool) { + var mf *prometheus.MetricFamily + for _, v := range []string{"server_isLeader_isLeader", "server_isLeader"} { + if mf = mfs.GetGauge(c.promMetricNameWithHostname(v)); mf != nil { + break + } + if mf = mfs.GetGauge(c.promMetricName(v)); mf != nil { + break + } + } + + if mf == nil { + return false, false + } + + return mf.Metrics()[0].Gauge().Value() == 1, true +} + +func (c *Consul) collectGauge(mx map[string]int64, mfs prometheus.MetricFamilies, name string, mul float64, aliases ...string) { + var mf *prometheus.MetricFamily + for _, v := range append(aliases, name) { + if mf = mfs.GetGauge(c.promMetricNameWithHostname(v)); mf != nil { + break + } + if mf = mfs.GetGauge(c.promMetricName(v)); mf != nil { + break + } + } + + if mf == nil { + return + } + + v := mf.Metrics()[0].Gauge().Value() + + if !math.IsNaN(v) { + mx[name] = int64(v * mul) + } +} + +func (c *Consul) collectGaugeBool(mx map[string]int64, mfs prometheus.MetricFamilies, name string, aliases ...string) { + var mf *prometheus.MetricFamily + for _, v := range append(aliases, name) { + if mf = mfs.GetGauge(c.promMetricNameWithHostname(v)); mf != nil { + break + } + if mf = mfs.GetGauge(c.promMetricName(v)); mf != nil { + break + } + } + + if mf == nil { + return + } + + v := mf.Metrics()[0].Gauge().Value() + + if !math.IsNaN(v) { + mx[name+"_yes"] = boolToInt(v == 1) + mx[name+"_no"] = boolToInt(v == 0) + } +} + +func (c *Consul) collectCounter(mx map[string]int64, mfs prometheus.MetricFamilies, name string, mul float64) { + mf := mfs.GetCounter(c.promMetricName(name)) + if mf == nil { + return + } + + v := mf.Metrics()[0].Counter().Value() + + if !math.IsNaN(v) { + mx[name] = int64(v * mul) + } +} + +func (c *Consul) collectSummary(mx map[string]int64, mfs prometheus.MetricFamilies, name string) { + mf := mfs.GetSummary(c.promMetricName(name)) + if mf == nil { + return + } + + m := mf.Metrics()[0] + + for _, q := range m.Summary().Quantiles() { + v := q.Value() + // MaxAge is 10 seconds (hardcoded) + // https://github.com/hashicorp/go-metrics/blob/b6d5c860c07ef6eeec89f4a662c7b452dd4d0c93/prometheus/prometheus.go#L227 + if math.IsNaN(v) { + v = 0 + } + + id := fmt.Sprintf("%s_quantile=%s", name, formatFloat(q.Quantile())) + mx[id] = int64(v * precision * precision) + } + + mx[name+"_sum"] = int64(m.Summary().Sum() * precision) + mx[name+"_count"] = int64(m.Summary().Count()) +} + +func (c *Consul) promMetricName(name string) string { + px := c.cfg.DebugConfig.Telemetry.MetricsPrefix + return px + "_" + name +} + +var forbiddenCharsReplacer = strings.NewReplacer(" ", "_", ".", "_", "=", "_", "-", "_", "/", "_") + +// controlled by 'disable_hostname' +// https://developer.hashicorp.com/consul/docs/agent/config/config-files#telemetry-disable_hostname +func (c *Consul) promMetricNameWithHostname(name string) string { + px := c.cfg.DebugConfig.Telemetry.MetricsPrefix + node := forbiddenCharsReplacer.Replace(c.cfg.Config.NodeName) + + return px + "_" + node + "_" + name +} + +func formatFloat(v float64) string { + return strconv.FormatFloat(v, 'f', -1, 64) +} |