Diffstat
l---------  src/go/collectors/go.d.plugin/modules/k8s_state/README.md  1
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/charts.go  785
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/client.go  64
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/cluster_meta.go  84
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/collect.go  264
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/config_schema.json  25
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/discover_kubernetes.go  160
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/discover_node.go  105
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/discover_pod.go  105
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/init.go  15
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/integrations/kubernetes_cluster_state.md  218
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/kube_state.go  147
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/kube_state_test.go  859
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/metadata.yaml  356
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/resource.go  44
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/state.go  165
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/testdata/config.json  3
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/testdata/config.yaml  1
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/update_node_state.go  47
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/update_pod_state.go  165
-rw-r--r--  src/go/collectors/go.d.plugin/modules/k8s_state/update_state.go  27
21 files changed, 3640 insertions, 0 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/README.md b/src/go/collectors/go.d.plugin/modules/k8s_state/README.md
new file mode 120000
index 000000000..72c4e5cab
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/README.md
@@ -0,0 +1 @@
+integrations/kubernetes_cluster_state.md \ No newline at end of file
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/charts.go b/src/go/collectors/go.d.plugin/modules/k8s_state/charts.go
new file mode 100644
index 000000000..0cec12512
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/charts.go
@@ -0,0 +1,785 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "fmt"
+ "regexp"
+ "strings"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/module"
+)
+
+// NETDATA_CHART_PRIO_CGROUPS_CONTAINERS 40000
+const prioDiscoveryDiscovererState = 50999
+
+const (
+ prioNodeAllocatableCPURequestsUtil = 50100 + iota
+ prioNodeAllocatableCPURequestsUsed
+ prioNodeAllocatableCPULimitsUtil
+ prioNodeAllocatableCPULimitsUsed
+ prioNodeAllocatableMemRequestsUtil
+ prioNodeAllocatableMemRequestsUsed
+ prioNodeAllocatableMemLimitsUtil
+ prioNodeAllocatableMemLimitsUsed
+ prioNodeAllocatablePodsUtil
+ prioNodeAllocatablePodsUsage
+ prioNodeConditions
+ prioNodeSchedulability
+ prioNodePodsReadiness
+ prioNodePodsReadinessState
+ prioNodePodsCondition
+ prioNodePodsPhase
+ prioNodeContainersCount
+ prioNodeContainersState
+ prioNodeInitContainersState
+ prioNodeAge
+)
+
+const (
+ prioPodCPURequestsUsed = 50300 + iota
+ prioPodCPULimitsUsed
+ prioPodMemRequestsUsed
+ prioPodMemLimitsUsed
+ prioPodCondition
+ prioPodPhase
+ prioPodAge
+ prioPodContainersCount
+ prioPodContainersState
+ prioPodInitContainersState
+ prioPodContainerReadinessState
+ prioPodContainerRestarts
+ prioPodContainerState
+ prioPodContainerWaitingStateReason
+ prioPodContainerTerminatedStateReason
+)
+
+const (
+ labelKeyPrefix = "k8s_"
+ //labelKeyLabelPrefix = labelKeyPrefix + "label_"
+ //labelKeyAnnotationPrefix = labelKeyPrefix + "annotation_"
+ labelKeyClusterID = labelKeyPrefix + "cluster_id"
+ labelKeyClusterName = labelKeyPrefix + "cluster_name"
+ labelKeyNamespace = labelKeyPrefix + "namespace"
+ labelKeyKind = labelKeyPrefix + "kind"
+ labelKeyPodName = labelKeyPrefix + "pod_name"
+ labelKeyNodeName = labelKeyPrefix + "node_name"
+ labelKeyPodUID = labelKeyPrefix + "pod_uid"
+ labelKeyControllerKind = labelKeyPrefix + "controller_kind"
+ labelKeyControllerName = labelKeyPrefix + "controller_name"
+ labelKeyContainerName = labelKeyPrefix + "container_name"
+ labelKeyContainerID = labelKeyPrefix + "container_id"
+ labelKeyQoSClass = labelKeyPrefix + "qos_class"
+)
+
+var baseCharts = module.Charts{
+ discoveryStatusChart.Copy(),
+}
+
+var nodeChartsTmpl = module.Charts{
+ nodeAllocatableCPURequestsUtilChartTmpl.Copy(),
+ nodeAllocatableCPURequestsUsedChartTmpl.Copy(),
+ nodeAllocatableCPULimitsUtilChartTmpl.Copy(),
+ nodeAllocatableCPULimitsUsedChartTmpl.Copy(),
+ nodeAllocatableMemRequestsUtilChartTmpl.Copy(),
+ nodeAllocatableMemRequestsUsedChartTmpl.Copy(),
+ nodeAllocatableMemLimitsUtilChartTmpl.Copy(),
+ nodeAllocatableMemLimitsUsedChartTmpl.Copy(),
+ nodeAllocatablePodsUtilizationChartTmpl.Copy(),
+ nodeAllocatablePodsUsageChartTmpl.Copy(),
+ nodeConditionsChartTmpl.Copy(),
+ nodeSchedulabilityChartTmpl.Copy(),
+ nodePodsReadinessChartTmpl.Copy(),
+ nodePodsReadinessStateChartTmpl.Copy(),
+ nodePodsConditionChartTmpl.Copy(),
+ nodePodsPhaseChartTmpl.Copy(),
+ nodeContainersChartTmpl.Copy(),
+ nodeContainersStateChartTmpl.Copy(),
+ nodeInitContainersStateChartTmpl.Copy(),
+ nodeAgeChartTmpl.Copy(),
+}
+
+var podChartsTmpl = module.Charts{
+ podCPURequestsUsedChartTmpl.Copy(),
+ podCPULimitsUsedChartTmpl.Copy(),
+ podMemRequestsUsedChartTmpl.Copy(),
+ podMemLimitsUsedChartTmpl.Copy(),
+ podConditionChartTmpl.Copy(),
+ podPhaseChartTmpl.Copy(),
+ podAgeChartTmpl.Copy(),
+ podContainersCountChartTmpl.Copy(),
+ podContainersStateChartTmpl.Copy(),
+ podInitContainersStateChartTmpl.Copy(),
+}
+
+var containerChartsTmpl = module.Charts{
+ containerReadinessStateChartTmpl.Copy(),
+ containerRestartsChartTmpl.Copy(),
+ containersStateChartTmpl.Copy(),
+ containersStateWaitingChartTmpl.Copy(),
+ containersStateTerminatedChartTmpl.Copy(),
+}
+
+var (
+ // CPU resource
+ nodeAllocatableCPURequestsUtilChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocatable_cpu_requests_utilization",
+ Title: "CPU requests utilization",
+ Units: "%",
+ Fam: "node cpu resource",
+ Ctx: "k8s_state.node_allocatable_cpu_requests_utilization",
+ Priority: prioNodeAllocatableCPURequestsUtil,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_cpu_requests_util", Name: "requests", Div: precision},
+ },
+ }
+ nodeAllocatableCPURequestsUsedChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocatable_cpu_requests_used",
+ Title: "CPU requests used",
+ Units: "millicpu",
+ Fam: "node cpu resource",
+ Ctx: "k8s_state.node_allocatable_cpu_requests_used",
+ Priority: prioNodeAllocatableCPURequestsUsed,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_cpu_requests_used", Name: "requests"},
+ },
+ }
+ nodeAllocatableCPULimitsUtilChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocatable_cpu_limits_utilization",
+ Title: "CPU limits utilization",
+ Units: "%",
+ Fam: "node cpu resource",
+ Ctx: "k8s_state.node_allocatable_cpu_limits_utilization",
+ Priority: prioNodeAllocatableCPULimitsUtil,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_cpu_limits_util", Name: "limits", Div: precision},
+ },
+ }
+ nodeAllocatableCPULimitsUsedChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocatable_cpu_limits_used",
+ Title: "CPU limits used",
+ Units: "millicpu",
+ Fam: "node cpu resource",
+ Ctx: "k8s_state.node_allocatable_cpu_limits_used",
+ Priority: prioNodeAllocatableCPULimitsUsed,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_cpu_limits_used", Name: "limits"},
+ },
+ }
+ // memory resource
+ nodeAllocatableMemRequestsUtilChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocatable_mem_requests_utilization",
+ Title: "Memory requests utilization",
+ Units: "%",
+ Fam: "node mem resource",
+ Ctx: "k8s_state.node_allocatable_mem_requests_utilization",
+ Priority: prioNodeAllocatableMemRequestsUtil,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_mem_requests_util", Name: "requests", Div: precision},
+ },
+ }
+ nodeAllocatableMemRequestsUsedChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocatable_mem_requests_used",
+ Title: "Memory requests used",
+ Units: "bytes",
+ Fam: "node mem resource",
+ Ctx: "k8s_state.node_allocatable_mem_requests_used",
+ Priority: prioNodeAllocatableMemRequestsUsed,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_mem_requests_used", Name: "requests"},
+ },
+ }
+ nodeAllocatableMemLimitsUtilChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocatable_mem_limits_utilization",
+ Title: "Memory limits utilization",
+ Units: "%",
+ Fam: "node mem resource",
+ Ctx: "k8s_state.node_allocatable_mem_limits_utilization",
+ Priority: prioNodeAllocatableMemLimitsUtil,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_mem_limits_util", Name: "limits", Div: precision},
+ },
+ }
+ nodeAllocatableMemLimitsUsedChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocatable_mem_limits_used",
+ Title: "Memory limits used",
+ Units: "bytes",
+ Fam: "node mem resource",
+ Ctx: "k8s_state.node_allocatable_mem_limits_used",
+ Priority: prioNodeAllocatableMemLimitsUsed,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_mem_limits_used", Name: "limits"},
+ },
+ }
+ // pods resource
+ nodeAllocatablePodsUtilizationChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocatable_pods_utilization",
+ Title: "Pods resource utilization",
+ Units: "%",
+ Fam: "node pods resource",
+ Ctx: "k8s_state.node_allocatable_pods_utilization",
+ Priority: prioNodeAllocatablePodsUtil,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_pods_util", Name: "allocated", Div: precision},
+ },
+ }
+ nodeAllocatablePodsUsageChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.allocated_pods_usage",
+ Title: "Pods resource usage",
+ Units: "pods",
+ Fam: "node pods resource",
+ Ctx: "k8s_state.node_allocatable_pods_usage",
+ Type: module.Stacked,
+ Priority: prioNodeAllocatablePodsUsage,
+ Dims: module.Dims{
+ {ID: "node_%s_alloc_pods_available", Name: "available"},
+ {ID: "node_%s_alloc_pods_allocated", Name: "allocated"},
+ },
+ }
+ // condition
+ nodeConditionsChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.condition_status",
+ Title: "Condition status",
+ Units: "status",
+ Fam: "node condition",
+ Ctx: "k8s_state.node_condition",
+ Priority: prioNodeConditions,
+ }
+ nodeSchedulabilityChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.schedulability",
+ Title: "Schedulability",
+ Units: "state",
+ Fam: "node schedulability",
+ Ctx: "k8s_state.node_schedulability",
+ Priority: prioNodeSchedulability,
+ Dims: module.Dims{
+ {ID: "node_%s_schedulability_schedulable", Name: "schedulable"},
+ {ID: "node_%s_schedulability_unschedulable", Name: "unschedulable"},
+ },
+ }
+ // pods readiness
+ nodePodsReadinessChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.pods_readiness",
+ Title: "Pods readiness",
+ Units: "%",
+ Fam: "node pods readiness",
+ Ctx: "k8s_state.node_pods_readiness",
+ Priority: prioNodePodsReadiness,
+ Dims: module.Dims{
+ {ID: "node_%s_pods_readiness", Name: "ready", Div: precision},
+ },
+ }
+ nodePodsReadinessStateChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.pods_readiness_state",
+ Title: "Pods readiness state",
+ Units: "pods",
+ Fam: "node pods readiness",
+ Ctx: "k8s_state.node_pods_readiness_state",
+ Type: module.Stacked,
+ Priority: prioNodePodsReadinessState,
+ Dims: module.Dims{
+ {ID: "node_%s_pods_readiness_ready", Name: "ready"},
+ {ID: "node_%s_pods_readiness_unready", Name: "unready"},
+ },
+ }
+ // pods condition
+ nodePodsConditionChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.pods_condition",
+ Title: "Pods condition",
+ Units: "pods",
+ Fam: "node pods condition",
+ Ctx: "k8s_state.node_pods_condition",
+ Priority: prioNodePodsCondition,
+ Dims: module.Dims{
+ {ID: "node_%s_pods_cond_podready", Name: "pod_ready"},
+ {ID: "node_%s_pods_cond_podscheduled", Name: "pod_scheduled"},
+ {ID: "node_%s_pods_cond_podinitialized", Name: "pod_initialized"},
+ {ID: "node_%s_pods_cond_containersready", Name: "containers_ready"},
+ },
+ }
+ // pods phase
+ nodePodsPhaseChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.pods_phase",
+ Title: "Pods phase",
+ Units: "pods",
+ Fam: "node pods phase",
+ Ctx: "k8s_state.node_pods_phase",
+ Type: module.Stacked,
+ Priority: prioNodePodsPhase,
+ Dims: module.Dims{
+ {ID: "node_%s_pods_phase_running", Name: "running"},
+ {ID: "node_%s_pods_phase_failed", Name: "failed"},
+ {ID: "node_%s_pods_phase_succeeded", Name: "succeeded"},
+ {ID: "node_%s_pods_phase_pending", Name: "pending"},
+ },
+ }
+ // containers
+ nodeContainersChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.containers",
+ Title: "Containers",
+ Units: "containers",
+ Fam: "node containers",
+ Ctx: "k8s_state.node_containers",
+ Priority: prioNodeContainersCount,
+ Dims: module.Dims{
+ {ID: "node_%s_containers", Name: "containers"},
+ {ID: "node_%s_init_containers", Name: "init_containers"},
+ },
+ }
+ nodeContainersStateChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.containers_state",
+ Title: "Containers state",
+ Units: "containers",
+ Fam: "node containers",
+ Ctx: "k8s_state.node_containers_state",
+ Type: module.Stacked,
+ Priority: prioNodeContainersState,
+ Dims: module.Dims{
+ {ID: "node_%s_containers_state_running", Name: "running"},
+ {ID: "node_%s_containers_state_waiting", Name: "waiting"},
+ {ID: "node_%s_containers_state_terminated", Name: "terminated"},
+ },
+ }
+ nodeInitContainersStateChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.init_containers_state",
+ Title: "Init containers state",
+ Units: "containers",
+ Fam: "node containers",
+ Ctx: "k8s_state.node_init_containers_state",
+ Type: module.Stacked,
+ Priority: prioNodeInitContainersState,
+ Dims: module.Dims{
+ {ID: "node_%s_init_containers_state_running", Name: "running"},
+ {ID: "node_%s_init_containers_state_waiting", Name: "waiting"},
+ {ID: "node_%s_init_containers_state_terminated", Name: "terminated"},
+ },
+ }
+ // age
+ nodeAgeChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "node_%s.age",
+ Title: "Age",
+ Units: "seconds",
+ Fam: "node age",
+ Ctx: "k8s_state.node_age",
+ Priority: prioNodeAge,
+ Dims: module.Dims{
+ {ID: "node_%s_age", Name: "age"},
+ },
+ }
+)
+
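+// newNodeCharts instantiates the node chart templates for a single node: the
+// node ID is formatted into chart IDs (with dots replaced by dashes) and into
+// dimension IDs, and the cluster/node labels are attached to each chart.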
+func (ks *KubeState) newNodeCharts(ns *nodeState) *module.Charts {
+ cs := nodeChartsTmpl.Copy()
+ for _, c := range *cs {
+ c.ID = fmt.Sprintf(c.ID, replaceDots(ns.id()))
+ c.Labels = ks.newNodeChartLabels(ns)
+ for _, d := range c.Dims {
+ d.ID = fmt.Sprintf(d.ID, ns.id())
+ }
+ }
+ return cs
+}
+
+func (ks *KubeState) newNodeChartLabels(ns *nodeState) []module.Label {
+ labels := []module.Label{
+ {Key: labelKeyNodeName, Value: ns.name, Source: module.LabelSourceK8s},
+ {Key: labelKeyClusterID, Value: ks.kubeClusterID, Source: module.LabelSourceK8s},
+ {Key: labelKeyClusterName, Value: ks.kubeClusterName, Source: module.LabelSourceK8s},
+ }
+ return labels
+}
+
+func (ks *KubeState) addNodeCharts(ns *nodeState) {
+ cs := ks.newNodeCharts(ns)
+ if err := ks.Charts().Add(*cs...); err != nil {
+ ks.Warning(err)
+ }
+}
+
+func (ks *KubeState) removeNodeCharts(ns *nodeState) {
+ prefix := fmt.Sprintf("node_%s", replaceDots(ns.id()))
+ for _, c := range *ks.Charts() {
+ if strings.HasPrefix(c.ID, prefix) {
+ c.MarkRemove()
+ c.MarkNotCreated()
+ }
+ }
+}
+
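+// Node condition types are not known in advance, so a dimension is added to
+// the node condition chart the first time each condition type is seen.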
+func (ks *KubeState) addNodeConditionToCharts(ns *nodeState, cond string) {
+ id := fmt.Sprintf(nodeConditionsChartTmpl.ID, replaceDots(ns.id()))
+ c := ks.Charts().Get(id)
+ if c == nil {
+ ks.Warningf("chart '%s' does not exist", id)
+ return
+ }
+ dim := &module.Dim{
+ ID: fmt.Sprintf("node_%s_cond_%s", ns.id(), strings.ToLower(cond)),
+ Name: cond,
+ }
+ if err := c.AddDim(dim); err != nil {
+ ks.Warning(err)
+ return
+ }
+ c.MarkNotCreated()
+}
+
+var (
+ podCPURequestsUsedChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.cpu_requests_used",
+ Title: "CPU requests used",
+ Units: "millicpu",
+ Fam: "pod allocated cpu",
+ Ctx: "k8s_state.pod_cpu_requests_used",
+ Priority: prioPodCPURequestsUsed,
+ Dims: module.Dims{
+ {ID: "pod_%s_cpu_requests_used", Name: "requests"},
+ },
+ }
+ podCPULimitsUsedChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.cpu_limits_used",
+ Title: "CPU limits used",
+ Units: "millicpu",
+ Fam: "pod allocated cpu",
+ Ctx: "k8s_state.pod_cpu_limits_used",
+ Priority: prioPodCPULimitsUsed,
+ Dims: module.Dims{
+ {ID: "pod_%s_cpu_limits_used", Name: "limits"},
+ },
+ }
+ podMemRequestsUsedChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.mem_requests_used",
+ Title: "Memory requests used",
+ Units: "bytes",
+ Fam: "pod allocated mem",
+ Ctx: "k8s_state.pod_mem_requests_used",
+ Priority: prioPodMemRequestsUsed,
+ Dims: module.Dims{
+ {ID: "pod_%s_mem_requests_used", Name: "requests"},
+ },
+ }
+ podMemLimitsUsedChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.mem_limits_used",
+ Title: "Memory limits used",
+ Units: "bytes",
+ Fam: "pod allocated mem",
+ Ctx: "k8s_state.pod_mem_limits_used",
+ Priority: prioPodMemLimitsUsed,
+ Dims: module.Dims{
+ {ID: "pod_%s_mem_limits_used", Name: "limits"},
+ },
+ }
+ podConditionChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.condition",
+ Title: "Condition",
+ Units: "state",
+ Fam: "pod condition",
+ Ctx: "k8s_state.pod_condition",
+ Priority: prioPodCondition,
+ Dims: module.Dims{
+ {ID: "pod_%s_cond_podready", Name: "pod_ready"},
+ {ID: "pod_%s_cond_podscheduled", Name: "pod_scheduled"},
+ {ID: "pod_%s_cond_podinitialized", Name: "pod_initialized"},
+ {ID: "pod_%s_cond_containersready", Name: "containers_ready"},
+ },
+ }
+ podPhaseChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.phase",
+ Title: "Phase",
+ Units: "state",
+ Fam: "pod phase",
+ Ctx: "k8s_state.pod_phase",
+ Priority: prioPodPhase,
+ Dims: module.Dims{
+ {ID: "pod_%s_phase_running", Name: "running"},
+ {ID: "pod_%s_phase_failed", Name: "failed"},
+ {ID: "pod_%s_phase_succeeded", Name: "succeeded"},
+ {ID: "pod_%s_phase_pending", Name: "pending"},
+ },
+ }
+ podAgeChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.age",
+ Title: "Age",
+ Units: "seconds",
+ Fam: "pod age",
+ Ctx: "k8s_state.pod_age",
+ Priority: prioPodAge,
+ Dims: module.Dims{
+ {ID: "pod_%s_age", Name: "age"},
+ },
+ }
+ podContainersCountChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.containers_count",
+ Title: "Containers",
+ Units: "containers",
+ Fam: "pod containers",
+ Ctx: "k8s_state.pod_containers",
+ Priority: prioPodContainersCount,
+ Dims: module.Dims{
+ {ID: "pod_%s_containers", Name: "containers"},
+ {ID: "pod_%s_init_containers", Name: "init_containers"},
+ },
+ }
+ podContainersStateChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.containers_state",
+ Title: "Containers state",
+ Units: "containers",
+ Fam: "pod containers",
+ Ctx: "k8s_state.pod_containers_state",
+ Type: module.Stacked,
+ Priority: prioPodContainersState,
+ Dims: module.Dims{
+ {ID: "pod_%s_containers_state_running", Name: "running"},
+ {ID: "pod_%s_containers_state_waiting", Name: "waiting"},
+ {ID: "pod_%s_containers_state_terminated", Name: "terminated"},
+ },
+ }
+ podInitContainersStateChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s.init_containers_state",
+ Title: "Init containers state",
+ Units: "containers",
+ Fam: "pod containers",
+ Ctx: "k8s_state.pod_init_containers_state",
+ Type: module.Stacked,
+ Priority: prioPodInitContainersState,
+ Dims: module.Dims{
+ {ID: "pod_%s_init_containers_state_running", Name: "running"},
+ {ID: "pod_%s_init_containers_state_waiting", Name: "waiting"},
+ {ID: "pod_%s_init_containers_state_terminated", Name: "terminated"},
+ },
+ }
+)
+
+func (ks *KubeState) newPodCharts(ps *podState) *module.Charts {
+ charts := podChartsTmpl.Copy()
+ for _, c := range *charts {
+ c.ID = fmt.Sprintf(c.ID, replaceDots(ps.id()))
+ c.Labels = ks.newPodChartLabels(ps)
+ for _, d := range c.Dims {
+ d.ID = fmt.Sprintf(d.ID, ps.id())
+ }
+ }
+ return charts
+}
+
+func (ks *KubeState) newPodChartLabels(ps *podState) []module.Label {
+ labels := []module.Label{
+ {Key: labelKeyNamespace, Value: ps.namespace, Source: module.LabelSourceK8s},
+ {Key: labelKeyPodName, Value: ps.name, Source: module.LabelSourceK8s},
+ {Key: labelKeyNodeName, Value: ps.nodeName, Source: module.LabelSourceK8s},
+ {Key: labelKeyQoSClass, Value: ps.qosClass, Source: module.LabelSourceK8s},
+ {Key: labelKeyControllerKind, Value: ps.controllerKind, Source: module.LabelSourceK8s},
+ {Key: labelKeyControllerName, Value: ps.controllerName, Source: module.LabelSourceK8s},
+ {Key: labelKeyClusterID, Value: ks.kubeClusterID, Source: module.LabelSourceK8s},
+ {Key: labelKeyClusterName, Value: ks.kubeClusterName, Source: module.LabelSourceK8s},
+ }
+ return labels
+}
+
+func (ks *KubeState) addPodCharts(ps *podState) {
+ charts := ks.newPodCharts(ps)
+ if err := ks.Charts().Add(*charts...); err != nil {
+ ks.Warning(err)
+ }
+}
+
+func (ks *KubeState) updatePodChartsNodeLabel(ps *podState) {
+ prefix := fmt.Sprintf("pod_%s", replaceDots(ps.id()))
+ for _, c := range *ks.Charts() {
+ if strings.HasPrefix(c.ID, prefix) {
+ updateNodeLabel(c, ps.nodeName)
+ c.MarkNotCreated()
+ }
+ }
+}
+
+func updateNodeLabel(c *module.Chart, nodeName string) {
+ for i, l := range c.Labels {
+ if l.Key == labelKeyNodeName {
+ c.Labels[i].Value = nodeName
+ break
+ }
+ }
+}
+
+func (ks *KubeState) removePodCharts(ps *podState) {
+ prefix := fmt.Sprintf("pod_%s", replaceDots(ps.id()))
+ for _, c := range *ks.Charts() {
+ if strings.HasPrefix(c.ID, prefix) {
+ c.MarkRemove()
+ c.MarkNotCreated()
+ }
+ }
+}
+
+var (
+ containerReadinessStateChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s_container_%s.readiness_state",
+ Title: "Readiness state",
+ Units: "state",
+ Fam: "container readiness",
+ Ctx: "k8s_state.pod_container_readiness_state",
+ Priority: prioPodContainerReadinessState,
+ Dims: module.Dims{
+ {ID: "pod_%s_container_%s_readiness", Name: "ready"},
+ },
+ }
+ containerRestartsChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s_container_%s.restarts",
+ Title: "Restarts",
+ Units: "restarts",
+ Fam: "container restarts",
+ Ctx: "k8s_state.pod_container_restarts",
+ Priority: prioPodContainerRestarts,
+ Dims: module.Dims{
+ {ID: "pod_%s_container_%s_restarts", Name: "restarts"},
+ },
+ }
+ containersStateChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s_container_%s.state",
+ Title: "Container state",
+ Units: "state",
+ Fam: "container state",
+ Ctx: "k8s_state.pod_container_state",
+ Priority: prioPodContainerState,
+ Dims: module.Dims{
+ {ID: "pod_%s_container_%s_state_running", Name: "running"},
+ {ID: "pod_%s_container_%s_state_waiting", Name: "waiting"},
+ {ID: "pod_%s_container_%s_state_terminated", Name: "terminated"},
+ },
+ }
+ containersStateWaitingChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s_container_%s.state_waiting_reason",
+ Title: "Container waiting state reason",
+ Units: "state",
+ Fam: "container waiting reason",
+ Ctx: "k8s_state.pod_container_waiting_state_reason",
+ Priority: prioPodContainerWaitingStateReason,
+ }
+ containersStateTerminatedChartTmpl = module.Chart{
+ IDSep: true,
+ ID: "pod_%s_container_%s.state_terminated_reason",
+ Title: "Container terminated state reason",
+ Units: "state",
+ Fam: "container terminated reason",
+ Ctx: "k8s_state.pod_container_terminated_state_reason",
+ Priority: prioPodContainerTerminatedStateReason,
+ }
+)
+
+func (ks *KubeState) newContainerCharts(ps *podState, cs *containerState) *module.Charts {
+ charts := containerChartsTmpl.Copy()
+ for _, c := range *charts {
+ c.ID = fmt.Sprintf(c.ID, replaceDots(ps.id()), cs.name)
+ c.Labels = ks.newContainerChartLabels(ps, cs)
+ for _, d := range c.Dims {
+ d.ID = fmt.Sprintf(d.ID, ps.id(), cs.name)
+ }
+ }
+ return charts
+}
+
+func (ks *KubeState) newContainerChartLabels(ps *podState, cs *containerState) []module.Label {
+ labels := ks.newPodChartLabels(ps)
+ labels = append(
+ labels, module.Label{Key: labelKeyContainerName, Value: cs.name, Source: module.LabelSourceK8s},
+ )
+ return labels
+}
+
+func (ks *KubeState) addContainerCharts(ps *podState, cs *containerState) {
+ charts := ks.newContainerCharts(ps, cs)
+ if err := ks.Charts().Add(*charts...); err != nil {
+ ks.Warning(err)
+ }
+}
+
+func (ks *KubeState) addContainerWaitingStateReasonToChart(ps *podState, cs *containerState, reason string) {
+ id := fmt.Sprintf(containersStateWaitingChartTmpl.ID, replaceDots(ps.id()), cs.name)
+ c := ks.Charts().Get(id)
+ if c == nil {
+ ks.Warningf("chart '%s' does not exist", id)
+ return
+ }
+ dim := &module.Dim{
+ ID: fmt.Sprintf("pod_%s_container_%s_state_waiting_reason_%s", ps.id(), cs.name, reason),
+ Name: reason,
+ }
+ if err := c.AddDim(dim); err != nil {
+ ks.Warning(err)
+ return
+ }
+ c.MarkNotCreated()
+}
+
+func (ks *KubeState) addContainerTerminatedStateReasonToChart(ps *podState, cs *containerState, reason string) {
+ id := fmt.Sprintf(containersStateTerminatedChartTmpl.ID, replaceDots(ps.id()), cs.name)
+ c := ks.Charts().Get(id)
+ if c == nil {
+ ks.Warningf("chart '%s' does not exist", id)
+ return
+ }
+ dim := &module.Dim{
+ ID: fmt.Sprintf("pod_%s_container_%s_state_terminated_reason_%s", ps.id(), cs.name, reason),
+ Name: reason,
+ }
+ if err := c.AddDim(dim); err != nil {
+ ks.Warning(err)
+ return
+ }
+ c.MarkNotCreated()
+}
+
+var discoveryStatusChart = module.Chart{
+ ID: "discovery_discoverers_state",
+ Title: "Running discoverers state",
+ Units: "state",
+ Fam: "discovery",
+ Ctx: "k8s_state.discovery_discoverers_state",
+ Priority: prioDiscoveryDiscovererState,
+ Opts: module.Opts{Hidden: true},
+ Dims: module.Dims{
+ {ID: "discovery_node_discoverer_state", Name: "node"},
+ {ID: "discovery_pod_discoverer_state", Name: "pod"},
+ },
+}
+
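+// Chart IDs here embed a '.' (IDSep: true), so dots inside node/pod
+// identifiers are replaced with dashes to avoid colliding with that separator.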
+var reDots = regexp.MustCompile(`\.`)
+
+func replaceDots(v string) string {
+ return reDots.ReplaceAllString(v, "-")
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/client.go b/src/go/collectors/go.d.plugin/modules/k8s_state/client.go
new file mode 100644
index 000000000..315e823fe
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/client.go
@@ -0,0 +1,64 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "errors"
+ "os"
+ "path/filepath"
+
+ "k8s.io/client-go/kubernetes"
+ "k8s.io/client-go/rest"
+ "k8s.io/client-go/tools/clientcmd"
+
+ _ "k8s.io/client-go/plugin/pkg/client/auth/gcp"
+
+ "github.com/mattn/go-isatty"
+)
+
+const (
+ envKubeServiceHost = "KUBERNETES_SERVICE_HOST"
+ envKubeServicePort = "KUBERNETES_SERVICE_PORT"
+)
+
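+// newKubeClient returns an in-cluster client when the Kubernetes service
+// environment variables are set; when running interactively (stdout is a
+// terminal) it falls back to the kubeconfig at $HOME/.kube/config
+// (or %USERPROFILE% on Windows).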
+func newKubeClient() (kubernetes.Interface, error) {
+ if os.Getenv(envKubeServiceHost) != "" && os.Getenv(envKubeServicePort) != "" {
+ return newKubeClientInCluster()
+ }
+ if isatty.IsTerminal(os.Stdout.Fd()) {
+ return newKubeClientOutOfCluster()
+ }
+ return nil, errors.New("can not create Kubernetes client: not inside a cluster")
+}
+
+func newKubeClientInCluster() (*kubernetes.Clientset, error) {
+ config, err := rest.InClusterConfig()
+ if err != nil {
+ return nil, err
+ }
+ config.UserAgent = "Netdata/kube-state"
+ return kubernetes.NewForConfig(config)
+}
+
+func newKubeClientOutOfCluster() (*kubernetes.Clientset, error) {
+ home := homeDir()
+ if home == "" {
+ return nil, errors.New("couldn't find home directory")
+ }
+
+ configPath := filepath.Join(home, ".kube", "config")
+ config, err := clientcmd.BuildConfigFromFlags("", configPath)
+ if err != nil {
+ return nil, err
+ }
+
+ config.UserAgent = "Netdata/kube-state"
+ return kubernetes.NewForConfig(config)
+}
+
+func homeDir() string {
+ if h := os.Getenv("HOME"); h != "" {
+ return h
+ }
+ return os.Getenv("USERPROFILE") // windows
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/cluster_meta.go b/src/go/collectors/go.d.plugin/modules/k8s_state/cluster_meta.go
new file mode 100644
index 000000000..e7eb809cc
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/cluster_meta.go
@@ -0,0 +1,84 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "fmt"
+ "io"
+ "net/http"
+ "time"
+
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+)
+
+func (ks *KubeState) getKubeClusterID() string {
+ ns, err := ks.client.CoreV1().Namespaces().Get(ks.ctx, "kube-system", metav1.GetOptions{})
+ if err != nil {
+ ks.Warningf("error on getting 'kube-system' namespace UID: %v", err)
+ return ""
+ }
+ return string(ns.UID)
+}
+
+func (ks *KubeState) getKubeClusterName() string {
+ client := http.Client{Timeout: time.Second}
+ n, err := getGKEKubeClusterName(client)
+ if err != nil {
+ ks.Debugf("error on getting GKE cluster name: %v", err)
+ }
+ return n
+}
+
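+// getGKEKubeClusterName rebuilds the GKE cluster name as
+// gke_<project-id>_<location>_<cluster-name> from the GCE metadata server,
+// so it only yields a result when the agent runs on GKE.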
+func getGKEKubeClusterName(client http.Client) (string, error) {
+ id, err := doMetaGKEHTTPReq(client, "http://metadata/computeMetadata/v1/project/project-id")
+ if err != nil {
+ return "", err
+ }
+ loc, err := doMetaGKEHTTPReq(client, "http://metadata/computeMetadata/v1/instance/attributes/cluster-location")
+ if err != nil {
+ return "", err
+ }
+ name, err := doMetaGKEHTTPReq(client, "http://metadata/computeMetadata/v1/instance/attributes/cluster-name")
+ if err != nil {
+ return "", err
+ }
+
+ return fmt.Sprintf("gke_%s_%s_%s", id, loc, name), nil
+}
+
+func doMetaGKEHTTPReq(client http.Client, url string) (string, error) {
+ req, err := http.NewRequest(http.MethodGet, url, nil)
+ if err != nil {
+ return "", err
+ }
+
+ req.Header.Add("Metadata-Flavor", "Google")
+ resp, err := client.Do(req)
+ if err != nil {
+ return "", err
+ }
+ defer closeHTTPRespBody(resp)
+
+ if resp.StatusCode != http.StatusOK {
+ return "", fmt.Errorf("'%s' returned HTTP status code %d", url, resp.StatusCode)
+ }
+
+ bs, err := io.ReadAll(resp.Body)
+ if err != nil {
+ return "", err
+ }
+
+ s := string(bs)
+ if s == "" {
+ return "", fmt.Errorf("an empty response from '%s'", url)
+ }
+
+ return s, nil
+}
+
+func closeHTTPRespBody(resp *http.Response) {
+ if resp != nil && resp.Body != nil {
+ _, _ = io.Copy(io.Discard, resp.Body)
+ _ = resp.Body.Close()
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/collect.go b/src/go/collectors/go.d.plugin/modules/k8s_state/collect.go
new file mode 100644
index 000000000..033d330ce
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/collect.go
@@ -0,0 +1,264 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "errors"
+ "fmt"
+ "strings"
+ "time"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/module"
+
+ corev1 "k8s.io/api/core/v1"
+)
+
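+// precision scales ratio metrics (utilization, readiness) so fractional values
+// survive integer collection; the matching chart dimensions divide by it (Div: precision).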
+const precision = 1000
+
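+// collect lazily starts the machinery on the first call: it launches the
+// state-update loop and the Kubernetes discoverer, and resolves the cluster
+// ID/name. Until the discoverer is ready and initDelay has passed, only the
+// discoverer-state metrics are reported.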
+func (ks *KubeState) collect() (map[string]int64, error) {
+ if ks.discoverer == nil {
+ return nil, errors.New("nil discoverer")
+ }
+
+ ks.once.Do(func() {
+ ks.startTime = time.Now()
+ in := make(chan resource)
+
+ ks.wg.Add(1)
+ go func() { defer ks.wg.Done(); ks.runUpdateState(in) }()
+
+ ks.wg.Add(1)
+ go func() { defer ks.wg.Done(); ks.discoverer.run(ks.ctx, in) }()
+
+ ks.kubeClusterID = ks.getKubeClusterID()
+ ks.kubeClusterName = ks.getKubeClusterName()
+ if chart := ks.Charts().Get(discoveryStatusChart.ID); chart != nil {
+ chart.Labels = []module.Label{
+ {Key: labelKeyClusterID, Value: ks.kubeClusterID, Source: module.LabelSourceK8s},
+ {Key: labelKeyClusterName, Value: ks.kubeClusterName, Source: module.LabelSourceK8s},
+ }
+ }
+ })
+
+ mx := map[string]int64{
+ "discovery_node_discoverer_state": 1,
+ "discovery_pod_discoverer_state": 1,
+ }
+
+ if !ks.discoverer.ready() || time.Since(ks.startTime) < ks.initDelay {
+ return mx, nil
+ }
+
+ ks.state.Lock()
+ defer ks.state.Unlock()
+
+ ks.collectKubeState(mx)
+
+ return mx, nil
+}
+
+func (ks *KubeState) collectKubeState(mx map[string]int64) {
+ for _, ns := range ks.state.nodes {
+ ns.resetStats()
+ }
+ ks.collectPodsState(mx)
+ ks.collectNodesState(mx)
+}
+
+func (ks *KubeState) collectPodsState(mx map[string]int64) {
+ now := time.Now()
+ for _, ps := range ks.state.pods {
+ if ps.deleted {
+ delete(ks.state.pods, podSource(ps.namespace, ps.name))
+ ks.removePodCharts(ps)
+ continue
+ }
+ if ps.new {
+ ps.new = false
+ ks.addPodCharts(ps)
+ ps.unscheduled = ps.nodeName == ""
+ } else if ps.unscheduled && ps.nodeName != "" {
+ ps.unscheduled = false
+ ks.updatePodChartsNodeLabel(ps)
+ }
+
+ ns := ks.state.nodes[nodeSource(ps.nodeName)]
+ if ns != nil {
+ ns.stats.pods++
+ ns.stats.reqCPU += ps.reqCPU
+ ns.stats.limitCPU += ps.limitCPU
+ ns.stats.reqMem += ps.reqMem
+ ns.stats.limitMem += ps.limitMem
+ ns.stats.podsCondPodReady += condStatusToInt(ps.condPodReady)
+ ns.stats.podsCondPodScheduled += condStatusToInt(ps.condPodScheduled)
+ ns.stats.podsCondPodInitialized += condStatusToInt(ps.condPodInitialized)
+ ns.stats.podsCondContainersReady += condStatusToInt(ps.condContainersReady)
+ ns.stats.podsReadinessReady += boolToInt(ps.condPodReady == corev1.ConditionTrue)
+ ns.stats.podsReadinessUnready += boolToInt(ps.condPodReady != corev1.ConditionTrue)
+ ns.stats.podsPhasePending += boolToInt(ps.phase == corev1.PodPending)
+ ns.stats.podsPhaseRunning += boolToInt(ps.phase == corev1.PodRunning)
+ ns.stats.podsPhaseSucceeded += boolToInt(ps.phase == corev1.PodSucceeded)
+ ns.stats.podsPhaseFailed += boolToInt(ps.phase == corev1.PodFailed)
+ for _, cs := range ps.initContainers {
+ ns.stats.initContainers++
+ ns.stats.initContStateRunning += boolToInt(cs.stateRunning)
+ ns.stats.initContStateWaiting += boolToInt(cs.stateWaiting)
+ ns.stats.initContStateTerminated += boolToInt(cs.stateTerminated)
+ }
+ for _, cs := range ps.containers {
+ ns.stats.containers++
+ ns.stats.contStateRunning += boolToInt(cs.stateRunning)
+ ns.stats.contStateWaiting += boolToInt(cs.stateWaiting)
+ ns.stats.contStateTerminated += boolToInt(cs.stateTerminated)
+ }
+ }
+
+ px := fmt.Sprintf("pod_%s_", ps.id())
+
+ mx[px+"cond_podready"] = condStatusToInt(ps.condPodReady)
+ mx[px+"cond_podscheduled"] = condStatusToInt(ps.condPodScheduled)
+ mx[px+"cond_podinitialized"] = condStatusToInt(ps.condPodInitialized)
+ mx[px+"cond_containersready"] = condStatusToInt(ps.condContainersReady)
+ mx[px+"phase_running"] = boolToInt(ps.phase == corev1.PodRunning)
+ mx[px+"phase_failed"] = boolToInt(ps.phase == corev1.PodFailed)
+ mx[px+"phase_succeeded"] = boolToInt(ps.phase == corev1.PodSucceeded)
+ mx[px+"phase_pending"] = boolToInt(ps.phase == corev1.PodPending)
+ mx[px+"age"] = int64(now.Sub(ps.creationTime).Seconds())
+ mx[px+"cpu_requests_used"] = ps.reqCPU
+ mx[px+"cpu_limits_used"] = ps.limitCPU
+ mx[px+"mem_requests_used"] = ps.reqMem
+ mx[px+"mem_limits_used"] = ps.limitMem
+
+ mx[px+"init_containers"] = int64(len(ps.initContainers))
+ mx[px+"containers"] = int64(len(ps.containers))
+
+ mx[px+"init_containers_state_running"] = 0
+ mx[px+"init_containers_state_waiting"] = 0
+ mx[px+"init_containers_state_terminated"] = 0
+ for _, cs := range ps.initContainers {
+ mx[px+"init_containers_state_running"] += boolToInt(cs.stateRunning)
+ mx[px+"init_containers_state_waiting"] += boolToInt(cs.stateWaiting)
+ mx[px+"init_containers_state_terminated"] += boolToInt(cs.stateTerminated)
+ }
+ mx[px+"containers_state_running"] = 0
+ mx[px+"containers_state_waiting"] = 0
+ mx[px+"containers_state_terminated"] = 0
+ for _, cs := range ps.containers {
+ if cs.new {
+ cs.new = false
+ ks.addContainerCharts(ps, cs)
+ }
+ mx[px+"containers_state_running"] += boolToInt(cs.stateRunning)
+ mx[px+"containers_state_waiting"] += boolToInt(cs.stateWaiting)
+ mx[px+"containers_state_terminated"] += boolToInt(cs.stateTerminated)
+
+ ppx := fmt.Sprintf("%scontainer_%s_", px, cs.name)
+ mx[ppx+"state_running"] = boolToInt(cs.stateRunning)
+ mx[ppx+"state_waiting"] = boolToInt(cs.stateWaiting)
+ mx[ppx+"state_terminated"] = boolToInt(cs.stateTerminated)
+ mx[ppx+"readiness"] = boolToInt(cs.ready)
+ mx[ppx+"restarts"] = cs.restarts
+ for _, r := range cs.stateWaitingReasons {
+ if r.new {
+ r.new = false
+ ks.addContainerWaitingStateReasonToChart(ps, cs, r.reason)
+ }
+ mx[ppx+"state_waiting_reason_"+r.reason] = boolToInt(r.active)
+ }
+ for _, r := range cs.stateTerminatedReasons {
+ if r.new {
+ r.new = false
+ ks.addContainerTerminatedStateReasonToChart(ps, cs, r.reason)
+ }
+ mx[ppx+"state_terminated_reason_"+r.reason] = boolToInt(r.active)
+ }
+ }
+ }
+}
+
+func (ks *KubeState) collectNodesState(mx map[string]int64) {
+ now := time.Now()
+ for _, ns := range ks.state.nodes {
+ if ns.deleted {
+ delete(ks.state.nodes, nodeSource(ns.name))
+ ks.removeNodeCharts(ns)
+ continue
+ }
+ if ns.new {
+ ns.new = false
+ ks.addNodeCharts(ns)
+ }
+
+ px := fmt.Sprintf("node_%s_", ns.id())
+
+ for typ, cond := range ns.conditions {
+ if cond.new {
+ cond.new = false
+ ks.addNodeConditionToCharts(ns, typ)
+ }
+ mx[px+"cond_"+strings.ToLower(typ)] = condStatusToInt(cond.status)
+ }
+
+ mx[px+"age"] = int64(now.Sub(ns.creationTime).Seconds())
+ mx[px+"alloc_pods_util"] = calcPercentage(ns.stats.pods, ns.allocatablePods)
+ mx[px+"pods_readiness_ready"] = ns.stats.podsReadinessReady
+ mx[px+"pods_readiness_unready"] = ns.stats.podsReadinessUnready
+ mx[px+"pods_readiness"] = calcPercentage(ns.stats.podsReadinessReady, ns.stats.pods)
+ mx[px+"pods_phase_running"] = ns.stats.podsPhaseRunning
+ mx[px+"pods_phase_failed"] = ns.stats.podsPhaseFailed
+ mx[px+"pods_phase_succeeded"] = ns.stats.podsPhaseSucceeded
+ mx[px+"pods_phase_pending"] = ns.stats.podsPhasePending
+ mx[px+"pods_cond_podready"] = ns.stats.podsCondPodReady
+ mx[px+"pods_cond_podscheduled"] = ns.stats.podsCondPodScheduled
+ mx[px+"pods_cond_podinitialized"] = ns.stats.podsCondPodInitialized
+ mx[px+"pods_cond_containersready"] = ns.stats.podsCondContainersReady
+ mx[px+"pods_cond_containersready"] = ns.stats.podsCondContainersReady
+ mx[px+"schedulability_schedulable"] = boolToInt(!ns.unSchedulable)
+ mx[px+"schedulability_unschedulable"] = boolToInt(ns.unSchedulable)
+ mx[px+"alloc_pods_available"] = ns.allocatablePods - ns.stats.pods
+ mx[px+"alloc_pods_allocated"] = ns.stats.pods
+ mx[px+"alloc_cpu_requests_util"] = calcPercentage(ns.stats.reqCPU, ns.allocatableCPU)
+ mx[px+"alloc_cpu_limits_util"] = calcPercentage(ns.stats.limitCPU, ns.allocatableCPU)
+ mx[px+"alloc_mem_requests_util"] = calcPercentage(ns.stats.reqMem, ns.allocatableMem)
+ mx[px+"alloc_mem_limits_util"] = calcPercentage(ns.stats.limitMem, ns.allocatableMem)
+ mx[px+"alloc_cpu_requests_used"] = ns.stats.reqCPU
+ mx[px+"alloc_cpu_limits_used"] = ns.stats.limitCPU
+ mx[px+"alloc_mem_requests_used"] = ns.stats.reqMem
+ mx[px+"alloc_mem_limits_used"] = ns.stats.limitMem
+ mx[px+"init_containers"] = ns.stats.initContainers
+ mx[px+"containers"] = ns.stats.containers
+ mx[px+"containers_state_running"] = ns.stats.contStateRunning
+ mx[px+"containers_state_waiting"] = ns.stats.contStateWaiting
+ mx[px+"containers_state_terminated"] = ns.stats.contStateTerminated
+ mx[px+"init_containers_state_running"] = ns.stats.initContStateRunning
+ mx[px+"init_containers_state_waiting"] = ns.stats.initContStateWaiting
+ mx[px+"init_containers_state_terminated"] = ns.stats.initContStateTerminated
+ }
+}
+
+func boolToInt(v bool) int64 {
+ if v {
+ return 1
+ }
+ return 0
+}
+
+func condStatusToInt(cs corev1.ConditionStatus) int64 {
+ switch cs {
+ case corev1.ConditionFalse:
+ return 0
+ case corev1.ConditionTrue:
+ return 1
+ case corev1.ConditionUnknown:
+ return 0
+ default:
+ return 0
+ }
+}
+
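+// calcPercentage returns value/total as a percentage multiplied by precision
+// (0 when total is 0), matching the Div set on the utilization dimensions.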
+func calcPercentage(value, total int64) int64 {
+ if total == 0 {
+ return 0
+ }
+ return int64(float64(value) / float64(total) * 100 * precision)
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/config_schema.json b/src/go/collectors/go.d.plugin/modules/k8s_state/config_schema.json
new file mode 100644
index 000000000..ae66d7cb5
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/config_schema.json
@@ -0,0 +1,25 @@
+{
+ "jsonSchema": {
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "Kubernetes Cluster State collector configuration.",
+ "type": "object",
+ "properties": {
+ "update_every": {
+ "title": "Update every",
+ "description": "Data collection interval, measured in seconds.",
+ "type": "integer",
+ "minimum": 1,
+ "default": 1
+ }
+ },
+ "additionalProperties": false,
+ "patternProperties": {
+ "^name$": {}
+ }
+ },
+ "uiSchema": {
+ "uiOptions": {
+ "fullPage": true
+ }
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/discover_kubernetes.go b/src/go/collectors/go.d.plugin/modules/k8s_state/discover_kubernetes.go
new file mode 100644
index 000000000..a4aeee974
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/discover_kubernetes.go
@@ -0,0 +1,160 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "context"
+ "os"
+ "sync"
+ "time"
+
+ "github.com/netdata/netdata/go/go.d.plugin/logger"
+
+ corev1 "k8s.io/api/core/v1"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/runtime"
+ "k8s.io/apimachinery/pkg/watch"
+ "k8s.io/client-go/kubernetes"
+ "k8s.io/client-go/tools/cache"
+ "k8s.io/client-go/util/workqueue"
+)
+
+func newKubeDiscovery(client kubernetes.Interface, l *logger.Logger) *kubeDiscovery {
+ return &kubeDiscovery{
+ client: client,
+ Logger: l,
+ readyCh: make(chan struct{}),
+ stopCh: make(chan struct{}),
+ }
+}
+
+type kubeDiscovery struct {
+ *logger.Logger
+ client kubernetes.Interface
+ discoverers []discoverer
+ readyCh chan struct{}
+ stopCh chan struct{}
+}
+
+func (d *kubeDiscovery) run(ctx context.Context, in chan<- resource) {
+ d.Info("kube_discoverer is started")
+ defer func() { close(d.stopCh); d.Info("kube_discoverer is stopped") }()
+
+ d.discoverers = d.setupDiscoverers(ctx)
+
+ var wg sync.WaitGroup
+ updates := make(chan resource)
+
+ for _, dd := range d.discoverers {
+ wg.Add(1)
+ go func(dd discoverer) { defer wg.Done(); dd.run(ctx, updates) }(dd)
+ }
+
+ wg.Add(1)
+ go func() { defer wg.Done(); d.runDiscover(ctx, updates, in) }()
+
+ close(d.readyCh)
+ wg.Wait()
+ <-ctx.Done()
+}
+
+func (d *kubeDiscovery) ready() bool {
+ if !isChanClosed(d.readyCh) {
+ return false
+ }
+ for _, dd := range d.discoverers {
+ if !dd.ready() {
+ return false
+ }
+ }
+ return true
+}
+
+func (d *kubeDiscovery) stopped() bool {
+ if !isChanClosed(d.stopCh) {
+ return false
+ }
+ for _, dd := range d.discoverers {
+ if !dd.stopped() {
+ return false
+ }
+ }
+ return true
+}
+
+func (d *kubeDiscovery) runDiscover(ctx context.Context, updates chan resource, in chan<- resource) {
+ for {
+ select {
+ case <-ctx.Done():
+ return
+ case r := <-updates:
+ select {
+ case <-ctx.Done():
+ return
+ case in <- r:
+ }
+ }
+ }
+}
+
+const resyncPeriod = 10 * time.Minute
+
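+// MY_NODE_NAME is typically injected via the downward API; when set, pod
+// discovery below is restricted to pods scheduled on that node via a
+// spec.nodeName field selector.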
+var (
+ myNodeName = os.Getenv("MY_NODE_NAME")
+)
+
+func (d *kubeDiscovery) setupDiscoverers(ctx context.Context) []discoverer {
+ node := d.client.CoreV1().Nodes()
+ nodeWatcher := &cache.ListWatch{
+ ListFunc: func(options metav1.ListOptions) (runtime.Object, error) { return node.List(ctx, options) },
+ WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) { return node.Watch(ctx, options) },
+ }
+
+ pod := d.client.CoreV1().Pods(corev1.NamespaceAll)
+ podWatcher := &cache.ListWatch{
+ ListFunc: func(options metav1.ListOptions) (runtime.Object, error) {
+ if myNodeName != "" {
+ options.FieldSelector = "spec.nodeName=" + myNodeName
+ }
+ return pod.List(ctx, options)
+ },
+ WatchFunc: func(options metav1.ListOptions) (watch.Interface, error) {
+ if myNodeName != "" {
+ options.FieldSelector = "spec.nodeName=" + myNodeName
+ }
+ return pod.Watch(ctx, options)
+ },
+ }
+
+ return []discoverer{
+ newNodeDiscoverer(cache.NewSharedInformer(nodeWatcher, &corev1.Node{}, resyncPeriod), d.Logger),
+ newPodDiscoverer(cache.NewSharedInformer(podWatcher, &corev1.Pod{}, resyncPeriod), d.Logger),
+ }
+}
+
+func enqueue(queue *workqueue.Type, obj interface{}) {
+ key, err := cache.DeletionHandlingMetaNamespaceKeyFunc(obj)
+ if err != nil {
+ return
+ }
+ queue.Add(key)
+}
+
+func send(ctx context.Context, in chan<- resource, r resource) {
+ if r == nil {
+ return
+ }
+ select {
+ case <-ctx.Done():
+ case in <- r:
+ }
+}
+
+func isChanClosed(ch chan struct{}) bool {
+ select {
+ case <-ch:
+ return true
+ default:
+ return false
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/discover_node.go b/src/go/collectors/go.d.plugin/modules/k8s_state/discover_node.go
new file mode 100644
index 000000000..29761b204
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/discover_node.go
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "context"
+
+ "github.com/netdata/netdata/go/go.d.plugin/logger"
+
+ "k8s.io/client-go/tools/cache"
+ "k8s.io/client-go/util/workqueue"
+)
+
+func newNodeDiscoverer(si cache.SharedInformer, l *logger.Logger) *nodeDiscoverer {
+ if si == nil {
+ panic("nil node shared informer")
+ }
+
+ queue := workqueue.NewWithConfig(workqueue.QueueConfig{Name: "node"})
+ _, _ = si.AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: func(obj interface{}) { enqueue(queue, obj) },
+ UpdateFunc: func(_, obj interface{}) { enqueue(queue, obj) },
+ DeleteFunc: func(obj interface{}) { enqueue(queue, obj) },
+ })
+
+ return &nodeDiscoverer{
+ Logger: l,
+ informer: si,
+ queue: queue,
+ readyCh: make(chan struct{}),
+ stopCh: make(chan struct{}),
+ }
+}
+
+type nodeResource struct {
+ src string
+ val interface{}
+}
+
+func (r nodeResource) source() string { return r.src }
+func (r nodeResource) kind() kubeResourceKind { return kubeResourceNode }
+func (r nodeResource) value() interface{} { return r.val }
+
+type nodeDiscoverer struct {
+ *logger.Logger
+ informer cache.SharedInformer
+ queue *workqueue.Type
+ readyCh chan struct{}
+ stopCh chan struct{}
+}
+
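+// run starts the shared informer, waits for its cache to sync, then drains the
+// workqueue: each key is looked up in the informer store and forwarded as a
+// nodeResource (val stays nil when the object no longer exists in the store).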
+func (d *nodeDiscoverer) run(ctx context.Context, in chan<- resource) {
+ d.Info("node_discoverer is started")
+ defer func() { close(d.stopCh); d.Info("node_discoverer is stopped") }()
+
+ defer d.queue.ShutDown()
+
+ go d.informer.Run(ctx.Done())
+
+ if !cache.WaitForCacheSync(ctx.Done(), d.informer.HasSynced) {
+ return
+ }
+
+ go d.runDiscover(ctx, in)
+ close(d.readyCh)
+
+ <-ctx.Done()
+}
+
+func (d *nodeDiscoverer) ready() bool { return isChanClosed(d.readyCh) }
+func (d *nodeDiscoverer) stopped() bool { return isChanClosed(d.stopCh) }
+
+func (d *nodeDiscoverer) runDiscover(ctx context.Context, in chan<- resource) {
+ for {
+ item, shutdown := d.queue.Get()
+ if shutdown {
+ return
+ }
+
+ func() {
+ defer d.queue.Done(item)
+
+ key := item.(string)
+ _, name, err := cache.SplitMetaNamespaceKey(key)
+ if err != nil {
+ return
+ }
+
+ item, exists, err := d.informer.GetStore().GetByKey(key)
+ if err != nil {
+ return
+ }
+
+ r := &nodeResource{src: nodeSource(name)}
+ if exists {
+ r.val = item
+ }
+ send(ctx, in, r)
+ }()
+ }
+}
+
+func nodeSource(name string) string {
+ return "k8s/node/" + name
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/discover_pod.go b/src/go/collectors/go.d.plugin/modules/k8s_state/discover_pod.go
new file mode 100644
index 000000000..2def7ad50
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/discover_pod.go
@@ -0,0 +1,105 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "context"
+
+ "github.com/netdata/netdata/go/go.d.plugin/logger"
+
+ "k8s.io/client-go/tools/cache"
+ "k8s.io/client-go/util/workqueue"
+)
+
+func newPodDiscoverer(si cache.SharedInformer, l *logger.Logger) *podDiscoverer {
+ if si == nil {
+ panic("nil pod shared informer")
+ }
+
+ queue := workqueue.NewWithConfig(workqueue.QueueConfig{Name: "pod"})
+ _, _ = si.AddEventHandler(cache.ResourceEventHandlerFuncs{
+ AddFunc: func(obj interface{}) { enqueue(queue, obj) },
+ UpdateFunc: func(_, obj interface{}) { enqueue(queue, obj) },
+ DeleteFunc: func(obj interface{}) { enqueue(queue, obj) },
+ })
+
+ return &podDiscoverer{
+ Logger: l,
+ informer: si,
+ queue: queue,
+ readyCh: make(chan struct{}),
+ stopCh: make(chan struct{}),
+ }
+}
+
+type podResource struct {
+ src string
+ val interface{}
+}
+
+func (r podResource) source() string { return r.src }
+func (r podResource) kind() kubeResourceKind { return kubeResourcePod }
+func (r podResource) value() interface{} { return r.val }
+
+type podDiscoverer struct {
+ *logger.Logger
+ informer cache.SharedInformer
+ queue *workqueue.Type
+ readyCh chan struct{}
+ stopCh chan struct{}
+}
+
+func (d *podDiscoverer) run(ctx context.Context, in chan<- resource) {
+ d.Info("pod_discoverer is started")
+ defer func() { close(d.stopCh); d.Info("pod_discoverer is stopped") }()
+
+ defer d.queue.ShutDown()
+
+ go d.informer.Run(ctx.Done())
+
+ if !cache.WaitForCacheSync(ctx.Done(), d.informer.HasSynced) {
+ return
+ }
+
+ go d.runDiscover(ctx, in)
+ close(d.readyCh)
+
+ <-ctx.Done()
+}
+
+func (d *podDiscoverer) ready() bool { return isChanClosed(d.readyCh) }
+func (d *podDiscoverer) stopped() bool { return isChanClosed(d.stopCh) }
+
+func (d *podDiscoverer) runDiscover(ctx context.Context, in chan<- resource) {
+ for {
+ item, shutdown := d.queue.Get()
+ if shutdown {
+ return
+ }
+
+ func() {
+ defer d.queue.Done(item)
+
+ key := item.(string)
+ ns, name, err := cache.SplitMetaNamespaceKey(key)
+ if err != nil {
+ return
+ }
+
+ item, exists, err := d.informer.GetStore().GetByKey(key)
+ if err != nil {
+ return
+ }
+
+ r := &podResource{src: podSource(ns, name)}
+ if exists {
+ r.val = item
+ }
+ send(ctx, in, r)
+ }()
+ }
+}
+
+func podSource(namespace, name string) string {
+ return "k8s/pod/" + namespace + "/" + name
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/init.go b/src/go/collectors/go.d.plugin/modules/k8s_state/init.go
new file mode 100644
index 000000000..998131394
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/init.go
@@ -0,0 +1,15 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "k8s.io/client-go/kubernetes"
+)
+
+func (ks *KubeState) initClient() (kubernetes.Interface, error) {
+ return ks.newKubeClient()
+}
+
+func (ks *KubeState) initDiscoverer(client kubernetes.Interface) discoverer {
+ return newKubeDiscovery(client, ks.Logger)
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/integrations/kubernetes_cluster_state.md b/src/go/collectors/go.d.plugin/modules/k8s_state/integrations/kubernetes_cluster_state.md
new file mode 100644
index 000000000..88d81e257
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/integrations/kubernetes_cluster_state.md
@@ -0,0 +1,218 @@
+<!--startmeta
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/go/collectors/go.d.plugin/modules/k8s_state/README.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/go/collectors/go.d.plugin/modules/k8s_state/metadata.yaml"
+sidebar_label: "Kubernetes Cluster State"
+learn_status: "Published"
+learn_rel_path: "Collecting Metrics/Kubernetes"
+most_popular: True
+message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
+endmeta-->
+
+# Kubernetes Cluster State
+
+
+<img src="https://netdata.cloud/img/kubernetes.svg" width="150"/>
+
+
+Plugin: go.d.plugin
+Module: k8s_state
+
+<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" />
+
+## Overview
+
+This collector monitors Kubernetes Nodes, Pods and Containers.
+
+
+
+
+This collector is supported on all platforms.
+
+This collector only supports collecting metrics from a single instance of this integration.
+
+
+### Default Behavior
+
+#### Auto-Detection
+
+This integration doesn't support auto-detection.
+
+#### Limits
+
+The default configuration for this integration does not impose any limits on data collection.
+
+#### Performance Impact
+
+The default configuration for this integration is not expected to impose a significant performance impact on the system.
+
+
+## Metrics
+
+Metrics grouped by *scope*.
+
+The scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.
+
+
+
+### Per node
+
+These metrics refer to the Node.
+
+Labels:
+
+| Label | Description |
+|:-----------|:----------------|
+| k8s_cluster_id | Cluster ID. This is equal to the kube-system namespace UID. |
+| k8s_cluster_name | Cluster name. Cluster name discovery only works in GKE. |
+| k8s_node_name | Node name. |
+
+Metrics:
+
+| Metric | Dimensions | Unit |
+|:------|:----------|:----|
+| k8s_state.node_allocatable_cpu_requests_utilization | requests | % |
+| k8s_state.node_allocatable_cpu_requests_used | requests | millicpu |
+| k8s_state.node_allocatable_cpu_limits_utilization | limits | % |
+| k8s_state.node_allocatable_cpu_limits_used | limits | millicpu |
+| k8s_state.node_allocatable_mem_requests_utilization | requests | % |
+| k8s_state.node_allocatable_mem_requests_used | requests | bytes |
+| k8s_state.node_allocatable_mem_limits_utilization | limits | % |
+| k8s_state.node_allocatable_mem_limits_used | limits | bytes |
+| k8s_state.node_allocatable_pods_utilization | allocated | % |
+| k8s_state.node_allocatable_pods_usage | available, allocated | pods |
+| k8s_state.node_condition | a dimension per condition | status |
+| k8s_state.node_schedulability | schedulable, unschedulable | state |
+| k8s_state.node_pods_readiness | ready | % |
+| k8s_state.node_pods_readiness_state | ready, unready | pods |
+| k8s_state.node_pods_condition | pod_ready, pod_scheduled, pod_initialized, containers_ready | pods |
+| k8s_state.node_pods_phase | running, failed, succeeded, pending | pods |
+| k8s_state.node_containers | containers, init_containers | containers |
+| k8s_state.node_containers_state | running, waiting, terminated | containers |
+| k8s_state.node_init_containers_state | running, waiting, terminated | containers |
+| k8s_state.node_age | age | seconds |
+
+### Per pod
+
+These metrics refer to the Pod.
+
+Labels:
+
+| Label | Description |
+|:-----------|:----------------|
+| k8s_cluster_id | Cluster ID. This is equal to the kube-system namespace UID. |
+| k8s_cluster_name | Cluster name. Cluster name discovery only works in GKE. |
+| k8s_node_name | Node name. |
+| k8s_namespace | Namespace. |
+| k8s_controller_kind | Controller kind (ReplicaSet, DaemonSet, StatefulSet, Job, etc.). |
+| k8s_controller_name | Controller name. |
+| k8s_pod_name | Pod name. |
+| k8s_qos_class | Pod QOS class (burstable, guaranteed, besteffort). |
+
+Metrics:
+
+| Metric | Dimensions | Unit |
+|:------|:----------|:----|
+| k8s_state.pod_cpu_requests_used | requests | millicpu |
+| k8s_state.pod_cpu_limits_used | limits | millicpu |
+| k8s_state.pod_mem_requests_used | requests | bytes |
+| k8s_state.pod_mem_limits_used | limits | bytes |
+| k8s_state.pod_condition | pod_ready, pod_scheduled, pod_initialized, containers_ready | state |
+| k8s_state.pod_phase | running, failed, succeeded, pending | state |
+| k8s_state.pod_age | age | seconds |
+| k8s_state.pod_containers | containers, init_containers | containers |
+| k8s_state.pod_containers_state | running, waiting, terminated | containers |
+| k8s_state.pod_init_containers_state | running, waiting, terminated | containers |
+
+### Per container
+
+These metrics refer to the Pod container.
+
+Labels:
+
+| Label | Description |
+|:-----------|:----------------|
+| k8s_cluster_id | Cluster ID. This is equal to the kube-system namespace UID. |
+| k8s_cluster_name | Cluster name. Cluster name discovery only works in GKE. |
+| k8s_node_name | Node name. |
+| k8s_namespace | Namespace. |
+| k8s_controller_kind | Controller kind (ReplicaSet, DaemonSet, StatefulSet, Job, etc.). |
+| k8s_controller_name | Controller name. |
+| k8s_pod_name | Pod name. |
+| k8s_qos_class | Pod QOS class (burstable, guaranteed, besteffort). |
+| k8s_container_name | Container name. |
+
+Metrics:
+
+| Metric | Dimensions | Unit |
+|:------|:----------|:----|
+| k8s_state.pod_container_readiness_state | ready | state |
+| k8s_state.pod_container_restarts | restarts | restarts |
+| k8s_state.pod_container_state | running, waiting, terminated | state |
+| k8s_state.pod_container_waiting_state_reason | a dimension per reason | state |
+| k8s_state.pod_container_terminated_state_reason | a dimension per reason | state |
+
+
+
+## Alerts
+
+There are no alerts configured by default for this integration.
+
+
+## Setup
+
+### Prerequisites
+
+No action required.
+
+### Configuration
+
+#### File
+
+The configuration file name for this integration is `go.d/k8s_state.conf`.
+
+
+You can edit the configuration file using the `edit-config` script from the
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
+
+```bash
+cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
+sudo ./edit-config go.d/k8s_state.conf
+```
+
+#### Options
+
+There are no configuration options.
+
+#### Examples
+
+There are no configuration examples.
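+
+That said, the job still honors the shared go.d options, most notably `update_every` (see the `Config` struct in `kube_state.go` below). A minimal sketch of a job definition, with an illustrative job name and value only:
+
+```yaml
+jobs:
+  - name: k8s_state
+    update_every: 10
+```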
+
+
+
+## Troubleshooting
+
+### Debug Mode
+
+To troubleshoot issues with the `k8s_state` collector, run the `go.d.plugin` with the debug option enabled. The output
+should give you clues as to why the collector isn't working.
+
+- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on
+ your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.
+
+ ```bash
+ cd /usr/libexec/netdata/plugins.d/
+ ```
+
+- Switch to the `netdata` user.
+
+ ```bash
+ sudo -u netdata -s
+ ```
+
+- Run the `go.d.plugin` to debug the collector:
+
+ ```bash
+ ./go.d.plugin -d -m k8s_state
+ ```
+
+
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/kube_state.go b/src/go/collectors/go.d.plugin/modules/k8s_state/kube_state.go
new file mode 100644
index 000000000..95fd2d1ca
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/kube_state.go
@@ -0,0 +1,147 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "context"
+ _ "embed"
+ "errors"
+ "fmt"
+ "sync"
+ "time"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/module"
+
+ "k8s.io/client-go/kubernetes"
+)
+
+//go:embed "config_schema.json"
+var configSchema string
+
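+// The module registers itself with the go.d framework at package load time.
+// Note the Disabled default: the collector ships disabled and has to be enabled explicitly.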
+func init() {
+ module.Register("k8s_state", module.Creator{
+ JobConfigSchema: configSchema,
+ Defaults: module.Defaults{
+ Disabled: true,
+ },
+ Create: func() module.Module { return New() },
+ Config: func() any { return &Config{} },
+ })
+}
+
+func New() *KubeState {
+ return &KubeState{
+ initDelay: time.Second * 3,
+ newKubeClient: newKubeClient,
+ charts: baseCharts.Copy(),
+ once: &sync.Once{},
+ wg: &sync.WaitGroup{},
+ state: newKubeState(),
+ }
+}
+
+type Config struct {
+ UpdateEvery int `yaml:"update_every,omitempty" json:"update_every"`
+}
+
+type (
+ KubeState struct {
+ module.Base
+ Config `yaml:",inline" json:""`
+
+ charts *module.Charts
+
+ client kubernetes.Interface
+ newKubeClient func() (kubernetes.Interface, error)
+
+ startTime time.Time
+ initDelay time.Duration
+ once *sync.Once
+ wg *sync.WaitGroup
+ discoverer discoverer
+ ctx context.Context
+ ctxCancel context.CancelFunc
+ kubeClusterID string
+ kubeClusterName string
+
+ state *kubeState
+ }
+ discoverer interface {
+ run(ctx context.Context, in chan<- resource)
+ ready() bool
+ stopped() bool
+ }
+)
+
+func (ks *KubeState) Configuration() any {
+ return ks.Config
+}
+
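+// Init creates the Kubernetes client, the collector context, and the resource discoverer.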
+func (ks *KubeState) Init() error {
+ client, err := ks.initClient()
+ if err != nil {
+ ks.Errorf("client initialization: %v", err)
+ return err
+ }
+ ks.client = client
+
+ ks.ctx, ks.ctxCancel = context.WithCancel(context.Background())
+
+ ks.discoverer = ks.initDiscoverer(ks.client)
+
+ return nil
+}
+
+func (ks *KubeState) Check() error {
+ if ks.client == nil || ks.discoverer == nil {
+ ks.Error("not initialized job")
+ return errors.New("not initialized")
+ }
+
+ ver, err := ks.client.Discovery().ServerVersion()
+ if err != nil {
+ err := fmt.Errorf("failed to connect to K8s API server: %v", err)
+ ks.Error(err)
+ return err
+ }
+
+ ks.Infof("successfully connected to the Kubernetes API server '%s'", ver)
+
+ return nil
+}
+
+func (ks *KubeState) Charts() *module.Charts {
+ return ks.charts
+}
+
+func (ks *KubeState) Collect() map[string]int64 {
+ ms, err := ks.collect()
+ if err != nil {
+ ks.Error(err)
+ }
+
+ if len(ms) == 0 {
+ return nil
+ }
+ return ms
+}
+
+func (ks *KubeState) Cleanup() {
+ if ks.ctxCancel == nil {
+ return
+ }
+ ks.ctxCancel()
+
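+ // Wait for the discovery goroutines to finish, but never block shutdown for more than 5 seconds.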
+ c := make(chan struct{})
+ go func() { defer close(c); ks.wg.Wait() }()
+
+ t := time.NewTimer(time.Second * 5)
+ defer t.Stop()
+
+ select {
+ case <-c:
+ return
+ case <-t.C:
+ return
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/kube_state_test.go b/src/go/collectors/go.d.plugin/modules/k8s_state/kube_state_test.go
new file mode 100644
index 000000000..99560d6dc
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/kube_state_test.go
@@ -0,0 +1,859 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "context"
+ "errors"
+ "fmt"
+ "os"
+ "strings"
+ "testing"
+ "time"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/module"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+ corev1 "k8s.io/api/core/v1"
+ apiresource "k8s.io/apimachinery/pkg/api/resource"
+ metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+ "k8s.io/apimachinery/pkg/version"
+ "k8s.io/client-go/discovery"
+ "k8s.io/client-go/kubernetes"
+ "k8s.io/client-go/kubernetes/fake"
+)
+
+var (
+ dataConfigJSON, _ = os.ReadFile("testdata/config.json")
+ dataConfigYAML, _ = os.ReadFile("testdata/config.yaml")
+)
+
+func Test_testDataIsValid(t *testing.T) {
+ for name, data := range map[string][]byte{
+ "dataConfigJSON": dataConfigJSON,
+ "dataConfigYAML": dataConfigYAML,
+ } {
+ require.NotNil(t, data, name)
+ }
+}
+
+func TestKubeState_ConfigurationSerialize(t *testing.T) {
+ module.TestConfigurationSerialize(t, &KubeState{}, dataConfigJSON, dataConfigYAML)
+}
+
+func TestKubeState_Init(t *testing.T) {
+ tests := map[string]struct {
+ wantFail bool
+ prepare func() *KubeState
+ }{
+ "success when no error on initializing K8s client": {
+ wantFail: false,
+ prepare: func() *KubeState {
+ ks := New()
+ ks.newKubeClient = func() (kubernetes.Interface, error) { return fake.NewSimpleClientset(), nil }
+ return ks
+ },
+ },
+ "fail when get an error on initializing K8s client": {
+ wantFail: true,
+ prepare: func() *KubeState {
+ ks := New()
+ ks.newKubeClient = func() (kubernetes.Interface, error) { return nil, errors.New("newKubeClient() error") }
+ return ks
+ },
+ },
+ }
+
+ for name, test := range tests {
+ t.Run(name, func(t *testing.T) {
+ ks := test.prepare()
+
+ if test.wantFail {
+ assert.Error(t, ks.Init())
+ } else {
+ assert.NoError(t, ks.Init())
+ }
+ })
+ }
+}
+
+func TestKubeState_Check(t *testing.T) {
+ tests := map[string]struct {
+ wantFail bool
+ prepare func() *KubeState
+ }{
+ "success when connected to the K8s API": {
+ wantFail: false,
+ prepare: func() *KubeState {
+ ks := New()
+ ks.newKubeClient = func() (kubernetes.Interface, error) { return fake.NewSimpleClientset(), nil }
+ return ks
+ },
+ },
+ "fail when not connected to the K8s API": {
+ wantFail: true,
+ prepare: func() *KubeState {
+ ks := New()
+ client := &brokenInfoKubeClient{fake.NewSimpleClientset()}
+ ks.newKubeClient = func() (kubernetes.Interface, error) { return client, nil }
+ return ks
+ },
+ },
+ }
+
+ for name, test := range tests {
+ t.Run(name, func(t *testing.T) {
+ ks := test.prepare()
+ require.NoError(t, ks.Init())
+
+ if test.wantFail {
+ assert.Error(t, ks.Check())
+ } else {
+ assert.NoError(t, ks.Check())
+ }
+ })
+ }
+}
+
+func TestKubeState_Charts(t *testing.T) {
+ ks := New()
+
+ assert.NotEmpty(t, *ks.Charts())
+}
+
+func TestKubeState_Cleanup(t *testing.T) {
+ tests := map[string]struct {
+ prepare func() *KubeState
+ doInit bool
+ doCollect bool
+ }{
+ "before init": {
+ doInit: false,
+ doCollect: false,
+ prepare: func() *KubeState {
+ ks := New()
+ ks.newKubeClient = func() (kubernetes.Interface, error) { return fake.NewSimpleClientset(), nil }
+ return ks
+ },
+ },
+ "after init": {
+ doInit: true,
+ doCollect: false,
+ prepare: func() *KubeState {
+ ks := New()
+ ks.newKubeClient = func() (kubernetes.Interface, error) { return fake.NewSimpleClientset(), nil }
+ return ks
+ },
+ },
+ "after collect": {
+ doInit: true,
+ doCollect: true,
+ prepare: func() *KubeState {
+ ks := New()
+ ks.newKubeClient = func() (kubernetes.Interface, error) { return fake.NewSimpleClientset(), nil }
+ return ks
+ },
+ },
+ }
+
+ for name, test := range tests {
+ t.Run(name, func(t *testing.T) {
+ ks := test.prepare()
+
+ if test.doInit {
+ _ = ks.Init()
+ }
+ if test.doCollect {
+ _ = ks.Collect()
+ time.Sleep(ks.initDelay)
+ }
+
+ assert.NotPanics(t, ks.Cleanup)
+ time.Sleep(time.Second)
+ if test.doCollect {
+ assert.True(t, ks.discoverer.stopped())
+ }
+ })
+ }
+}
+
+func TestKubeState_Collect(t *testing.T) {
+ type (
+ testCaseStep func(t *testing.T, ks *KubeState)
+ testCase struct {
+ client kubernetes.Interface
+ steps []testCaseStep
+ }
+ )
+
+ tests := map[string]struct {
+ create func(t *testing.T) testCase
+ }{
+ "Node only": {
+ create: func(t *testing.T) testCase {
+ client := fake.NewSimpleClientset(
+ newNode("node01"),
+ )
+
+ step1 := func(t *testing.T, ks *KubeState) {
+ mx := ks.Collect()
+ expected := map[string]int64{
+ "discovery_node_discoverer_state": 1,
+ "discovery_pod_discoverer_state": 1,
+ "node_node01_age": 3,
+ "node_node01_alloc_cpu_limits_used": 0,
+ "node_node01_alloc_cpu_limits_util": 0,
+ "node_node01_alloc_cpu_requests_used": 0,
+ "node_node01_alloc_cpu_requests_util": 0,
+ "node_node01_alloc_mem_limits_used": 0,
+ "node_node01_alloc_mem_limits_util": 0,
+ "node_node01_alloc_mem_requests_used": 0,
+ "node_node01_alloc_mem_requests_util": 0,
+ "node_node01_alloc_pods_allocated": 0,
+ "node_node01_alloc_pods_available": 110,
+ "node_node01_alloc_pods_util": 0,
+ "node_node01_cond_diskpressure": 0,
+ "node_node01_cond_memorypressure": 0,
+ "node_node01_cond_networkunavailable": 0,
+ "node_node01_cond_pidpressure": 0,
+ "node_node01_cond_ready": 1,
+ "node_node01_schedulability_schedulable": 1,
+ "node_node01_schedulability_unschedulable": 0,
+ "node_node01_containers": 0,
+ "node_node01_containers_state_running": 0,
+ "node_node01_containers_state_terminated": 0,
+ "node_node01_containers_state_waiting": 0,
+ "node_node01_init_containers": 0,
+ "node_node01_init_containers_state_running": 0,
+ "node_node01_init_containers_state_terminated": 0,
+ "node_node01_init_containers_state_waiting": 0,
+ "node_node01_pods_cond_containersready": 0,
+ "node_node01_pods_cond_podinitialized": 0,
+ "node_node01_pods_cond_podready": 0,
+ "node_node01_pods_cond_podscheduled": 0,
+ "node_node01_pods_phase_failed": 0,
+ "node_node01_pods_phase_pending": 0,
+ "node_node01_pods_phase_running": 0,
+ "node_node01_pods_phase_succeeded": 0,
+ "node_node01_pods_readiness": 0,
+ "node_node01_pods_readiness_ready": 0,
+ "node_node01_pods_readiness_unready": 0,
+ }
+ copyAge(expected, mx)
+ assert.Equal(t, expected, mx)
+ assert.Equal(t,
+ len(nodeChartsTmpl)+len(baseCharts),
+ len(*ks.Charts()),
+ )
+ }
+
+ return testCase{
+ client: client,
+ steps: []testCaseStep{step1},
+ }
+ },
+ },
+ "Pod only": {
+ create: func(t *testing.T) testCase {
+ pod := newPod("node01", "pod01")
+ client := fake.NewSimpleClientset(
+ pod,
+ )
+
+ step1 := func(t *testing.T, ks *KubeState) {
+ mx := ks.Collect()
+ expected := map[string]int64{
+ "discovery_node_discoverer_state": 1,
+ "discovery_pod_discoverer_state": 1,
+ "pod_default_pod01_age": 3,
+ "pod_default_pod01_cpu_limits_used": 400,
+ "pod_default_pod01_cpu_requests_used": 200,
+ "pod_default_pod01_mem_limits_used": 419430400,
+ "pod_default_pod01_mem_requests_used": 209715200,
+ "pod_default_pod01_cond_containersready": 1,
+ "pod_default_pod01_cond_podinitialized": 1,
+ "pod_default_pod01_cond_podready": 1,
+ "pod_default_pod01_cond_podscheduled": 1,
+ "pod_default_pod01_container_container1_readiness": 1,
+ "pod_default_pod01_container_container1_restarts": 0,
+ "pod_default_pod01_container_container1_state_running": 1,
+ "pod_default_pod01_container_container1_state_terminated": 0,
+ "pod_default_pod01_container_container1_state_waiting": 0,
+ "pod_default_pod01_container_container2_readiness": 1,
+ "pod_default_pod01_container_container2_restarts": 0,
+ "pod_default_pod01_container_container2_state_running": 1,
+ "pod_default_pod01_container_container2_state_terminated": 0,
+ "pod_default_pod01_container_container2_state_waiting": 0,
+ "pod_default_pod01_containers": 2,
+ "pod_default_pod01_containers_state_running": 2,
+ "pod_default_pod01_containers_state_terminated": 0,
+ "pod_default_pod01_containers_state_waiting": 0,
+ "pod_default_pod01_init_containers": 1,
+ "pod_default_pod01_init_containers_state_running": 0,
+ "pod_default_pod01_init_containers_state_terminated": 1,
+ "pod_default_pod01_init_containers_state_waiting": 0,
+ "pod_default_pod01_phase_failed": 0,
+ "pod_default_pod01_phase_pending": 0,
+ "pod_default_pod01_phase_running": 1,
+ "pod_default_pod01_phase_succeeded": 0,
+ }
+ copyAge(expected, mx)
+
+ assert.Equal(t, expected, mx)
+ assert.Equal(t,
+ len(podChartsTmpl)+len(containerChartsTmpl)*len(pod.Spec.Containers)+len(baseCharts),
+ len(*ks.Charts()),
+ )
+ }
+
+ return testCase{
+ client: client,
+ steps: []testCaseStep{step1},
+ }
+ },
+ },
+ "Nodes and Pods": {
+ create: func(t *testing.T) testCase {
+ node := newNode("node01")
+ pod := newPod(node.Name, "pod01")
+ client := fake.NewSimpleClientset(
+ node,
+ pod,
+ )
+
+ step1 := func(t *testing.T, ks *KubeState) {
+ mx := ks.Collect()
+ expected := map[string]int64{
+ "discovery_node_discoverer_state": 1,
+ "discovery_pod_discoverer_state": 1,
+ "node_node01_age": 3,
+ "node_node01_alloc_cpu_limits_used": 400,
+ "node_node01_alloc_cpu_limits_util": 11428,
+ "node_node01_alloc_cpu_requests_used": 200,
+ "node_node01_alloc_cpu_requests_util": 5714,
+ "node_node01_alloc_mem_limits_used": 419430400,
+ "node_node01_alloc_mem_limits_util": 11428,
+ "node_node01_alloc_mem_requests_used": 209715200,
+ "node_node01_alloc_mem_requests_util": 5714,
+ "node_node01_alloc_pods_allocated": 1,
+ "node_node01_alloc_pods_available": 109,
+ "node_node01_alloc_pods_util": 909,
+ "node_node01_cond_diskpressure": 0,
+ "node_node01_cond_memorypressure": 0,
+ "node_node01_cond_networkunavailable": 0,
+ "node_node01_cond_pidpressure": 0,
+ "node_node01_cond_ready": 1,
+ "node_node01_schedulability_schedulable": 1,
+ "node_node01_schedulability_unschedulable": 0,
+ "node_node01_containers": 2,
+ "node_node01_containers_state_running": 2,
+ "node_node01_containers_state_terminated": 0,
+ "node_node01_containers_state_waiting": 0,
+ "node_node01_init_containers": 1,
+ "node_node01_init_containers_state_running": 0,
+ "node_node01_init_containers_state_terminated": 1,
+ "node_node01_init_containers_state_waiting": 0,
+ "node_node01_pods_cond_containersready": 1,
+ "node_node01_pods_cond_podinitialized": 1,
+ "node_node01_pods_cond_podready": 1,
+ "node_node01_pods_cond_podscheduled": 1,
+ "node_node01_pods_phase_failed": 0,
+ "node_node01_pods_phase_pending": 0,
+ "node_node01_pods_phase_running": 1,
+ "node_node01_pods_phase_succeeded": 0,
+ "node_node01_pods_readiness": 100000,
+ "node_node01_pods_readiness_ready": 1,
+ "node_node01_pods_readiness_unready": 0,
+ "pod_default_pod01_age": 3,
+ "pod_default_pod01_cpu_limits_used": 400,
+ "pod_default_pod01_cpu_requests_used": 200,
+ "pod_default_pod01_mem_limits_used": 419430400,
+ "pod_default_pod01_mem_requests_used": 209715200,
+ "pod_default_pod01_cond_containersready": 1,
+ "pod_default_pod01_cond_podinitialized": 1,
+ "pod_default_pod01_cond_podready": 1,
+ "pod_default_pod01_cond_podscheduled": 1,
+ "pod_default_pod01_container_container1_readiness": 1,
+ "pod_default_pod01_container_container1_restarts": 0,
+ "pod_default_pod01_container_container1_state_running": 1,
+ "pod_default_pod01_container_container1_state_terminated": 0,
+ "pod_default_pod01_container_container1_state_waiting": 0,
+ "pod_default_pod01_container_container2_readiness": 1,
+ "pod_default_pod01_container_container2_restarts": 0,
+ "pod_default_pod01_container_container2_state_running": 1,
+ "pod_default_pod01_container_container2_state_terminated": 0,
+ "pod_default_pod01_container_container2_state_waiting": 0,
+ "pod_default_pod01_containers": 2,
+ "pod_default_pod01_containers_state_running": 2,
+ "pod_default_pod01_containers_state_terminated": 0,
+ "pod_default_pod01_containers_state_waiting": 0,
+ "pod_default_pod01_init_containers": 1,
+ "pod_default_pod01_init_containers_state_running": 0,
+ "pod_default_pod01_init_containers_state_terminated": 1,
+ "pod_default_pod01_init_containers_state_waiting": 0,
+ "pod_default_pod01_phase_failed": 0,
+ "pod_default_pod01_phase_pending": 0,
+ "pod_default_pod01_phase_running": 1,
+ "pod_default_pod01_phase_succeeded": 0,
+ }
+ copyAge(expected, mx)
+
+ assert.Equal(t, expected, mx)
+ assert.Equal(t,
+ len(nodeChartsTmpl)+len(podChartsTmpl)+len(containerChartsTmpl)*len(pod.Spec.Containers)+len(baseCharts),
+ len(*ks.Charts()),
+ )
+ }
+
+ return testCase{
+ client: client,
+ steps: []testCaseStep{step1},
+ }
+ },
+ },
+ "delete a Pod in runtime": {
+ create: func(t *testing.T) testCase {
+ ctx := context.Background()
+ node := newNode("node01")
+ pod := newPod(node.Name, "pod01")
+ client := fake.NewSimpleClientset(
+ node,
+ pod,
+ )
+ step1 := func(t *testing.T, ks *KubeState) {
+ _ = ks.Collect()
+ _ = client.CoreV1().Pods(pod.Namespace).Delete(ctx, pod.Name, metav1.DeleteOptions{})
+ }
+
+ step2 := func(t *testing.T, ks *KubeState) {
+ mx := ks.Collect()
+ expected := map[string]int64{
+ "discovery_node_discoverer_state": 1,
+ "discovery_pod_discoverer_state": 1,
+ "node_node01_age": 4,
+ "node_node01_alloc_cpu_limits_used": 0,
+ "node_node01_alloc_cpu_limits_util": 0,
+ "node_node01_alloc_cpu_requests_used": 0,
+ "node_node01_alloc_cpu_requests_util": 0,
+ "node_node01_alloc_mem_limits_used": 0,
+ "node_node01_alloc_mem_limits_util": 0,
+ "node_node01_alloc_mem_requests_used": 0,
+ "node_node01_alloc_mem_requests_util": 0,
+ "node_node01_alloc_pods_allocated": 0,
+ "node_node01_alloc_pods_available": 110,
+ "node_node01_alloc_pods_util": 0,
+ "node_node01_cond_diskpressure": 0,
+ "node_node01_cond_memorypressure": 0,
+ "node_node01_cond_networkunavailable": 0,
+ "node_node01_cond_pidpressure": 0,
+ "node_node01_cond_ready": 1,
+ "node_node01_schedulability_schedulable": 1,
+ "node_node01_schedulability_unschedulable": 0,
+ "node_node01_containers": 0,
+ "node_node01_containers_state_running": 0,
+ "node_node01_containers_state_terminated": 0,
+ "node_node01_containers_state_waiting": 0,
+ "node_node01_init_containers": 0,
+ "node_node01_init_containers_state_running": 0,
+ "node_node01_init_containers_state_terminated": 0,
+ "node_node01_init_containers_state_waiting": 0,
+ "node_node01_pods_cond_containersready": 0,
+ "node_node01_pods_cond_podinitialized": 0,
+ "node_node01_pods_cond_podready": 0,
+ "node_node01_pods_cond_podscheduled": 0,
+ "node_node01_pods_phase_failed": 0,
+ "node_node01_pods_phase_pending": 0,
+ "node_node01_pods_phase_running": 0,
+ "node_node01_pods_phase_succeeded": 0,
+ "node_node01_pods_readiness": 0,
+ "node_node01_pods_readiness_ready": 0,
+ "node_node01_pods_readiness_unready": 0,
+ }
+ copyAge(expected, mx)
+
+ assert.Equal(t, expected, mx)
+ assert.Equal(t,
+ len(nodeChartsTmpl)+len(podChartsTmpl)+len(containerChartsTmpl)*len(pod.Spec.Containers)+len(baseCharts),
+ len(*ks.Charts()),
+ )
+ assert.Equal(t,
+ len(podChartsTmpl)+len(containerChartsTmpl)*len(pod.Spec.Containers),
+ calcObsoleteCharts(*ks.Charts()),
+ )
+ }
+
+ return testCase{
+ client: client,
+ steps: []testCaseStep{step1, step2},
+ }
+ },
+ },
+ "slow spec.NodeName set": {
+ create: func(t *testing.T) testCase {
+ ctx := context.Background()
+ node := newNode("node01")
+ podOrig := newPod(node.Name, "pod01")
+ podOrig.Spec.NodeName = ""
+ client := fake.NewSimpleClientset(
+ node,
+ podOrig,
+ )
+ podUpdated := newPod(node.Name, "pod01") // with set Spec.NodeName
+
+ step1 := func(t *testing.T, ks *KubeState) {
+ _ = ks.Collect()
+ for _, c := range *ks.Charts() {
+ if strings.HasPrefix(c.ID, "pod_") {
+ ok := isLabelValueSet(c, labelKeyNodeName)
+ assert.Falsef(t, ok, "chart '%s' has not empty %s label", c.ID, labelKeyNodeName)
+ }
+ }
+ }
+ step2 := func(t *testing.T, ks *KubeState) {
+ _, _ = client.CoreV1().Pods(podOrig.Namespace).Update(ctx, podUpdated, metav1.UpdateOptions{})
+ time.Sleep(time.Millisecond * 50)
+ _ = ks.Collect()
+
+ for _, c := range *ks.Charts() {
+ if strings.HasPrefix(c.ID, "pod_") {
+ ok := isLabelValueSet(c, labelKeyNodeName)
+ assert.Truef(t, ok, "chart '%s' has empty %s label", c.ID, labelKeyNodeName)
+ }
+ }
+ }
+
+ return testCase{
+ client: client,
+ steps: []testCaseStep{step1, step2},
+ }
+ },
+ },
+ "add a Pod in runtime": {
+ create: func(t *testing.T) testCase {
+ ctx := context.Background()
+ node := newNode("node01")
+ pod1 := newPod(node.Name, "pod01")
+ pod2 := newPod(node.Name, "pod02")
+ client := fake.NewSimpleClientset(
+ node,
+ pod1,
+ )
+ step1 := func(t *testing.T, ks *KubeState) {
+ _ = ks.Collect()
+ _, _ = client.CoreV1().Pods(pod1.Namespace).Create(ctx, pod2, metav1.CreateOptions{})
+ }
+
+ step2 := func(t *testing.T, ks *KubeState) {
+ mx := ks.Collect()
+ expected := map[string]int64{
+ "discovery_node_discoverer_state": 1,
+ "discovery_pod_discoverer_state": 1,
+ "node_node01_age": 4,
+ "node_node01_alloc_cpu_limits_used": 800,
+ "node_node01_alloc_cpu_limits_util": 22857,
+ "node_node01_alloc_cpu_requests_used": 400,
+ "node_node01_alloc_cpu_requests_util": 11428,
+ "node_node01_alloc_mem_limits_used": 838860800,
+ "node_node01_alloc_mem_limits_util": 22857,
+ "node_node01_alloc_mem_requests_used": 419430400,
+ "node_node01_alloc_mem_requests_util": 11428,
+ "node_node01_alloc_pods_allocated": 2,
+ "node_node01_alloc_pods_available": 108,
+ "node_node01_alloc_pods_util": 1818,
+ "node_node01_cond_diskpressure": 0,
+ "node_node01_cond_memorypressure": 0,
+ "node_node01_cond_networkunavailable": 0,
+ "node_node01_cond_pidpressure": 0,
+ "node_node01_cond_ready": 1,
+ "node_node01_schedulability_schedulable": 1,
+ "node_node01_schedulability_unschedulable": 0,
+ "node_node01_containers": 4,
+ "node_node01_containers_state_running": 4,
+ "node_node01_containers_state_terminated": 0,
+ "node_node01_containers_state_waiting": 0,
+ "node_node01_init_containers": 2,
+ "node_node01_init_containers_state_running": 0,
+ "node_node01_init_containers_state_terminated": 2,
+ "node_node01_init_containers_state_waiting": 0,
+ "node_node01_pods_cond_containersready": 2,
+ "node_node01_pods_cond_podinitialized": 2,
+ "node_node01_pods_cond_podready": 2,
+ "node_node01_pods_cond_podscheduled": 2,
+ "node_node01_pods_phase_failed": 0,
+ "node_node01_pods_phase_pending": 0,
+ "node_node01_pods_phase_running": 2,
+ "node_node01_pods_phase_succeeded": 0,
+ "node_node01_pods_readiness": 100000,
+ "node_node01_pods_readiness_ready": 2,
+ "node_node01_pods_readiness_unready": 0,
+ "pod_default_pod01_age": 4,
+ "pod_default_pod01_cpu_limits_used": 400,
+ "pod_default_pod01_cpu_requests_used": 200,
+ "pod_default_pod01_mem_limits_used": 419430400,
+ "pod_default_pod01_mem_requests_used": 209715200,
+ "pod_default_pod01_cond_containersready": 1,
+ "pod_default_pod01_cond_podinitialized": 1,
+ "pod_default_pod01_cond_podready": 1,
+ "pod_default_pod01_cond_podscheduled": 1,
+ "pod_default_pod01_container_container1_readiness": 1,
+ "pod_default_pod01_container_container1_restarts": 0,
+ "pod_default_pod01_container_container1_state_running": 1,
+ "pod_default_pod01_container_container1_state_terminated": 0,
+ "pod_default_pod01_container_container1_state_waiting": 0,
+ "pod_default_pod01_container_container2_readiness": 1,
+ "pod_default_pod01_container_container2_restarts": 0,
+ "pod_default_pod01_container_container2_state_running": 1,
+ "pod_default_pod01_container_container2_state_terminated": 0,
+ "pod_default_pod01_container_container2_state_waiting": 0,
+ "pod_default_pod01_containers": 2,
+ "pod_default_pod01_containers_state_running": 2,
+ "pod_default_pod01_containers_state_terminated": 0,
+ "pod_default_pod01_containers_state_waiting": 0,
+ "pod_default_pod01_init_containers": 1,
+ "pod_default_pod01_init_containers_state_running": 0,
+ "pod_default_pod01_init_containers_state_terminated": 1,
+ "pod_default_pod01_init_containers_state_waiting": 0,
+ "pod_default_pod01_phase_failed": 0,
+ "pod_default_pod01_phase_pending": 0,
+ "pod_default_pod01_phase_running": 1,
+ "pod_default_pod01_phase_succeeded": 0,
+ "pod_default_pod02_age": 4,
+ "pod_default_pod02_cpu_limits_used": 400,
+ "pod_default_pod02_cpu_requests_used": 200,
+ "pod_default_pod02_mem_limits_used": 419430400,
+ "pod_default_pod02_mem_requests_used": 209715200,
+ "pod_default_pod02_cond_containersready": 1,
+ "pod_default_pod02_cond_podinitialized": 1,
+ "pod_default_pod02_cond_podready": 1,
+ "pod_default_pod02_cond_podscheduled": 1,
+ "pod_default_pod02_container_container1_readiness": 1,
+ "pod_default_pod02_container_container1_restarts": 0,
+ "pod_default_pod02_container_container1_state_running": 1,
+ "pod_default_pod02_container_container1_state_terminated": 0,
+ "pod_default_pod02_container_container1_state_waiting": 0,
+ "pod_default_pod02_container_container2_readiness": 1,
+ "pod_default_pod02_container_container2_restarts": 0,
+ "pod_default_pod02_container_container2_state_running": 1,
+ "pod_default_pod02_container_container2_state_terminated": 0,
+ "pod_default_pod02_container_container2_state_waiting": 0,
+ "pod_default_pod02_containers": 2,
+ "pod_default_pod02_containers_state_running": 2,
+ "pod_default_pod02_containers_state_terminated": 0,
+ "pod_default_pod02_containers_state_waiting": 0,
+ "pod_default_pod02_init_containers": 1,
+ "pod_default_pod02_init_containers_state_running": 0,
+ "pod_default_pod02_init_containers_state_terminated": 1,
+ "pod_default_pod02_init_containers_state_waiting": 0,
+ "pod_default_pod02_phase_failed": 0,
+ "pod_default_pod02_phase_pending": 0,
+ "pod_default_pod02_phase_running": 1,
+ "pod_default_pod02_phase_succeeded": 0,
+ }
+ copyAge(expected, mx)
+
+ assert.Equal(t, expected, mx)
+ assert.Equal(t,
+ len(nodeChartsTmpl)+
+ len(podChartsTmpl)*2+
+ len(containerChartsTmpl)*len(pod1.Spec.Containers)+
+ len(containerChartsTmpl)*len(pod2.Spec.Containers)+
+ len(baseCharts),
+ len(*ks.Charts()),
+ )
+ }
+
+ return testCase{
+ client: client,
+ steps: []testCaseStep{step1, step2},
+ }
+ },
+ },
+ }
+
+ for name, creator := range tests {
+ t.Run(name, func(t *testing.T) {
+ test := creator.create(t)
+
+ ks := New()
+ ks.newKubeClient = func() (kubernetes.Interface, error) { return test.client, nil }
+
+ require.NoError(t, ks.Init())
+ require.NoError(t, ks.Check())
+ defer ks.Cleanup()
+
+ for i, executeStep := range test.steps {
+ if i == 0 {
+ _ = ks.Collect()
+ time.Sleep(ks.initDelay)
+ } else {
+ time.Sleep(time.Second)
+ }
+ executeStep(t, ks)
+ }
+ })
+ }
+}
+
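+// newNode returns a Ready, schedulable Node with 3500m CPU, 3500Mi memory and 110 pods allocatable.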
+func newNode(name string) *corev1.Node {
+ return &corev1.Node{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ CreationTimestamp: metav1.Time{Time: time.Now()},
+ },
+ Status: corev1.NodeStatus{
+ Capacity: corev1.ResourceList{
+ corev1.ResourceCPU: mustQuantity("4000m"),
+ corev1.ResourceMemory: mustQuantity("4000Mi"),
+ "pods": mustQuantity("110"),
+ },
+ Allocatable: corev1.ResourceList{
+ corev1.ResourceCPU: mustQuantity("3500m"),
+ corev1.ResourceMemory: mustQuantity("3500Mi"),
+ "pods": mustQuantity("110"),
+ },
+ Conditions: []corev1.NodeCondition{
+ {Type: corev1.NodeReady, Status: corev1.ConditionTrue},
+ {Type: corev1.NodeMemoryPressure, Status: corev1.ConditionFalse},
+ {Type: corev1.NodeDiskPressure, Status: corev1.ConditionFalse},
+ {Type: corev1.NodePIDPressure, Status: corev1.ConditionFalse},
+ {Type: corev1.NodeNetworkUnavailable, Status: corev1.ConditionFalse},
+ },
+ },
+ }
+}
+
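+// newPod returns a Running Pod scheduled on nodeName, with one terminated init container and
+// two ready running containers, each requesting 100m/100Mi with 200m/200Mi limits.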
+func newPod(nodeName, name string) *corev1.Pod {
+ return &corev1.Pod{
+ ObjectMeta: metav1.ObjectMeta{
+ Name: name,
+ Namespace: corev1.NamespaceDefault,
+ CreationTimestamp: metav1.Time{Time: time.Now()},
+ },
+ Spec: corev1.PodSpec{
+ NodeName: nodeName,
+ InitContainers: []corev1.Container{
+ {
+ Name: "init-container1",
+ Resources: corev1.ResourceRequirements{
+ Limits: corev1.ResourceList{
+ corev1.ResourceCPU: mustQuantity("50m"),
+ corev1.ResourceMemory: mustQuantity("50Mi"),
+ },
+ Requests: corev1.ResourceList{
+ corev1.ResourceCPU: mustQuantity("10m"),
+ corev1.ResourceMemory: mustQuantity("10Mi"),
+ },
+ },
+ },
+ },
+ Containers: []corev1.Container{
+ {
+ Name: "container1",
+ Resources: corev1.ResourceRequirements{
+ Limits: corev1.ResourceList{
+ corev1.ResourceCPU: mustQuantity("200m"),
+ corev1.ResourceMemory: mustQuantity("200Mi"),
+ },
+ Requests: corev1.ResourceList{
+ corev1.ResourceCPU: mustQuantity("100m"),
+ corev1.ResourceMemory: mustQuantity("100Mi"),
+ },
+ },
+ },
+ {
+ Name: "container2",
+ Resources: corev1.ResourceRequirements{
+ Limits: corev1.ResourceList{
+ corev1.ResourceCPU: mustQuantity("200m"),
+ corev1.ResourceMemory: mustQuantity("200Mi")},
+ Requests: corev1.ResourceList{
+ corev1.ResourceCPU: mustQuantity("100m"),
+ corev1.ResourceMemory: mustQuantity("100Mi"),
+ },
+ },
+ },
+ },
+ },
+ Status: corev1.PodStatus{
+ Phase: corev1.PodRunning,
+ Conditions: []corev1.PodCondition{
+ {Type: corev1.PodReady, Status: corev1.ConditionTrue},
+ {Type: corev1.PodScheduled, Status: corev1.ConditionTrue},
+ {Type: corev1.PodInitialized, Status: corev1.ConditionTrue},
+ {Type: corev1.ContainersReady, Status: corev1.ConditionTrue},
+ },
+ InitContainerStatuses: []corev1.ContainerStatus{
+ {
+ Name: "init-container1",
+ State: corev1.ContainerState{Terminated: &corev1.ContainerStateTerminated{}},
+ },
+ },
+ ContainerStatuses: []corev1.ContainerStatus{
+ {
+ Name: "container1",
+ Ready: true,
+ State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+ },
+ {
+ Name: "container2",
+ Ready: true,
+ State: corev1.ContainerState{Running: &corev1.ContainerStateRunning{}},
+ },
+ },
+ },
+ }
+}
+
+type brokenInfoKubeClient struct {
+ kubernetes.Interface
+}
+
+func (kc *brokenInfoKubeClient) Discovery() discovery.DiscoveryInterface {
+ return &brokenInfoDiscovery{kc.Interface.Discovery()}
+}
+
+type brokenInfoDiscovery struct {
+ discovery.DiscoveryInterface
+}
+
+func (d *brokenInfoDiscovery) ServerVersion() (*version.Info, error) {
+ return nil, errors.New("brokenInfoDiscovery.ServerVersion() error")
+}
+
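+// calcObsoleteCharts counts charts that have been marked obsolete (e.g. after a Pod is deleted).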
+func calcObsoleteCharts(charts module.Charts) (num int) {
+ for _, c := range charts {
+ if c.Obsolete {
+ num++
+ }
+ }
+ return num
+}
+
+func mustQuantity(s string) apiresource.Quantity {
+ q, err := apiresource.ParseQuantity(s)
+ if err != nil {
+ panic(fmt.Sprintf("fail to create resource quantity: %v", err))
+ }
+ return q
+}
+
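+// copyAge copies the time-dependent *_age values from the collected metrics into the expected
+// map, so the equality assertions do not depend on wall-clock timing.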
+func copyAge(dst, src map[string]int64) {
+ for k, v := range src {
+ if !strings.HasSuffix(k, "_age") {
+ continue
+ }
+ if _, ok := dst[k]; ok {
+ dst[k] = v
+ }
+ }
+}
+
+func isLabelValueSet(c *module.Chart, name string) bool {
+ for _, l := range c.Labels {
+ if l.Key == name {
+ return l.Value != ""
+ }
+ }
+ return false
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/metadata.yaml b/src/go/collectors/go.d.plugin/modules/k8s_state/metadata.yaml
new file mode 100644
index 000000000..7617b297f
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/metadata.yaml
@@ -0,0 +1,356 @@
+plugin_name: go.d.plugin
+modules:
+ - meta:
+ id: collector-go.d.plugin-k8s_state
+ plugin_name: go.d.plugin
+ module_name: k8s_state
+ monitored_instance:
+ name: Kubernetes Cluster State
+ link: https://kubernetes.io/
+ icon_filename: kubernetes.svg
+ categories:
+ - data-collection.kubernetes
+ keywords:
+ - kubernetes
+ - k8s
+ related_resources:
+ integrations:
+ list: []
+ info_provided_to_referring_integrations:
+ description: ""
+ most_popular: true
+ overview:
+ data_collection:
+ metrics_description: |
+ This collector monitors Kubernetes Nodes, Pods and Containers.
+ method_description: ""
+ supported_platforms:
+ include: []
+ exclude: []
+ multi_instance: false
+ additional_permissions:
+ description: ""
+ default_behavior:
+ auto_detection:
+ description: ""
+ limits:
+ description: ""
+ performance_impact:
+ description: ""
+ setup:
+ prerequisites:
+ list: []
+ configuration:
+ file:
+ name: go.d/k8s_state.conf
+ options:
+ description: ""
+ folding:
+ title: Config options
+ enabled: true
+ list: []
+ examples:
+ folding:
+ title: Config
+ enabled: true
+ list: []
+ troubleshooting:
+ problems:
+ list: []
+ alerts: []
+ metrics:
+ folding:
+ title: Metrics
+ enabled: false
+ description: ""
+ availability: []
+ scopes:
+ - name: node
+ description: These metrics refer to the Node.
+ labels:
+ - name: k8s_cluster_id
+ description: Cluster ID. This is equal to the kube-system namespace UID.
+ - name: k8s_cluster_name
+ description: Cluster name. Cluster name discovery only works in GKE.
+ - name: k8s_node_name
+ description: Node name.
+ metrics:
+ - name: k8s_state.node_allocatable_cpu_requests_utilization
+ description: CPU requests utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: requests
+ - name: k8s_state.node_allocatable_cpu_requests_used
+ description: CPU requests used
+ unit: millicpu
+ chart_type: line
+ dimensions:
+ - name: requests
+ - name: k8s_state.node_allocatable_cpu_limits_utilization
+ description: CPU limits utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: limits
+ - name: k8s_state.node_allocatable_cpu_limits_used
+ description: CPU limits used
+ unit: millicpu
+ chart_type: line
+ dimensions:
+ - name: limits
+ - name: k8s_state.node_allocatable_mem_requests_utilization
+ description: Memory requests utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: requests
+ - name: k8s_state.node_allocatable_mem_requests_used
+ description: Memory requests used
+ unit: bytes
+ chart_type: line
+ dimensions:
+ - name: requests
+ - name: k8s_state.node_allocatable_mem_limits_utilization
+ description: Memory limits utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: limits
+ - name: k8s_state.node_allocatable_mem_limits_used
+ description: Memory limits used
+ unit: bytes
+ chart_type: line
+ dimensions:
+ - name: limits
+ - name: k8s_state.node_allocatable_pods_utilization
+ description: Pods resource utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: allocated
+ - name: k8s_state.node_allocatable_pods_usage
+ description: Pods resource usage
+ unit: pods
+ chart_type: stacked
+ dimensions:
+ - name: available
+ - name: allocated
+ - name: k8s_state.node_condition
+ description: Condition status
+ unit: status
+ chart_type: line
+ dimensions:
+ - name: a dimension per condition
+ - name: k8s_state.node_schedulability
+ description: Schedulability
+ unit: state
+ chart_type: line
+ dimensions:
+ - name: schedulable
+ - name: unschedulable
+ - name: k8s_state.node_pods_readiness
+ description: Pods readiness
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: ready
+ - name: k8s_state.node_pods_readiness_state
+ description: Pods readiness state
+ unit: pods
+ chart_type: line
+ dimensions:
+ - name: ready
+ - name: unready
+ - name: k8s_state.node_pods_condition
+ description: Pods condition
+ unit: pods
+ chart_type: line
+ dimensions:
+ - name: pod_ready
+ - name: pod_scheduled
+ - name: pod_initialized
+ - name: containers_ready
+ - name: k8s_state.node_pods_phase
+ description: Pods phase
+ unit: pods
+ chart_type: stacked
+ dimensions:
+ - name: running
+ - name: failed
+ - name: succeeded
+ - name: pending
+ - name: k8s_state.node_containers
+ description: Containers
+ unit: containers
+ chart_type: line
+ dimensions:
+ - name: containers
+ - name: init_containers
+ - name: k8s_state.node_containers_state
+ description: Containers state
+ unit: containers
+ chart_type: stacked
+ dimensions:
+ - name: running
+ - name: waiting
+ - name: terminated
+ - name: k8s_state.node_init_containers_state
+ description: Init containers state
+ unit: containers
+ chart_type: stacked
+ dimensions:
+ - name: running
+ - name: waiting
+ - name: terminated
+ - name: k8s_state.node_age
+ description: Age
+ unit: seconds
+ chart_type: line
+ dimensions:
+ - name: age
+ - name: pod
+ description: These metrics refer to the Pod.
+ labels:
+ - name: k8s_cluster_id
+ description: Cluster ID. This is equal to the kube-system namespace UID.
+ - name: k8s_cluster_name
+ description: Cluster name. Cluster name discovery only works in GKE.
+ - name: k8s_node_name
+ description: Node name.
+ - name: k8s_namespace
+ description: Namespace.
+ - name: k8s_controller_kind
+ description: Controller kind (ReplicaSet, DaemonSet, StatefulSet, Job, etc.).
+ - name: k8s_controller_name
+ description: Controller name.
+ - name: k8s_pod_name
+ description: Pod name.
+ - name: k8s_qos_class
+ description: Pod QoS class (burstable, guaranteed, besteffort).
+ metrics:
+ - name: k8s_state.pod_cpu_requests_used
+ description: CPU requests used
+ unit: millicpu
+ chart_type: line
+ dimensions:
+ - name: requests
+ - name: k8s_state.pod_cpu_limits_used
+ description: CPU limits used
+ unit: millicpu
+ chart_type: line
+ dimensions:
+ - name: limits
+ - name: k8s_state.pod_mem_requests_used
+ description: Memory requests used
+ unit: bytes
+ chart_type: line
+ dimensions:
+ - name: requests
+ - name: k8s_state.pod_mem_limits_used
+ description: Memory limits used
+ unit: bytes
+ chart_type: line
+ dimensions:
+ - name: limits
+ - name: k8s_state.pod_condition
+ description: Condition
+ unit: state
+ chart_type: line
+ dimensions:
+ - name: pod_ready
+ - name: pod_scheduled
+ - name: pod_initialized
+ - name: containers_ready
+ - name: k8s_state.pod_phase
+ description: Phase
+ unit: state
+ chart_type: line
+ dimensions:
+ - name: running
+ - name: failed
+ - name: succeeded
+ - name: pending
+ - name: k8s_state.pod_age
+ description: Age
+ unit: seconds
+ chart_type: line
+ dimensions:
+ - name: age
+ - name: k8s_state.pod_containers
+ description: Containers
+ unit: containers
+ chart_type: line
+ dimensions:
+ - name: containers
+ - name: init_containers
+ - name: k8s_state.pod_containers_state
+ description: Containers state
+ unit: containers
+ chart_type: stacked
+ dimensions:
+ - name: running
+ - name: waiting
+ - name: terminated
+ - name: k8s_state.pod_init_containers_state
+ description: Init containers state
+ unit: containers
+ chart_type: stacked
+ dimensions:
+ - name: running
+ - name: waiting
+ - name: terminated
+ - name: container
+ description: These metrics refer to the Pod container.
+ labels:
+ - name: k8s_cluster_id
+ description: Cluster ID. This is equal to the kube-system namespace UID.
+ - name: k8s_cluster_name
+ description: Cluster name. Cluster name discovery only works in GKE.
+ - name: k8s_node_name
+ description: Node name.
+ - name: k8s_namespace
+ description: Namespace.
+ - name: k8s_controller_kind
+ description: Controller kind (ReplicaSet, DaemonSet, StatefulSet, Job, etc.).
+ - name: k8s_controller_name
+ description: Controller name.
+ - name: k8s_pod_name
+ description: Pod name.
+ - name: k8s_qos_class
+ description: Pod QoS class (burstable, guaranteed, besteffort).
+ - name: k8s_container_name
+ description: Container name.
+ metrics:
+ - name: k8s_state.pod_container_readiness_state
+ description: Readiness state
+ unit: state
+ chart_type: line
+ dimensions:
+ - name: ready
+ - name: k8s_state.pod_container_restarts
+ description: Restarts
+ unit: restarts
+ chart_type: line
+ dimensions:
+ - name: restarts
+ - name: k8s_state.pod_container_state
+ description: Container state
+ unit: state
+ chart_type: line
+ dimensions:
+ - name: running
+ - name: waiting
+ - name: terminated
+ - name: k8s_state.pod_container_waiting_state_reason
+ description: Container waiting state reason
+ unit: state
+ chart_type: line
+ dimensions:
+ - name: a dimension per reason
+ - name: k8s_state.pod_container_terminated_state_reason
+ description: Container terminated state reason
+ unit: state
+ chart_type: line
+ dimensions:
+ - name: a dimension per reason
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/resource.go b/src/go/collectors/go.d.plugin/modules/k8s_state/resource.go
new file mode 100644
index 000000000..cabd41a67
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/resource.go
@@ -0,0 +1,44 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "fmt"
+
+ corev1 "k8s.io/api/core/v1"
+)
+
+type resource interface {
+ source() string
+ kind() kubeResourceKind
+ value() interface{}
+}
+
+type kubeResourceKind uint8
+
+const (
+ kubeResourceNode kubeResourceKind = iota + 1
+ kubeResourcePod
+)
+
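+// toNode unwraps the value carried by a discovery resource into a *corev1.Node;
+// toPod below does the same for *corev1.Pod.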
+func toNode(i interface{}) (*corev1.Node, error) {
+ switch v := i.(type) {
+ case *corev1.Node:
+ return v, nil
+ case resource:
+ return toNode(v.value())
+ default:
+ return nil, fmt.Errorf("unexpected type: %T (expected %T or %T)", v, &corev1.Node{}, resource(nil))
+ }
+}
+
+func toPod(i interface{}) (*corev1.Pod, error) {
+ switch v := i.(type) {
+ case *corev1.Pod:
+ return v, nil
+ case resource:
+ return toPod(v.value())
+ default:
+ return nil, fmt.Errorf("unexpected type: %T (expected %T or %T)", v, &corev1.Pod{}, resource(nil))
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/state.go b/src/go/collectors/go.d.plugin/modules/k8s_state/state.go
new file mode 100644
index 000000000..72bac88ee
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/state.go
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "sync"
+ "time"
+
+ corev1 "k8s.io/api/core/v1"
+)
+
+func newKubeState() *kubeState {
+ return &kubeState{
+ Mutex: &sync.Mutex{},
+ nodes: make(map[string]*nodeState),
+ pods: make(map[string]*podState),
+ }
+}
+
+func newNodeState() *nodeState {
+ return &nodeState{
+ new: true,
+ labels: make(map[string]string),
+ conditions: make(map[string]*nodeStateCondition),
+ }
+}
+
+func newPodState() *podState {
+ return &podState{
+ new: true,
+ labels: make(map[string]string),
+ initContainers: make(map[string]*containerState),
+ containers: make(map[string]*containerState),
+ }
+}
+
+func newContainerState() *containerState {
+ return &containerState{
+ new: true,
+ stateWaitingReasons: make(map[string]*containerStateReason),
+ stateTerminatedReasons: make(map[string]*containerStateReason),
+ }
+}
+
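+// kubeState is the shared snapshot of discovered Nodes and Pods;
+// the embedded mutex guards concurrent access to the maps.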
+type kubeState struct {
+ *sync.Mutex
+ nodes map[string]*nodeState
+ pods map[string]*podState
+}
+
+type (
+ nodeState struct {
+ new bool
+ deleted bool
+
+ name string
+ unSchedulable bool
+ labels map[string]string
+ creationTime time.Time
+ allocatableCPU int64
+ allocatableMem int64
+ allocatablePods int64
+ conditions map[string]*nodeStateCondition
+
+ stats nodeStateStats
+ }
+ nodeStateCondition struct {
+ new bool
+ // https://kubernetes.io/docs/concepts/architecture/nodes/#condition
+ //typ corev1.NodeConditionType
+ status corev1.ConditionStatus
+ }
+ nodeStateStats struct {
+ reqCPU int64
+ limitCPU int64
+ reqMem int64
+ limitMem int64
+ pods int64
+
+ podsCondPodReady int64
+ podsCondPodScheduled int64
+ podsCondPodInitialized int64
+ podsCondContainersReady int64
+
+ podsReadinessReady int64
+ podsReadinessUnready int64
+
+ podsPhaseRunning int64
+ podsPhaseFailed int64
+ podsPhaseSucceeded int64
+ podsPhasePending int64
+
+ containers int64
+ initContainers int64
+ initContStateRunning int64
+ initContStateWaiting int64
+ initContStateTerminated int64
+ contStateRunning int64
+ contStateWaiting int64
+ contStateTerminated int64
+ }
+)
+
+func (ns *nodeState) id() string { return ns.name }
+func (ns *nodeState) resetStats() { ns.stats = nodeStateStats{} }
+
+type (
+ podState struct {
+ new bool
+ deleted bool
+ unscheduled bool
+
+ name string
+ nodeName string
+ namespace string
+ uid string
+ labels map[string]string
+ controllerKind string
+ controllerName string
+ qosClass string
+ creationTime time.Time
+ reqCPU int64
+ reqMem int64
+ limitCPU int64
+ limitMem int64
+ // https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-conditions
+ condPodScheduled corev1.ConditionStatus
+ condContainersReady corev1.ConditionStatus
+ condPodInitialized corev1.ConditionStatus
+ condPodReady corev1.ConditionStatus
+ // https://kubernetes.io/docs/concepts/workloads/pods/pod-lifecycle/#pod-phase
+ phase corev1.PodPhase
+
+ initContainers map[string]*containerState
+ containers map[string]*containerState
+ }
+)
+
+func (ps podState) id() string { return ps.namespace + "_" + ps.name }
+
+type (
+ containerState struct {
+ new bool
+
+ name string
+ uid string
+
+ podName string
+ nodeName string
+ namespace string
+
+ ready bool
+ restarts int64
+ stateRunning bool
+ stateWaiting bool
+ stateTerminated bool
+ stateWaitingReasons map[string]*containerStateReason
+ stateTerminatedReasons map[string]*containerStateReason
+ }
+ containerStateReason struct {
+ new bool
+ reason string
+ active bool
+ }
+)
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/testdata/config.json b/src/go/collectors/go.d.plugin/modules/k8s_state/testdata/config.json
new file mode 100644
index 000000000..0e3f7c403
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/testdata/config.json
@@ -0,0 +1,3 @@
+{
+ "update_every": 123
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/testdata/config.yaml b/src/go/collectors/go.d.plugin/modules/k8s_state/testdata/config.yaml
new file mode 100644
index 000000000..f21a3a7a0
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/testdata/config.yaml
@@ -0,0 +1 @@
+update_every: 123
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/update_node_state.go b/src/go/collectors/go.d.plugin/modules/k8s_state/update_node_state.go
new file mode 100644
index 000000000..80f5c26c8
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/update_node_state.go
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+func (ks *KubeState) updateNodeState(r resource) {
+ if r.value() == nil {
+ if ns, ok := ks.state.nodes[r.source()]; ok {
+ ns.deleted = true
+ }
+ return
+ }
+
+ node, err := toNode(r)
+ if err != nil {
+ ks.Warning(err)
+ return
+ }
+
+ if myNodeName != "" && node.Name != myNodeName {
+ return
+ }
+
+ ns, ok := ks.state.nodes[r.source()]
+ if !ok {
+ ns = newNodeState()
+ ks.state.nodes[r.source()] = ns
+ }
+
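+ // Static node properties (name, creation time, allocatable resources, labels) are recorded only
+ // the first time the node is seen; schedulability and conditions are refreshed on every update.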
+ if !ok {
+ ns.name = node.Name
+ ns.creationTime = node.CreationTimestamp.Time
+ ns.allocatableCPU = int64(node.Status.Allocatable.Cpu().AsApproximateFloat64() * 1000)
+ ns.allocatableMem = node.Status.Allocatable.Memory().Value()
+ ns.allocatablePods = node.Status.Allocatable.Pods().Value()
+ copyLabels(ns.labels, node.Labels)
+ }
+
+ ns.unSchedulable = node.Spec.Unschedulable
+
+ for _, c := range node.Status.Conditions {
+ if v, ok := ns.conditions[string(c.Type)]; !ok {
+ ns.conditions[string(c.Type)] = &nodeStateCondition{new: true, status: c.Status}
+ } else {
+ v.status = c.Status
+ }
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/update_pod_state.go b/src/go/collectors/go.d.plugin/modules/k8s_state/update_pod_state.go
new file mode 100644
index 000000000..22ef0f7fc
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/update_pod_state.go
@@ -0,0 +1,165 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
+import (
+ "strings"
+
+ corev1 "k8s.io/api/core/v1"
+)
+
+func (ks *KubeState) updatePodState(r resource) {
+ if r.value() == nil {
+ if ps, ok := ks.state.pods[r.source()]; ok {
+ ps.deleted = true
+ }
+ return
+ }
+
+ pod, err := toPod(r)
+ if err != nil {
+ ks.Warning(err)
+ return
+ }
+
+ ps, ok := ks.state.pods[r.source()]
+ if !ok {
+ ps = newPodState()
+ ks.state.pods[r.source()] = ps
+ }
+
+ if !ok {
+ ps.name = pod.Name
+ ps.nodeName = pod.Spec.NodeName
+ ps.namespace = pod.Namespace
+ ps.creationTime = pod.CreationTimestamp.Time
+ ps.uid = string(pod.UID)
+ ps.qosClass = strings.ToLower(string(pod.Status.QOSClass))
+ copyLabels(ps.labels, pod.Labels)
+ for _, ref := range pod.OwnerReferences {
+ if ref.Controller != nil && *ref.Controller {
+ ps.controllerKind = ref.Kind
+ ps.controllerName = ref.Name
+ }
+ }
+ var res struct{ rCPU, lCPU, rMem, lMem, irCPU, ilCPU, irMem, ilMem int64 }
+ for _, cntr := range pod.Spec.Containers {
+ res.rCPU += int64(cntr.Resources.Requests.Cpu().AsApproximateFloat64() * 1000)
+ res.lCPU += int64(cntr.Resources.Limits.Cpu().AsApproximateFloat64() * 1000)
+ res.rMem += cntr.Resources.Requests.Memory().Value()
+ res.lMem += cntr.Resources.Limits.Memory().Value()
+ }
+ for _, cntr := range pod.Spec.InitContainers {
+ res.irCPU += int64(cntr.Resources.Requests.Cpu().AsApproximateFloat64() * 1000)
+ res.ilCPU += int64(cntr.Resources.Limits.Cpu().AsApproximateFloat64() * 1000)
+ res.irMem += cntr.Resources.Requests.Memory().Value()
+ res.ilMem += cntr.Resources.Limits.Memory().Value()
+ }
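+ // The pod's effective requests/limits are taken as the larger of the app containers'
+ // totals and the init containers' totals.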
+ ps.reqCPU = max(res.rCPU, res.irCPU)
+ ps.limitCPU = max(res.lCPU, res.ilCPU)
+ ps.reqMem = max(res.rMem, res.irMem)
+ ps.limitMem = max(res.lMem, res.ilMem)
+ }
+ if ps.nodeName == "" {
+ ps.nodeName = pod.Spec.NodeName
+ }
+
+ for _, c := range pod.Status.Conditions {
+ switch c.Type {
+ case corev1.ContainersReady:
+ ps.condContainersReady = c.Status
+ case corev1.PodInitialized:
+ ps.condPodInitialized = c.Status
+ case corev1.PodReady:
+ ps.condPodReady = c.Status
+ case corev1.PodScheduled:
+ ps.condPodScheduled = c.Status
+ }
+ }
+
+ ps.phase = pod.Status.Phase
+
+ for _, cs := range ps.containers {
+ for _, r := range cs.stateWaitingReasons {
+ r.active = false
+ }
+ for _, r := range cs.stateTerminatedReasons {
+ r.active = false
+ }
+ }
+
+ for _, cntr := range pod.Status.ContainerStatuses {
+ cs, ok := ps.containers[cntr.Name]
+ if !ok {
+ cs = newContainerState()
+ ps.containers[cntr.Name] = cs
+ }
+ if !ok {
+ cs.name = cntr.Name
+ cs.podName = pod.Name
+ cs.namespace = pod.Namespace
+ cs.nodeName = pod.Spec.NodeName
+ cs.uid = extractContainerID(cntr.ContainerID)
+ }
+ cs.ready = cntr.Ready
+ cs.restarts = int64(cntr.RestartCount)
+ cs.stateRunning = cntr.State.Running != nil
+ cs.stateWaiting = cntr.State.Waiting != nil
+ cs.stateTerminated = cntr.State.Terminated != nil
+
+ if cntr.State.Waiting != nil {
+ reason := cntr.State.Waiting.Reason
+ r, ok := cs.stateWaitingReasons[reason]
+ if !ok {
+ r = &containerStateReason{new: true, reason: reason}
+ cs.stateWaitingReasons[reason] = r
+ }
+ r.active = true
+ }
+
+ if cntr.State.Terminated != nil {
+ reason := cntr.State.Terminated.Reason
+ r, ok := cs.stateTerminatedReasons[reason]
+ if !ok {
+ r = &containerStateReason{new: true, reason: reason}
+ cs.stateTerminatedReasons[reason] = r
+ }
+ r.active = true
+ }
+ }
+
+ for _, cntr := range pod.Status.InitContainerStatuses {
+ cs, ok := ps.initContainers[cntr.Name]
+ if !ok {
+ cs = newContainerState()
+ ps.initContainers[cntr.Name] = cs
+ }
+ if !ok {
+ cs.name = cntr.Name
+ cs.podName = pod.Name
+ cs.namespace = pod.Namespace
+ cs.nodeName = pod.Spec.NodeName
+ cs.uid = extractContainerID(cntr.ContainerID)
+ }
+ cs.ready = cntr.Ready
+ cs.restarts = int64(cntr.RestartCount)
+ cs.stateRunning = cntr.State.Running != nil
+ cs.stateWaiting = cntr.State.Waiting != nil
+ cs.stateTerminated = cntr.State.Terminated != nil
+ }
+}
+
+func max(a, b int64) int64 {
+ if a < b {
+ return b
+ }
+ return a
+}
+
+func extractContainerID(id string) string {
+ // docker://d98...
+ if i := strings.LastIndexByte(id, '/'); i != -1 {
+ id = id[i+1:]
+ }
+ return id
+}
diff --git a/src/go/collectors/go.d.plugin/modules/k8s_state/update_state.go b/src/go/collectors/go.d.plugin/modules/k8s_state/update_state.go
new file mode 100644
index 000000000..88f3272c1
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/k8s_state/update_state.go
@@ -0,0 +1,27 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package k8s_state
+
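+// runUpdateState consumes resources emitted by the discoverers and applies them to the
+// shared state under the state lock until the collector context is cancelled.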
+func (ks *KubeState) runUpdateState(in <-chan resource) {
+ for {
+ select {
+ case <-ks.ctx.Done():
+ return
+ case r := <-in:
+ ks.state.Lock()
+ switch r.kind() {
+ case kubeResourceNode:
+ ks.updateNodeState(r)
+ case kubeResourcePod:
+ ks.updatePodState(r)
+ }
+ ks.state.Unlock()
+ }
+ }
+}
+
+func copyLabels(dst, src map[string]string) {
+ for k, v := range src {
+ dst[k] = v
+ }
+}