diff options
Diffstat (limited to 'src/go/plugin/go.d')
17 files changed, 99 insertions, 103 deletions
diff --git a/src/go/plugin/go.d/agent/discovery/sd/discoverer/netlisteners/netlisteners.go b/src/go/plugin/go.d/agent/discovery/sd/discoverer/netlisteners/netlisteners.go index 6f536c49..60dd92cb 100644 --- a/src/go/plugin/go.d/agent/discovery/sd/discoverer/netlisteners/netlisteners.go +++ b/src/go/plugin/go.d/agent/discovery/sd/discoverer/netlisteners/netlisteners.go @@ -84,6 +84,9 @@ type ( cache map[uint64]*cacheItem // [target.Hash] started chan struct{} + + successRuns int64 + timeoutRuns int64 } cacheItem struct { lastSeenTime time.Time @@ -118,7 +121,7 @@ func (d *Discoverer) Discover(ctx context.Context, in chan<- []model.TargetGroup return case <-tk.C: if err := d.discoverLocalListeners(ctx, in); err != nil { - d.Warning(err) + d.Error(err) return } } @@ -128,12 +131,20 @@ func (d *Discoverer) Discover(ctx context.Context, in chan<- []model.TargetGroup func (d *Discoverer) discoverLocalListeners(ctx context.Context, in chan<- []model.TargetGroup) error { bs, err := d.ll.discover(ctx) if err != nil { - if errors.Is(err, context.Canceled) { + if errors.Is(err, context.DeadlineExceeded) { + // there is no point in continuing pointless attempts/use cpu + // https://github.com/netdata/netdata/discussions/18751#discussioncomment-10908472 + if d.timeoutRuns++; d.timeoutRuns > 5 && d.successRuns == 0 { + return err + } + d.Warning(err) return nil } return err } + d.successRuns++ + tgts, err := d.parseLocalListeners(bs) if err != nil { return err diff --git a/src/go/plugin/go.d/modules/nvidia_smi/charts.go b/src/go/plugin/go.d/modules/nvidia_smi/charts.go index 746c8eed..d6b0af36 100644 --- a/src/go/plugin/go.d/modules/nvidia_smi/charts.go +++ b/src/go/plugin/go.d/modules/nvidia_smi/charts.go @@ -4,6 +4,7 @@ package nvidia_smi import ( "fmt" + "strconv" "strings" "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module" @@ -261,7 +262,7 @@ var ( } ) -func (nv *NvidiaSmi) addGPUXMLCharts(gpu gpuInfo) { +func (nv *NvidiaSmi) addGpuCharts(gpu gpuInfo, index int) { charts := gpuXMLCharts.Copy() if !isValidValue(gpu.Utilization.GpuUtil) { @@ -294,7 +295,7 @@ func (nv *NvidiaSmi) addGPUXMLCharts(gpu gpuInfo) { for _, c := range *charts { c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.UUID)) c.Labels = []module.Label{ - // csv output has no 'product_brand' + {Key: "index", Value: strconv.Itoa(index)}, {Key: "uuid", Value: gpu.UUID}, {Key: "product_name", Value: gpu.ProductName}, } diff --git a/src/go/plugin/go.d/modules/nvidia_smi/collect.go b/src/go/plugin/go.d/modules/nvidia_smi/collect.go index f621d191..3548f90f 100644 --- a/src/go/plugin/go.d/modules/nvidia_smi/collect.go +++ b/src/go/plugin/go.d/modules/nvidia_smi/collect.go @@ -38,7 +38,7 @@ func (nv *NvidiaSmi) collectGPUInfo(mx map[string]int64) error { seenGPU := make(map[string]bool) seenMIG := make(map[string]bool) - for _, gpu := range info.GPUs { + for i, gpu := range info.GPUs { if !isValidValue(gpu.UUID) { continue } @@ -49,7 +49,7 @@ func (nv *NvidiaSmi) collectGPUInfo(mx map[string]int64) error { if !nv.gpus[px] { nv.gpus[px] = true - nv.addGPUXMLCharts(gpu) + nv.addGpuCharts(gpu, i) } addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes diff --git a/src/go/plugin/go.d/modules/nvidia_smi/config_schema.json b/src/go/plugin/go.d/modules/nvidia_smi/config_schema.json index 3f93badc..46b48095 100644 --- a/src/go/plugin/go.d/modules/nvidia_smi/config_schema.json +++ b/src/go/plugin/go.d/modules/nvidia_smi/config_schema.json @@ -19,7 +19,7 @@ }, "timeout": { "title": "Timeout", - "description": "Timeout for executing the binary, specified in seconds.", + "description": "The maximum duration, in seconds, to wait for an `nvidia-smi` command to complete.", "type": "number", "minimum": 0.5, "default": 10 @@ -47,7 +47,7 @@ "ui:help": "If an absolute path is provided, the collector will use it directly; otherwise, it will search for the binary in directories specified in the PATH environment variable." }, "timeout": { - "ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)." + "ui:help": "This setting applies differently based on the collector's mode. **Loop Mode:** In loop mode, the timeout primarily determines how long to wait for the initial `nvidia-smi` execution. If the initial query takes longer than the timeout, the collector may report an error. For systems with multiple GPUs, the initial load time can sometimes be significant (e.g., 5-10 seconds). **Regular Mode:** If the collector is in regular mode, the timeout specifies how long to wait for each individual `nvidia-smi` execution." }, "loop_mode": { "ui:help": "In loop mode, `nvidia-smi` will repeatedly query GPU data at specified intervals, defined by the `-l SEC` or `--loop=SEC` parameter, rather than just running the query once. This enables ongoing performance tracking by putting the application to sleep between queries." diff --git a/src/go/plugin/go.d/modules/nvidia_smi/exec.go b/src/go/plugin/go.d/modules/nvidia_smi/exec.go index 11a26131..497db87a 100644 --- a/src/go/plugin/go.d/modules/nvidia_smi/exec.go +++ b/src/go/plugin/go.d/modules/nvidia_smi/exec.go @@ -34,7 +34,7 @@ func newNvidiaSmiBinary(path string, cfg Config, log *logger.Logger) (nvidiaSmiB Logger: log, binPath: path, updateEvery: cfg.UpdateEvery, - firstSampleTimeout: time.Second * 3, + firstSampleTimeout: cfg.Timeout.Duration(), } if err := smi.run(); err != nil { diff --git a/src/go/plugin/go.d/modules/nvidia_smi/metadata.yaml b/src/go/plugin/go.d/modules/nvidia_smi/metadata.yaml index 2a79b5ac..f44d5753 100644 --- a/src/go/plugin/go.d/modules/nvidia_smi/metadata.yaml +++ b/src/go/plugin/go.d/modules/nvidia_smi/metadata.yaml @@ -65,8 +65,8 @@ modules: default_value: nvidia_smi required: false - name: timeout - description: nvidia_smi binary execution timeout. - default_value: 2 + description: The maximum duration, in seconds, to wait for an `nvidia-smi` command to complete. This setting applies differently based on the collector's mode. **Loop Mode:** In loop mode, the timeout primarily determines how long to wait for the initial `nvidia-smi` execution. If the initial query takes longer than the timeout, the collector may report an error. For systems with multiple GPUs, the initial load time can sometimes be significant (e.g., 5-10 seconds). **Regular Mode:** If the collector is in regular mode, the timeout specifies how long to wait for each individual `nvidia-smi` execution. + default_value: 10 required: false - name: loop_mode description: "When enabled, `nvidia-smi` is executed continuously in a separate thread using the `-l` option." @@ -98,7 +98,9 @@ modules: description: These metrics refer to the GPU. labels: - name: uuid - description: GPU id (e.g. 00000000:00:04.0) + description: GPU uuid (e.g. GPU-27b94a00-ed54-5c24-b1fd-1054085de32a) + - name: index + description: GPU index (nvidia_smi typically orders GPUs by PCI bus ID) - name: product_name description: GPU product name (e.g. NVIDIA A100-SXM4-40GB) metrics: @@ -211,7 +213,7 @@ modules: description: These metrics refer to the Multi-Instance GPU (MIG). labels: - name: uuid - description: GPU id (e.g. 00000000:00:04.0) + description: GPU uuid (e.g. GPU-27b94a00-ed54-5c24-b1fd-1054085de32a) - name: product_name description: GPU product name (e.g. NVIDIA A100-SXM4-40GB) - name: gpu_instance_id diff --git a/src/go/plugin/go.d/modules/postgres/collect.go b/src/go/plugin/go.d/modules/postgres/collect.go index 6186932c..91f3b523 100644 --- a/src/go/plugin/go.d/modules/postgres/collect.go +++ b/src/go/plugin/go.d/modules/postgres/collect.go @@ -18,6 +18,7 @@ const ( pgVersion94 = 9_04_00 pgVersion10 = 10_00_00 pgVersion11 = 11_00_00 + pgVersion17 = 17_00_00 ) func (p *Postgres) collect() (map[string]int64, error) { diff --git a/src/go/plugin/go.d/modules/postgres/do_query_global.go b/src/go/plugin/go.d/modules/postgres/do_query_global.go index c70772a2..6f1ac1ed 100644 --- a/src/go/plugin/go.d/modules/postgres/do_query_global.go +++ b/src/go/plugin/go.d/modules/postgres/do_query_global.go @@ -97,7 +97,7 @@ func (p *Postgres) doQueryConnectionsState() error { } func (p *Postgres) doQueryCheckpoints() error { - q := queryCheckpoints() + q := queryCheckpoints(p.pgVersion) return p.doQuery(q, func(column, value string, _ bool) { switch column { diff --git a/src/go/plugin/go.d/modules/postgres/postgres_test.go b/src/go/plugin/go.d/modules/postgres/postgres_test.go index 7e91b288..95652458 100644 --- a/src/go/plugin/go.d/modules/postgres/postgres_test.go +++ b/src/go/plugin/go.d/modules/postgres/postgres_test.go @@ -155,7 +155,7 @@ func TestPostgres_Check(t *testing.T) { mockExpect(t, m, queryServerCurrentConnectionsUsed(), dataVer140004ServerCurrentConnections) mockExpect(t, m, queryServerConnectionsState(), dataVer140004ServerConnectionsState) - mockExpect(t, m, queryCheckpoints(), dataVer140004Checkpoints) + mockExpect(t, m, queryCheckpoints(140004), dataVer140004Checkpoints) mockExpect(t, m, queryServerUptime(), dataVer140004ServerUptime) mockExpect(t, m, queryTXIDWraparound(), dataVer140004TXIDWraparound) mockExpect(t, m, queryWALWrites(140004), dataVer140004WALWrites) @@ -258,7 +258,7 @@ func TestPostgres_Collect(t *testing.T) { mockExpect(t, m, queryServerCurrentConnectionsUsed(), dataVer140004ServerCurrentConnections) mockExpect(t, m, queryServerConnectionsState(), dataVer140004ServerConnectionsState) - mockExpect(t, m, queryCheckpoints(), dataVer140004Checkpoints) + mockExpect(t, m, queryCheckpoints(140004), dataVer140004Checkpoints) mockExpect(t, m, queryServerUptime(), dataVer140004ServerUptime) mockExpect(t, m, queryTXIDWraparound(), dataVer140004TXIDWraparound) mockExpect(t, m, queryWALWrites(140004), dataVer140004WALWrites) diff --git a/src/go/plugin/go.d/modules/postgres/queries.go b/src/go/plugin/go.d/modules/postgres/queries.go index f6afc934..4f6262bb 100644 --- a/src/go/plugin/go.d/modules/postgres/queries.go +++ b/src/go/plugin/go.d/modules/postgres/queries.go @@ -51,12 +51,14 @@ GROUP BY state; ` } -func queryCheckpoints() string { +func queryCheckpoints(version int) string { // definition by version: https://pgpedia.info/p/pg_stat_bgwriter.html // docs: https://www.postgresql.org/docs/current/monitoring-stats.html#MONITORING-PG-STAT-BGWRITER-VIEW // code: https://github.com/postgres/postgres/blob/366283961ac0ed6d89014444c6090f3fd02fce0a/src/backend/catalog/system_views.sql#L1104 - return ` + if version < pgVersion17 { + + return ` SELECT checkpoints_timed, checkpoints_req, checkpoint_write_time, @@ -69,6 +71,21 @@ SELECT checkpoints_timed, buffers_alloc * current_setting('block_size')::numeric AS buffers_alloc_bytes FROM pg_stat_bgwriter; ` + } + return ` +SELECT + chkpt.num_timed AS checkpoints_timed, + chkpt.num_requested AS checkpoints_req, + chkpt.write_time AS checkpoint_write_time, + chkpt.sync_time AS checkpoint_sync_time, + chkpt.buffers_written * current_setting('block_size')::numeric AS buffers_checkpoint_bytes, + bgwrtr.buffers_clean * current_setting('block_size')::numeric AS buffers_clean_bytes, + bgwrtr.maxwritten_clean, + bgwrtr.buffers_alloc * current_setting('block_size')::numeric AS buffers_alloc_bytes +FROM + pg_stat_bgwriter AS bgwrtr, + pg_stat_checkpointer AS chkpt; +` } func queryServerUptime() string { diff --git a/src/go/plugin/go.d/modules/smartctl/collect.go b/src/go/plugin/go.d/modules/smartctl/collect.go index 35585db6..b76d0998 100644 --- a/src/go/plugin/go.d/modules/smartctl/collect.go +++ b/src/go/plugin/go.d/modules/smartctl/collect.go @@ -181,7 +181,7 @@ func isSmartAttrValid(a *smartAttribute) bool { } func isDeviceInLowerPowerMode(r *gjson.Result) bool { - if !isExitStatusHasBit(r, 1) { + if !isExitStatusHasAnyBit(r, 1) { return false } @@ -194,7 +194,7 @@ func isDeviceInLowerPowerMode(r *gjson.Result) bool { } func isDeviceOpenFailedNoSuchDevice(r *gjson.Result) bool { - if !isExitStatusHasBit(r, 1) { + if !isExitStatusHasAnyBit(r, 1) { return false } @@ -206,9 +206,16 @@ func isDeviceOpenFailedNoSuchDevice(r *gjson.Result) bool { }) } -func isExitStatusHasBit(r *gjson.Result, bit int) bool { +func isExitStatusHasAnyBit(r *gjson.Result, bit int, bits ...int) bool { // https://manpages.debian.org/bullseye/smartmontools/smartctl.8.en.html#EXIT_STATUS status := int(r.Get("smartctl.exit_status").Int()) - mask := 1 << bit - return (status & mask) != 0 + + for _, b := range append([]int{bit}, bits...) { + mask := 1 << b + if (status & mask) != 0 { + return true + } + } + + return false } diff --git a/src/go/plugin/go.d/modules/smartctl/scan.go b/src/go/plugin/go.d/modules/smartctl/scan.go index 5564897a..d904ca28 100644 --- a/src/go/plugin/go.d/modules/smartctl/scan.go +++ b/src/go/plugin/go.d/modules/smartctl/scan.go @@ -100,12 +100,12 @@ func (s *Smartctl) handleGuessedScsiScannedDevice(dev *scanDevice) { } resp, _ := s.exec.deviceInfo(dev.name, "sat", s.NoCheckPowerMode) - if resp == nil || resp.Get("smartctl.exit_status").Int() != 0 { + if resp == nil || isExitStatusHasAnyBit(resp, 0, 1, 2) { return } - atts, ok := newSmartDevice(resp).ataSmartAttributeTable() - if !ok || len(atts) == 0 { + attrs, ok := newSmartDevice(resp).ataSmartAttributeTable() + if !ok || len(attrs) == 0 { return } diff --git a/src/go/plugin/go.d/modules/uwsgi/client.go b/src/go/plugin/go.d/modules/uwsgi/client.go index 40368074..487aeb93 100644 --- a/src/go/plugin/go.d/modules/uwsgi/client.go +++ b/src/go/plugin/go.d/modules/uwsgi/client.go @@ -5,35 +5,25 @@ package uwsgi import ( "bytes" "fmt" + "time" "github.com/netdata/netdata/go/plugins/plugin/go.d/pkg/socket" ) type uwsgiConn interface { - connect() error - disconnect() queryStats() ([]byte, error) } func newUwsgiConn(conf Config) uwsgiConn { - return &uwsgiClient{conn: socket.New(socket.Config{ - Address: conf.Address, - ConnectTimeout: conf.Timeout.Duration(), - ReadTimeout: conf.Timeout.Duration(), - WriteTimeout: conf.Timeout.Duration(), - })} + return &uwsgiClient{ + address: conf.Address, + timeout: conf.Timeout.Duration(), + } } type uwsgiClient struct { - conn socket.Client -} - -func (c *uwsgiClient) connect() error { - return c.conn.Connect() -} - -func (c *uwsgiClient) disconnect() { - _ = c.conn.Disconnect() + address string + timeout time.Duration } func (c *uwsgiClient) queryStats() ([]byte, error) { @@ -42,7 +32,14 @@ func (c *uwsgiClient) queryStats() ([]byte, error) { var err error const readLineLimit = 1000 * 10 - clientErr := c.conn.Command("", func(bs []byte) bool { + cfg := socket.Config{ + Address: c.address, + ConnectTimeout: c.timeout, + ReadTimeout: c.timeout, + WriteTimeout: c.timeout, + } + + clientErr := socket.ConnectAndRead(cfg, func(bs []byte) bool { b.Write(bs) b.WriteByte('\n') diff --git a/src/go/plugin/go.d/modules/uwsgi/collect.go b/src/go/plugin/go.d/modules/uwsgi/collect.go index 3f440535..a89704c8 100644 --- a/src/go/plugin/go.d/modules/uwsgi/collect.go +++ b/src/go/plugin/go.d/modules/uwsgi/collect.go @@ -27,14 +27,7 @@ type workerStats struct { } func (u *Uwsgi) collect() (map[string]int64, error) { - conn, err := u.establishConn() - if err != nil { - return nil, fmt.Errorf("failed to connect: %v", err) - } - - defer conn.disconnect() - - stats, err := conn.queryStats() + stats, err := u.conn.queryStats() if err != nil { return nil, fmt.Errorf("failed to query stats: %v", err) } @@ -110,16 +103,6 @@ func (u *Uwsgi) collectStats(mx map[string]int64, stats []byte) error { return nil } -func (u *Uwsgi) establishConn() (uwsgiConn, error) { - conn := u.newConn(u.Config) - - if err := conn.connect(); err != nil { - return nil, err - } - - return conn, nil -} - func boolToInt(b bool) int64 { if b { return 1 diff --git a/src/go/plugin/go.d/modules/uwsgi/uwsgi.go b/src/go/plugin/go.d/modules/uwsgi/uwsgi.go index 7fe98503..c70dd656 100644 --- a/src/go/plugin/go.d/modules/uwsgi/uwsgi.go +++ b/src/go/plugin/go.d/modules/uwsgi/uwsgi.go @@ -28,7 +28,6 @@ func New() *Uwsgi { Address: "127.0.0.1:1717", Timeout: web.Duration(time.Second * 1), }, - newConn: newUwsgiConn, charts: charts.Copy(), seenWorkers: make(map[int]bool), } @@ -46,7 +45,7 @@ type Uwsgi struct { charts *module.Charts - newConn func(Config) uwsgiConn + conn uwsgiConn seenWorkers map[int]bool } @@ -61,6 +60,8 @@ func (u *Uwsgi) Init() error { return errors.New("address not set") } + u.conn = newUwsgiConn(u.Config) + return nil } diff --git a/src/go/plugin/go.d/modules/uwsgi/uwsgi_test.go b/src/go/plugin/go.d/modules/uwsgi/uwsgi_test.go index 900c4853..ce3f2125 100644 --- a/src/go/plugin/go.d/modules/uwsgi/uwsgi_test.go +++ b/src/go/plugin/go.d/modules/uwsgi/uwsgi_test.go @@ -81,7 +81,7 @@ func TestUwsgi_Cleanup(t *testing.T) { "after check": { prepare: func() *Uwsgi { uw := New() - uw.newConn = func(config Config) uwsgiConn { return prepareMockOk() } + uw.conn = prepareMockOk() _ = uw.Check() return uw }, @@ -89,7 +89,7 @@ func TestUwsgi_Cleanup(t *testing.T) { "after collect": { prepare: func() *Uwsgi { uw := New() - uw.newConn = func(config Config) uwsgiConn { return prepareMockOk() } + uw.conn = prepareMockOk() _ = uw.Collect() return uw }, @@ -122,10 +122,6 @@ func TestUwsgi_Check(t *testing.T) { wantFail: false, prepareMock: prepareMockOkNoWorkers, }, - "err on connect": { - wantFail: true, - prepareMock: prepareMockErrOnConnect, - }, "unexpected response": { wantFail: true, prepareMock: prepareMockUnexpectedResponse, @@ -140,7 +136,7 @@ func TestUwsgi_Check(t *testing.T) { t.Run(name, func(t *testing.T) { uw := New() mock := test.prepareMock() - uw.newConn = func(config Config) uwsgiConn { return mock } + uw.conn = mock if test.wantFail { assert.Error(t, uw.Check()) @@ -229,12 +225,6 @@ func TestUwsgi_Collect(t *testing.T) { disconnectBeforeCleanup: true, disconnectAfterCleanup: true, }, - "err on connect": { - prepareMock: prepareMockErrOnConnect, - wantCharts: len(charts), - disconnectBeforeCleanup: false, - disconnectAfterCleanup: false, - }, "err on query stats": { prepareMock: prepareMockErrOnQueryStats, wantCharts: len(charts), @@ -247,7 +237,7 @@ func TestUwsgi_Collect(t *testing.T) { t.Run(name, func(t *testing.T) { uw := New() mock := test.prepareMock() - uw.newConn = func(config Config) uwsgiConn { return mock } + uw.conn = mock mx := uw.Collect() @@ -257,10 +247,6 @@ func TestUwsgi_Collect(t *testing.T) { module.TestMetricsHasAllChartsDims(t, uw.Charts(), mx) } assert.Equal(t, test.wantCharts, len(*uw.Charts()), "want charts") - - assert.Equal(t, test.disconnectBeforeCleanup, mock.disconnectCalled, "disconnect before cleanup") - uw.Cleanup() - assert.Equal(t, test.disconnectAfterCleanup, mock.disconnectCalled, "disconnect after cleanup") }) } } @@ -277,12 +263,6 @@ func prepareMockOkNoWorkers() *mockUwsgiConn { } } -func prepareMockErrOnConnect() *mockUwsgiConn { - return &mockUwsgiConn{ - errOnConnect: true, - } -} - func prepareMockErrOnQueryStats() *mockUwsgiConn { return &mockUwsgiConn{ errOnQueryStats: true, @@ -300,21 +280,8 @@ func prepareMockEmptyResponse() *mockUwsgiConn { } type mockUwsgiConn struct { - errOnConnect bool - errOnQueryStats bool - statsResponse []byte - disconnectCalled bool -} - -func (m *mockUwsgiConn) connect() error { - if m.errOnConnect { - return errors.New("mock.connect() error") - } - return nil -} - -func (m *mockUwsgiConn) disconnect() { - m.disconnectCalled = true + errOnQueryStats bool + statsResponse []byte } func (m *mockUwsgiConn) queryStats() ([]byte, error) { diff --git a/src/go/plugin/go.d/pkg/socket/client.go b/src/go/plugin/go.d/pkg/socket/client.go index 26ae1dfa..c2bcbd9e 100644 --- a/src/go/plugin/go.d/pkg/socket/client.go +++ b/src/go/plugin/go.d/pkg/socket/client.go @@ -21,6 +21,15 @@ func New(config Config) *Socket { } } +func ConnectAndRead(config Config, process Processor) error { + s := New(config) + if err := s.Connect(); err != nil { + return err + } + defer func() { _ = s.Disconnect() }() + return read(s.conn, process, s.ReadTimeout) +} + // Socket is the implementation of a socket client. type Socket struct { Config |