summary refs log tree commit diff stats
path: root/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go
diff options
context:
space:
mode:
author Daniel Baumann <daniel.baumann@progress-linux.org> 2024-08-26 08:15:20 +0000
committer Daniel Baumann <daniel.baumann@progress-linux.org> 2024-08-26 08:15:20 +0000
commit 87d772a7d708fec12f48cd8adc0dedff6e1025da (patch)
tree 1fee344c64cc3f43074a01981e21126c8482a522 /src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go
parent Adding upstream version 1.46.3. (diff)
download netdata-upstream.tar.xz
netdata-upstream.zip
Adding upstream version 1.47.0. upstream/1.47.0 upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- src/go/plugin/go.d/modules/nvidia_smi/collect.go (renamed from src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go) 177
1 files changed, 58 insertions, 119 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go b/src/go/plugin/go.d/modules/nvidia_smi/collect.go
index 2ab3180a8..f621d191b 100644
--- a/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go
+++ b/src/go/plugin/go.d/modules/nvidia_smi/collect.go
@@ -4,18 +4,33 @@ package nvidia_smi
import (
"encoding/xml"
+ "errors"
"fmt"
"strconv"
"strings"
)
-func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
- bs, err := nv.exec.queryGPUInfoXML()
+func (nv *NvidiaSmi) collect() (map[string]int64, error) {
+ if nv.exec == nil {
+ return nil, errors.New("nvidia-smi exec is not initialized")
+ }
+
+ mx := make(map[string]int64)
+
+ if err := nv.collectGPUInfo(mx); err != nil {
+ return nil, err
+ }
+
+ return mx, nil
+}
+
+func (nv *NvidiaSmi) collectGPUInfo(mx map[string]int64) error {
+ bs, err := nv.exec.queryGPUInfo()
if err != nil {
return fmt.Errorf("error on quering XML GPU info: %v", err)
}
- info := &xmlInfo{}
+ info := &gpusInfo{}
if err := xml.Unmarshal(bs, info); err != nil {
return fmt.Errorf("error on unmarshaling XML GPU info response: %v", err)
}
@@ -39,11 +54,11 @@ func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes
addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes
- if max := calcMaxPCIEBandwidth(gpu); max > 0 {
+ if maxBw := calcMaxPCIEBandwidth(gpu); maxBw > 0 {
rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes
tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes
- mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / max) * 100)
- mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / max) * 100)
+ mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / maxBw) * 100)
+ mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / maxBw) * 100)
}
addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0)
addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0)
@@ -88,7 +103,7 @@ func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
if !nv.migs[px] {
nv.migs[px] = true
- nv.addMIGDeviceXMLCharts(gpu, mig)
+ nv.addMIGDeviceCharts(gpu, mig)
}
addMetric(mx, px+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0)
@@ -117,7 +132,7 @@ func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
return nil
}
-func calcMaxPCIEBandwidth(gpu xmlGPUInfo) float64 {
+func calcMaxPCIEBandwidth(gpu gpuInfo) float64 {
gen := gpu.PCI.PCIGPULinkInfo.PCIEGen.MaxLinkGen
width := strings.TrimSuffix(gpu.PCI.PCIGPULinkInfo.LinkWidths.MaxLinkWidth, "x")
@@ -146,120 +161,44 @@ func calcMaxPCIEBandwidth(gpu xmlGPUInfo) float64 {
return (speed*parseFloat(width)*(1-enc) - 1) * 1e9 / 8 // Gb/s => bytes
}
-type (
- xmlInfo struct {
- GPUs []xmlGPUInfo `xml:"gpu"`
+func addMetric(mx map[string]int64, key, value string, mul int) {
+ if !isValidValue(value) {
+ return
}
- xmlGPUInfo struct {
- ID string `xml:"id,attr"`
- ProductName string `xml:"product_name"`
- ProductBrand string `xml:"product_brand"`
- ProductArchitecture string `xml:"product_architecture"`
- UUID string `xml:"uuid"`
- FanSpeed string `xml:"fan_speed"`
- PerformanceState string `xml:"performance_state"`
- MIGMode struct {
- CurrentMIG string `xml:"current_mig"`
- } `xml:"mig_mode"`
- MIGDevices struct {
- MIGDevice []xmlMIGDeviceInfo `xml:"mig_device"`
- } `xml:"mig_devices"`
- PCI struct {
- TxUtil string `xml:"tx_util"`
- RxUtil string `xml:"rx_util"`
- PCIGPULinkInfo struct {
- PCIEGen struct {
- MaxLinkGen string `xml:"max_link_gen"`
- } `xml:"pcie_gen"`
- LinkWidths struct {
- MaxLinkWidth string `xml:"max_link_width"`
- } `xml:"link_widths"`
- } `xml:"pci_gpu_link_info"`
- } `xml:"pci"`
- Utilization struct {
- GpuUtil string `xml:"gpu_util"`
- MemoryUtil string `xml:"memory_util"`
- EncoderUtil string `xml:"encoder_util"`
- DecoderUtil string `xml:"decoder_util"`
- } `xml:"utilization"`
- FBMemoryUsage struct {
- Total string `xml:"total"`
- Reserved string `xml:"reserved"`
- Used string `xml:"used"`
- Free string `xml:"free"`
- } `xml:"fb_memory_usage"`
- Bar1MemoryUsage struct {
- Total string `xml:"total"`
- Used string `xml:"used"`
- Free string `xml:"free"`
- } `xml:"bar1_memory_usage"`
- Temperature struct {
- GpuTemp string `xml:"gpu_temp"`
- GpuTempMaxThreshold string `xml:"gpu_temp_max_threshold"`
- GpuTempSlowThreshold string `xml:"gpu_temp_slow_threshold"`
- GpuTempMaxGpuThreshold string `xml:"gpu_temp_max_gpu_threshold"`
- GpuTargetTemperature string `xml:"gpu_target_temperature"`
- MemoryTemp string `xml:"memory_temp"`
- GpuTempMaxMemThreshold string `xml:"gpu_temp_max_mem_threshold"`
- } `xml:"temperature"`
- Clocks struct {
- GraphicsClock string `xml:"graphics_clock"`
- SmClock string `xml:"sm_clock"`
- MemClock string `xml:"mem_clock"`
- VideoClock string `xml:"video_clock"`
- } `xml:"clocks"`
- PowerReadings *xmlPowerReadings `xml:"power_readings"`
- GPUPowerReadings *xmlPowerReadings `xml:"gpu_power_readings"`
- Voltage struct {
- GraphicsVolt string `xml:"graphics_volt"`
- } `xml:"voltage"`
- Processes struct {
- ProcessInfo []struct {
- PID string `xml:"pid"`
- ProcessName string `xml:"process_name"`
- UsedMemory string `xml:"used_memory"`
- } `sml:"process_info"`
- } `xml:"processes"`
+
+ value = removeUnits(value)
+
+ v, err := strconv.ParseFloat(value, 64)
+ if err != nil {
+ return
}
- xmlPowerReadings struct {
- //PowerState string `xml:"power_state"`
- //PowerManagement string `xml:"power_management"`
- PowerDraw string `xml:"power_draw"`
- //PowerLimit string `xml:"power_limit"`
- //DefaultPowerLimit string `xml:"default_power_limit"`
- //EnforcedPowerLimit string `xml:"enforced_power_limit"`
- //MinPowerLimit string `xml:"min_power_limit"`
- //MaxPowerLimit string `xml:"max_power_limit"`
+ if mul > 0 {
+ v *= float64(mul)
}
- xmlMIGDeviceInfo struct {
- Index string `xml:"index"`
- GPUInstanceID string `xml:"gpu_instance_id"`
- ComputeInstanceID string `xml:"compute_instance_id"`
- DeviceAttributes struct {
- Shared struct {
- MultiprocessorCount string `xml:"multiprocessor_count"`
- CopyEngineCount string `xml:"copy_engine_count"`
- EncoderCount string `xml:"encoder_count"`
- DecoderCount string `xml:"decoder_count"`
- OFACount string `xml:"ofa_count"`
- JPGCount string `xml:"jpg_count"`
- } `xml:"shared"`
- } `xml:"device_attributes"`
- ECCErrorCount struct {
- VolatileCount struct {
- SRAMUncorrectable string `xml:"sram_uncorrectable"`
- } `xml:"volatile_count"`
- } `xml:"ecc_error_count"`
- FBMemoryUsage struct {
- Free string `xml:"free"`
- Used string `xml:"used"`
- Reserved string `xml:"reserved"`
- } `xml:"fb_memory_usage"`
- BAR1MemoryUsage struct {
- Free string `xml:"free"`
- Used string `xml:"used"`
- } `xml:"bar1_memory_usage"`
+ mx[key] = int64(v)
+}
+
+func isValidValue(v string) bool {
+ return v != "" && v != "N/A" && v != "[N/A]"
+}
+
+func parseFloat(s string) float64 {
+ v, _ := strconv.ParseFloat(removeUnits(s), 64)
+ return v
+}
+
+func removeUnits(s string) string {
+ if i := strings.IndexByte(s, ' '); i != -1 {
+ s = s[:i]
}
-)
+ return s
+}
+
+func boolToInt(v bool) int64 {
+ if v {
+ return 1
+ }
+ return 0
+}