summaryrefslogtreecommitdiffstats
path: root/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go265
1 files changed, 265 insertions, 0 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go b/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go
new file mode 100644
index 000000000..2ab3180a8
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go
@@ -0,0 +1,265 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package nvidia_smi
+
+import (
+ "encoding/xml"
+ "fmt"
+ "strconv"
+ "strings"
+)
+
+func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error {
+ bs, err := nv.exec.queryGPUInfoXML()
+ if err != nil {
+ return fmt.Errorf("error on quering XML GPU info: %v", err)
+ }
+
+ info := &xmlInfo{}
+ if err := xml.Unmarshal(bs, info); err != nil {
+ return fmt.Errorf("error on unmarshaling XML GPU info response: %v", err)
+ }
+
+ seenGPU := make(map[string]bool)
+ seenMIG := make(map[string]bool)
+
+ for _, gpu := range info.GPUs {
+ if !isValidValue(gpu.UUID) {
+ continue
+ }
+
+ px := "gpu_" + gpu.UUID + "_"
+
+ seenGPU[px] = true
+
+ if !nv.gpus[px] {
+ nv.gpus[px] = true
+ nv.addGPUXMLCharts(gpu)
+ }
+
+ addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes
+ addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes
+ if max := calcMaxPCIEBandwidth(gpu); max > 0 {
+ rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes
+ tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes
+ mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / max) * 100)
+ mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / max) * 100)
+ }
+ addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0)
+ addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0)
+ addMetric(mx, px+"mem_utilization", gpu.Utilization.MemoryUtil, 0)
+ addMetric(mx, px+"decoder_utilization", gpu.Utilization.DecoderUtil, 0)
+ addMetric(mx, px+"encoder_utilization", gpu.Utilization.EncoderUtil, 0)
+ addMetric(mx, px+"frame_buffer_memory_usage_free", gpu.FBMemoryUsage.Free, 1024*1024) // MiB => bytes
+ addMetric(mx, px+"frame_buffer_memory_usage_used", gpu.FBMemoryUsage.Used, 1024*1024) // MiB => bytes
+ addMetric(mx, px+"frame_buffer_memory_usage_reserved", gpu.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
+ addMetric(mx, px+"bar1_memory_usage_free", gpu.Bar1MemoryUsage.Free, 1024*1024) // MiB => bytes
+ addMetric(mx, px+"bar1_memory_usage_used", gpu.Bar1MemoryUsage.Used, 1024*1024) // MiB => bytes
+ addMetric(mx, px+"temperature", gpu.Temperature.GpuTemp, 0)
+ addMetric(mx, px+"graphics_clock", gpu.Clocks.GraphicsClock, 0)
+ addMetric(mx, px+"video_clock", gpu.Clocks.VideoClock, 0)
+ addMetric(mx, px+"sm_clock", gpu.Clocks.SmClock, 0)
+ addMetric(mx, px+"mem_clock", gpu.Clocks.MemClock, 0)
+ if gpu.PowerReadings != nil {
+ addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0)
+ } else if gpu.GPUPowerReadings != nil {
+ addMetric(mx, px+"power_draw", gpu.GPUPowerReadings.PowerDraw, 0)
+ }
+ addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0)
+ for i := 0; i < 16; i++ {
+ s := "P" + strconv.Itoa(i)
+ mx[px+"performance_state_"+s] = boolToInt(gpu.PerformanceState == s)
+ }
+ if isValidValue(gpu.MIGMode.CurrentMIG) {
+ mode := strings.ToLower(gpu.MIGMode.CurrentMIG)
+ mx[px+"mig_current_mode_enabled"] = boolToInt(mode == "enabled")
+ mx[px+"mig_current_mode_disabled"] = boolToInt(mode == "disabled")
+ mx[px+"mig_devices_count"] = int64(len(gpu.MIGDevices.MIGDevice))
+ }
+
+ for _, mig := range gpu.MIGDevices.MIGDevice {
+ if !isValidValue(mig.GPUInstanceID) {
+ continue
+ }
+
+ px := "mig_instance_" + mig.GPUInstanceID + "_" + px
+
+ seenMIG[px] = true
+
+ if !nv.migs[px] {
+ nv.migs[px] = true
+ nv.addMIGDeviceXMLCharts(gpu, mig)
+ }
+
+ addMetric(mx, px+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0)
+ addMetric(mx, px+"frame_buffer_memory_usage_free", mig.FBMemoryUsage.Free, 1024*1024) // MiB => bytes
+ addMetric(mx, px+"frame_buffer_memory_usage_used", mig.FBMemoryUsage.Used, 1024*1024) // MiB => bytes
+ addMetric(mx, px+"frame_buffer_memory_usage_reserved", mig.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes
+ addMetric(mx, px+"bar1_memory_usage_free", mig.BAR1MemoryUsage.Free, 1024*1024) // MiB => bytes
+ addMetric(mx, px+"bar1_memory_usage_used", mig.BAR1MemoryUsage.Used, 1024*1024) // MiB => bytes
+ }
+ }
+
+ for px := range nv.gpus {
+ if !seenGPU[px] {
+ delete(nv.gpus, px)
+ nv.removeCharts(px)
+ }
+ }
+
+ for px := range nv.migs {
+ if !seenMIG[px] {
+ delete(nv.migs, px)
+ nv.removeCharts(px)
+ }
+ }
+
+ return nil
+}
+
+func calcMaxPCIEBandwidth(gpu xmlGPUInfo) float64 {
+ gen := gpu.PCI.PCIGPULinkInfo.PCIEGen.MaxLinkGen
+ width := strings.TrimSuffix(gpu.PCI.PCIGPULinkInfo.LinkWidths.MaxLinkWidth, "x")
+
+ if !isValidValue(gen) || !isValidValue(width) {
+ return 0
+ }
+
+ // https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
+ var speed, enc float64
+ switch gen {
+ case "1":
+ speed, enc = 2.5, 1.0/5.0
+ case "2":
+ speed, enc = 5, 1.0/5.0
+ case "3":
+ speed, enc = 8, 2.0/130.0
+ case "4":
+ speed, enc = 16, 2.0/130.0
+ case "5":
+ speed, enc = 32, 2.0/130.0
+ default:
+ return 0
+ }
+
+ // Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s
+ return (speed*parseFloat(width)*(1-enc) - 1) * 1e9 / 8 // Gb/s => bytes
+}
+
+type (
+ xmlInfo struct {
+ GPUs []xmlGPUInfo `xml:"gpu"`
+ }
+ xmlGPUInfo struct {
+ ID string `xml:"id,attr"`
+ ProductName string `xml:"product_name"`
+ ProductBrand string `xml:"product_brand"`
+ ProductArchitecture string `xml:"product_architecture"`
+ UUID string `xml:"uuid"`
+ FanSpeed string `xml:"fan_speed"`
+ PerformanceState string `xml:"performance_state"`
+ MIGMode struct {
+ CurrentMIG string `xml:"current_mig"`
+ } `xml:"mig_mode"`
+ MIGDevices struct {
+ MIGDevice []xmlMIGDeviceInfo `xml:"mig_device"`
+ } `xml:"mig_devices"`
+ PCI struct {
+ TxUtil string `xml:"tx_util"`
+ RxUtil string `xml:"rx_util"`
+ PCIGPULinkInfo struct {
+ PCIEGen struct {
+ MaxLinkGen string `xml:"max_link_gen"`
+ } `xml:"pcie_gen"`
+ LinkWidths struct {
+ MaxLinkWidth string `xml:"max_link_width"`
+ } `xml:"link_widths"`
+ } `xml:"pci_gpu_link_info"`
+ } `xml:"pci"`
+ Utilization struct {
+ GpuUtil string `xml:"gpu_util"`
+ MemoryUtil string `xml:"memory_util"`
+ EncoderUtil string `xml:"encoder_util"`
+ DecoderUtil string `xml:"decoder_util"`
+ } `xml:"utilization"`
+ FBMemoryUsage struct {
+ Total string `xml:"total"`
+ Reserved string `xml:"reserved"`
+ Used string `xml:"used"`
+ Free string `xml:"free"`
+ } `xml:"fb_memory_usage"`
+ Bar1MemoryUsage struct {
+ Total string `xml:"total"`
+ Used string `xml:"used"`
+ Free string `xml:"free"`
+ } `xml:"bar1_memory_usage"`
+ Temperature struct {
+ GpuTemp string `xml:"gpu_temp"`
+ GpuTempMaxThreshold string `xml:"gpu_temp_max_threshold"`
+ GpuTempSlowThreshold string `xml:"gpu_temp_slow_threshold"`
+ GpuTempMaxGpuThreshold string `xml:"gpu_temp_max_gpu_threshold"`
+ GpuTargetTemperature string `xml:"gpu_target_temperature"`
+ MemoryTemp string `xml:"memory_temp"`
+ GpuTempMaxMemThreshold string `xml:"gpu_temp_max_mem_threshold"`
+ } `xml:"temperature"`
+ Clocks struct {
+ GraphicsClock string `xml:"graphics_clock"`
+ SmClock string `xml:"sm_clock"`
+ MemClock string `xml:"mem_clock"`
+ VideoClock string `xml:"video_clock"`
+ } `xml:"clocks"`
+ PowerReadings *xmlPowerReadings `xml:"power_readings"`
+ GPUPowerReadings *xmlPowerReadings `xml:"gpu_power_readings"`
+ Voltage struct {
+ GraphicsVolt string `xml:"graphics_volt"`
+ } `xml:"voltage"`
+ Processes struct {
+ ProcessInfo []struct {
+ PID string `xml:"pid"`
+ ProcessName string `xml:"process_name"`
+ UsedMemory string `xml:"used_memory"`
+ } `sml:"process_info"`
+ } `xml:"processes"`
+ }
+
+ xmlPowerReadings struct {
+ //PowerState string `xml:"power_state"`
+ //PowerManagement string `xml:"power_management"`
+ PowerDraw string `xml:"power_draw"`
+ //PowerLimit string `xml:"power_limit"`
+ //DefaultPowerLimit string `xml:"default_power_limit"`
+ //EnforcedPowerLimit string `xml:"enforced_power_limit"`
+ //MinPowerLimit string `xml:"min_power_limit"`
+ //MaxPowerLimit string `xml:"max_power_limit"`
+ }
+
+ xmlMIGDeviceInfo struct {
+ Index string `xml:"index"`
+ GPUInstanceID string `xml:"gpu_instance_id"`
+ ComputeInstanceID string `xml:"compute_instance_id"`
+ DeviceAttributes struct {
+ Shared struct {
+ MultiprocessorCount string `xml:"multiprocessor_count"`
+ CopyEngineCount string `xml:"copy_engine_count"`
+ EncoderCount string `xml:"encoder_count"`
+ DecoderCount string `xml:"decoder_count"`
+ OFACount string `xml:"ofa_count"`
+ JPGCount string `xml:"jpg_count"`
+ } `xml:"shared"`
+ } `xml:"device_attributes"`
+ ECCErrorCount struct {
+ VolatileCount struct {
+ SRAMUncorrectable string `xml:"sram_uncorrectable"`
+ } `xml:"volatile_count"`
+ } `xml:"ecc_error_count"`
+ FBMemoryUsage struct {
+ Free string `xml:"free"`
+ Used string `xml:"used"`
+ Reserved string `xml:"reserved"`
+ } `xml:"fb_memory_usage"`
+ BAR1MemoryUsage struct {
+ Free string `xml:"free"`
+ Used string `xml:"used"`
+ } `xml:"bar1_memory_usage"`
+ }
+)