diff options
Diffstat (limited to '')
-rw-r--r-- | src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go | 265 |
1 files changed, 265 insertions, 0 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go b/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go new file mode 100644 index 000000000..2ab3180a8 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go @@ -0,0 +1,265 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package nvidia_smi + +import ( + "encoding/xml" + "fmt" + "strconv" + "strings" +) + +func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error { + bs, err := nv.exec.queryGPUInfoXML() + if err != nil { + return fmt.Errorf("error on quering XML GPU info: %v", err) + } + + info := &xmlInfo{} + if err := xml.Unmarshal(bs, info); err != nil { + return fmt.Errorf("error on unmarshaling XML GPU info response: %v", err) + } + + seenGPU := make(map[string]bool) + seenMIG := make(map[string]bool) + + for _, gpu := range info.GPUs { + if !isValidValue(gpu.UUID) { + continue + } + + px := "gpu_" + gpu.UUID + "_" + + seenGPU[px] = true + + if !nv.gpus[px] { + nv.gpus[px] = true + nv.addGPUXMLCharts(gpu) + } + + addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes + addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes + if max := calcMaxPCIEBandwidth(gpu); max > 0 { + rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes + tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes + mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / max) * 100) + mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / max) * 100) + } + addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0) + addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0) + addMetric(mx, px+"mem_utilization", gpu.Utilization.MemoryUtil, 0) + addMetric(mx, px+"decoder_utilization", gpu.Utilization.DecoderUtil, 0) + addMetric(mx, px+"encoder_utilization", gpu.Utilization.EncoderUtil, 0) + addMetric(mx, px+"frame_buffer_memory_usage_free", gpu.FBMemoryUsage.Free, 1024*1024) // MiB => bytes + addMetric(mx, px+"frame_buffer_memory_usage_used", gpu.FBMemoryUsage.Used, 1024*1024) // MiB => bytes + addMetric(mx, px+"frame_buffer_memory_usage_reserved", gpu.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes + addMetric(mx, px+"bar1_memory_usage_free", gpu.Bar1MemoryUsage.Free, 1024*1024) // MiB => bytes + addMetric(mx, px+"bar1_memory_usage_used", gpu.Bar1MemoryUsage.Used, 1024*1024) // MiB => bytes + addMetric(mx, px+"temperature", gpu.Temperature.GpuTemp, 0) + addMetric(mx, px+"graphics_clock", gpu.Clocks.GraphicsClock, 0) + addMetric(mx, px+"video_clock", gpu.Clocks.VideoClock, 0) + addMetric(mx, px+"sm_clock", gpu.Clocks.SmClock, 0) + addMetric(mx, px+"mem_clock", gpu.Clocks.MemClock, 0) + if gpu.PowerReadings != nil { + addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0) + } else if gpu.GPUPowerReadings != nil { + addMetric(mx, px+"power_draw", gpu.GPUPowerReadings.PowerDraw, 0) + } + addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0) + for i := 0; i < 16; i++ { + s := "P" + strconv.Itoa(i) + mx[px+"performance_state_"+s] = boolToInt(gpu.PerformanceState == s) + } + if isValidValue(gpu.MIGMode.CurrentMIG) { + mode := strings.ToLower(gpu.MIGMode.CurrentMIG) + mx[px+"mig_current_mode_enabled"] = boolToInt(mode == "enabled") + mx[px+"mig_current_mode_disabled"] = boolToInt(mode == "disabled") + mx[px+"mig_devices_count"] = int64(len(gpu.MIGDevices.MIGDevice)) + } + + for _, mig := range gpu.MIGDevices.MIGDevice { + if !isValidValue(mig.GPUInstanceID) { + continue + } + + px := "mig_instance_" + mig.GPUInstanceID + "_" + px + + seenMIG[px] = true + + if !nv.migs[px] { + nv.migs[px] = true + nv.addMIGDeviceXMLCharts(gpu, mig) + } + + addMetric(mx, px+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0) + addMetric(mx, px+"frame_buffer_memory_usage_free", mig.FBMemoryUsage.Free, 1024*1024) // MiB => bytes + addMetric(mx, px+"frame_buffer_memory_usage_used", mig.FBMemoryUsage.Used, 1024*1024) // MiB => bytes + addMetric(mx, px+"frame_buffer_memory_usage_reserved", mig.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes + addMetric(mx, px+"bar1_memory_usage_free", mig.BAR1MemoryUsage.Free, 1024*1024) // MiB => bytes + addMetric(mx, px+"bar1_memory_usage_used", mig.BAR1MemoryUsage.Used, 1024*1024) // MiB => bytes + } + } + + for px := range nv.gpus { + if !seenGPU[px] { + delete(nv.gpus, px) + nv.removeCharts(px) + } + } + + for px := range nv.migs { + if !seenMIG[px] { + delete(nv.migs, px) + nv.removeCharts(px) + } + } + + return nil +} + +func calcMaxPCIEBandwidth(gpu xmlGPUInfo) float64 { + gen := gpu.PCI.PCIGPULinkInfo.PCIEGen.MaxLinkGen + width := strings.TrimSuffix(gpu.PCI.PCIGPULinkInfo.LinkWidths.MaxLinkWidth, "x") + + if !isValidValue(gen) || !isValidValue(width) { + return 0 + } + + // https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance + var speed, enc float64 + switch gen { + case "1": + speed, enc = 2.5, 1.0/5.0 + case "2": + speed, enc = 5, 1.0/5.0 + case "3": + speed, enc = 8, 2.0/130.0 + case "4": + speed, enc = 16, 2.0/130.0 + case "5": + speed, enc = 32, 2.0/130.0 + default: + return 0 + } + + // Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s + return (speed*parseFloat(width)*(1-enc) - 1) * 1e9 / 8 // Gb/s => bytes +} + +type ( + xmlInfo struct { + GPUs []xmlGPUInfo `xml:"gpu"` + } + xmlGPUInfo struct { + ID string `xml:"id,attr"` + ProductName string `xml:"product_name"` + ProductBrand string `xml:"product_brand"` + ProductArchitecture string `xml:"product_architecture"` + UUID string `xml:"uuid"` + FanSpeed string `xml:"fan_speed"` + PerformanceState string `xml:"performance_state"` + MIGMode struct { + CurrentMIG string `xml:"current_mig"` + } `xml:"mig_mode"` + MIGDevices struct { + MIGDevice []xmlMIGDeviceInfo `xml:"mig_device"` + } `xml:"mig_devices"` + PCI struct { + TxUtil string `xml:"tx_util"` + RxUtil string `xml:"rx_util"` + PCIGPULinkInfo struct { + PCIEGen struct { + MaxLinkGen string `xml:"max_link_gen"` + } `xml:"pcie_gen"` + LinkWidths struct { + MaxLinkWidth string `xml:"max_link_width"` + } `xml:"link_widths"` + } `xml:"pci_gpu_link_info"` + } `xml:"pci"` + Utilization struct { + GpuUtil string `xml:"gpu_util"` + MemoryUtil string `xml:"memory_util"` + EncoderUtil string `xml:"encoder_util"` + DecoderUtil string `xml:"decoder_util"` + } `xml:"utilization"` + FBMemoryUsage struct { + Total string `xml:"total"` + Reserved string `xml:"reserved"` + Used string `xml:"used"` + Free string `xml:"free"` + } `xml:"fb_memory_usage"` + Bar1MemoryUsage struct { + Total string `xml:"total"` + Used string `xml:"used"` + Free string `xml:"free"` + } `xml:"bar1_memory_usage"` + Temperature struct { + GpuTemp string `xml:"gpu_temp"` + GpuTempMaxThreshold string `xml:"gpu_temp_max_threshold"` + GpuTempSlowThreshold string `xml:"gpu_temp_slow_threshold"` + GpuTempMaxGpuThreshold string `xml:"gpu_temp_max_gpu_threshold"` + GpuTargetTemperature string `xml:"gpu_target_temperature"` + MemoryTemp string `xml:"memory_temp"` + GpuTempMaxMemThreshold string `xml:"gpu_temp_max_mem_threshold"` + } `xml:"temperature"` + Clocks struct { + GraphicsClock string `xml:"graphics_clock"` + SmClock string `xml:"sm_clock"` + MemClock string `xml:"mem_clock"` + VideoClock string `xml:"video_clock"` + } `xml:"clocks"` + PowerReadings *xmlPowerReadings `xml:"power_readings"` + GPUPowerReadings *xmlPowerReadings `xml:"gpu_power_readings"` + Voltage struct { + GraphicsVolt string `xml:"graphics_volt"` + } `xml:"voltage"` + Processes struct { + ProcessInfo []struct { + PID string `xml:"pid"` + ProcessName string `xml:"process_name"` + UsedMemory string `xml:"used_memory"` + } `sml:"process_info"` + } `xml:"processes"` + } + + xmlPowerReadings struct { + //PowerState string `xml:"power_state"` + //PowerManagement string `xml:"power_management"` + PowerDraw string `xml:"power_draw"` + //PowerLimit string `xml:"power_limit"` + //DefaultPowerLimit string `xml:"default_power_limit"` + //EnforcedPowerLimit string `xml:"enforced_power_limit"` + //MinPowerLimit string `xml:"min_power_limit"` + //MaxPowerLimit string `xml:"max_power_limit"` + } + + xmlMIGDeviceInfo struct { + Index string `xml:"index"` + GPUInstanceID string `xml:"gpu_instance_id"` + ComputeInstanceID string `xml:"compute_instance_id"` + DeviceAttributes struct { + Shared struct { + MultiprocessorCount string `xml:"multiprocessor_count"` + CopyEngineCount string `xml:"copy_engine_count"` + EncoderCount string `xml:"encoder_count"` + DecoderCount string `xml:"decoder_count"` + OFACount string `xml:"ofa_count"` + JPGCount string `xml:"jpg_count"` + } `xml:"shared"` + } `xml:"device_attributes"` + ECCErrorCount struct { + VolatileCount struct { + SRAMUncorrectable string `xml:"sram_uncorrectable"` + } `xml:"volatile_count"` + } `xml:"ecc_error_count"` + FBMemoryUsage struct { + Free string `xml:"free"` + Used string `xml:"used"` + Reserved string `xml:"reserved"` + } `xml:"fb_memory_usage"` + BAR1MemoryUsage struct { + Free string `xml:"free"` + Used string `xml:"used"` + } `xml:"bar1_memory_usage"` + } +) |