From 87d772a7d708fec12f48cd8adc0dedff6e1025da Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Mon, 26 Aug 2024 10:15:20 +0200 Subject: Adding upstream version 1.47.0. Signed-off-by: Daniel Baumann --- .../go.d.plugin/modules/nvidia_smi/collect_xml.go | 265 --------------------- 1 file changed, 265 deletions(-) delete mode 100644 src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go (limited to 'src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go') diff --git a/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go b/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go deleted file mode 100644 index 2ab3180a8..000000000 --- a/src/go/collectors/go.d.plugin/modules/nvidia_smi/collect_xml.go +++ /dev/null @@ -1,265 +0,0 @@ -// SPDX-License-Identifier: GPL-3.0-or-later - -package nvidia_smi - -import ( - "encoding/xml" - "fmt" - "strconv" - "strings" -) - -func (nv *NvidiaSMI) collectGPUInfoXML(mx map[string]int64) error { - bs, err := nv.exec.queryGPUInfoXML() - if err != nil { - return fmt.Errorf("error on quering XML GPU info: %v", err) - } - - info := &xmlInfo{} - if err := xml.Unmarshal(bs, info); err != nil { - return fmt.Errorf("error on unmarshaling XML GPU info response: %v", err) - } - - seenGPU := make(map[string]bool) - seenMIG := make(map[string]bool) - - for _, gpu := range info.GPUs { - if !isValidValue(gpu.UUID) { - continue - } - - px := "gpu_" + gpu.UUID + "_" - - seenGPU[px] = true - - if !nv.gpus[px] { - nv.gpus[px] = true - nv.addGPUXMLCharts(gpu) - } - - addMetric(mx, px+"pcie_bandwidth_usage_rx", gpu.PCI.RxUtil, 1024) // KB => bytes - addMetric(mx, px+"pcie_bandwidth_usage_tx", gpu.PCI.TxUtil, 1024) // KB => bytes - if max := calcMaxPCIEBandwidth(gpu); max > 0 { - rx := parseFloat(gpu.PCI.RxUtil) * 1024 // KB => bytes - tx := parseFloat(gpu.PCI.TxUtil) * 1024 // KB => bytes - mx[px+"pcie_bandwidth_utilization_rx"] = int64((rx * 100 / max) * 100) - mx[px+"pcie_bandwidth_utilization_tx"] = int64((tx * 100 / max) * 100) - } - addMetric(mx, px+"fan_speed_perc", gpu.FanSpeed, 0) - addMetric(mx, px+"gpu_utilization", gpu.Utilization.GpuUtil, 0) - addMetric(mx, px+"mem_utilization", gpu.Utilization.MemoryUtil, 0) - addMetric(mx, px+"decoder_utilization", gpu.Utilization.DecoderUtil, 0) - addMetric(mx, px+"encoder_utilization", gpu.Utilization.EncoderUtil, 0) - addMetric(mx, px+"frame_buffer_memory_usage_free", gpu.FBMemoryUsage.Free, 1024*1024) // MiB => bytes - addMetric(mx, px+"frame_buffer_memory_usage_used", gpu.FBMemoryUsage.Used, 1024*1024) // MiB => bytes - addMetric(mx, px+"frame_buffer_memory_usage_reserved", gpu.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes - addMetric(mx, px+"bar1_memory_usage_free", gpu.Bar1MemoryUsage.Free, 1024*1024) // MiB => bytes - addMetric(mx, px+"bar1_memory_usage_used", gpu.Bar1MemoryUsage.Used, 1024*1024) // MiB => bytes - addMetric(mx, px+"temperature", gpu.Temperature.GpuTemp, 0) - addMetric(mx, px+"graphics_clock", gpu.Clocks.GraphicsClock, 0) - addMetric(mx, px+"video_clock", gpu.Clocks.VideoClock, 0) - addMetric(mx, px+"sm_clock", gpu.Clocks.SmClock, 0) - addMetric(mx, px+"mem_clock", gpu.Clocks.MemClock, 0) - if gpu.PowerReadings != nil { - addMetric(mx, px+"power_draw", gpu.PowerReadings.PowerDraw, 0) - } else if gpu.GPUPowerReadings != nil { - addMetric(mx, px+"power_draw", gpu.GPUPowerReadings.PowerDraw, 0) - } - addMetric(mx, px+"voltage", gpu.Voltage.GraphicsVolt, 0) - for i := 0; i < 16; i++ { - s := "P" + strconv.Itoa(i) - mx[px+"performance_state_"+s] = boolToInt(gpu.PerformanceState == s) - } - if isValidValue(gpu.MIGMode.CurrentMIG) { - mode := strings.ToLower(gpu.MIGMode.CurrentMIG) - mx[px+"mig_current_mode_enabled"] = boolToInt(mode == "enabled") - mx[px+"mig_current_mode_disabled"] = boolToInt(mode == "disabled") - mx[px+"mig_devices_count"] = int64(len(gpu.MIGDevices.MIGDevice)) - } - - for _, mig := range gpu.MIGDevices.MIGDevice { - if !isValidValue(mig.GPUInstanceID) { - continue - } - - px := "mig_instance_" + mig.GPUInstanceID + "_" + px - - seenMIG[px] = true - - if !nv.migs[px] { - nv.migs[px] = true - nv.addMIGDeviceXMLCharts(gpu, mig) - } - - addMetric(mx, px+"ecc_error_sram_uncorrectable", mig.ECCErrorCount.VolatileCount.SRAMUncorrectable, 0) - addMetric(mx, px+"frame_buffer_memory_usage_free", mig.FBMemoryUsage.Free, 1024*1024) // MiB => bytes - addMetric(mx, px+"frame_buffer_memory_usage_used", mig.FBMemoryUsage.Used, 1024*1024) // MiB => bytes - addMetric(mx, px+"frame_buffer_memory_usage_reserved", mig.FBMemoryUsage.Reserved, 1024*1024) // MiB => bytes - addMetric(mx, px+"bar1_memory_usage_free", mig.BAR1MemoryUsage.Free, 1024*1024) // MiB => bytes - addMetric(mx, px+"bar1_memory_usage_used", mig.BAR1MemoryUsage.Used, 1024*1024) // MiB => bytes - } - } - - for px := range nv.gpus { - if !seenGPU[px] { - delete(nv.gpus, px) - nv.removeCharts(px) - } - } - - for px := range nv.migs { - if !seenMIG[px] { - delete(nv.migs, px) - nv.removeCharts(px) - } - } - - return nil -} - -func calcMaxPCIEBandwidth(gpu xmlGPUInfo) float64 { - gen := gpu.PCI.PCIGPULinkInfo.PCIEGen.MaxLinkGen - width := strings.TrimSuffix(gpu.PCI.PCIGPULinkInfo.LinkWidths.MaxLinkWidth, "x") - - if !isValidValue(gen) || !isValidValue(width) { - return 0 - } - - // https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance - var speed, enc float64 - switch gen { - case "1": - speed, enc = 2.5, 1.0/5.0 - case "2": - speed, enc = 5, 1.0/5.0 - case "3": - speed, enc = 8, 2.0/130.0 - case "4": - speed, enc = 16, 2.0/130.0 - case "5": - speed, enc = 32, 2.0/130.0 - default: - return 0 - } - - // Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s - return (speed*parseFloat(width)*(1-enc) - 1) * 1e9 / 8 // Gb/s => bytes -} - -type ( - xmlInfo struct { - GPUs []xmlGPUInfo `xml:"gpu"` - } - xmlGPUInfo struct { - ID string `xml:"id,attr"` - ProductName string `xml:"product_name"` - ProductBrand string `xml:"product_brand"` - ProductArchitecture string `xml:"product_architecture"` - UUID string `xml:"uuid"` - FanSpeed string `xml:"fan_speed"` - PerformanceState string `xml:"performance_state"` - MIGMode struct { - CurrentMIG string `xml:"current_mig"` - } `xml:"mig_mode"` - MIGDevices struct { - MIGDevice []xmlMIGDeviceInfo `xml:"mig_device"` - } `xml:"mig_devices"` - PCI struct { - TxUtil string `xml:"tx_util"` - RxUtil string `xml:"rx_util"` - PCIGPULinkInfo struct { - PCIEGen struct { - MaxLinkGen string `xml:"max_link_gen"` - } `xml:"pcie_gen"` - LinkWidths struct { - MaxLinkWidth string `xml:"max_link_width"` - } `xml:"link_widths"` - } `xml:"pci_gpu_link_info"` - } `xml:"pci"` - Utilization struct { - GpuUtil string `xml:"gpu_util"` - MemoryUtil string `xml:"memory_util"` - EncoderUtil string `xml:"encoder_util"` - DecoderUtil string `xml:"decoder_util"` - } `xml:"utilization"` - FBMemoryUsage struct { - Total string `xml:"total"` - Reserved string `xml:"reserved"` - Used string `xml:"used"` - Free string `xml:"free"` - } `xml:"fb_memory_usage"` - Bar1MemoryUsage struct { - Total string `xml:"total"` - Used string `xml:"used"` - Free string `xml:"free"` - } `xml:"bar1_memory_usage"` - Temperature struct { - GpuTemp string `xml:"gpu_temp"` - GpuTempMaxThreshold string `xml:"gpu_temp_max_threshold"` - GpuTempSlowThreshold string `xml:"gpu_temp_slow_threshold"` - GpuTempMaxGpuThreshold string `xml:"gpu_temp_max_gpu_threshold"` - GpuTargetTemperature string `xml:"gpu_target_temperature"` - MemoryTemp string `xml:"memory_temp"` - GpuTempMaxMemThreshold string `xml:"gpu_temp_max_mem_threshold"` - } `xml:"temperature"` - Clocks struct { - GraphicsClock string `xml:"graphics_clock"` - SmClock string `xml:"sm_clock"` - MemClock string `xml:"mem_clock"` - VideoClock string `xml:"video_clock"` - } `xml:"clocks"` - PowerReadings *xmlPowerReadings `xml:"power_readings"` - GPUPowerReadings *xmlPowerReadings `xml:"gpu_power_readings"` - Voltage struct { - GraphicsVolt string `xml:"graphics_volt"` - } `xml:"voltage"` - Processes struct { - ProcessInfo []struct { - PID string `xml:"pid"` - ProcessName string `xml:"process_name"` - UsedMemory string `xml:"used_memory"` - } `sml:"process_info"` - } `xml:"processes"` - } - - xmlPowerReadings struct { - //PowerState string `xml:"power_state"` - //PowerManagement string `xml:"power_management"` - PowerDraw string `xml:"power_draw"` - //PowerLimit string `xml:"power_limit"` - //DefaultPowerLimit string `xml:"default_power_limit"` - //EnforcedPowerLimit string `xml:"enforced_power_limit"` - //MinPowerLimit string `xml:"min_power_limit"` - //MaxPowerLimit string `xml:"max_power_limit"` - } - - xmlMIGDeviceInfo struct { - Index string `xml:"index"` - GPUInstanceID string `xml:"gpu_instance_id"` - ComputeInstanceID string `xml:"compute_instance_id"` - DeviceAttributes struct { - Shared struct { - MultiprocessorCount string `xml:"multiprocessor_count"` - CopyEngineCount string `xml:"copy_engine_count"` - EncoderCount string `xml:"encoder_count"` - DecoderCount string `xml:"decoder_count"` - OFACount string `xml:"ofa_count"` - JPGCount string `xml:"jpg_count"` - } `xml:"shared"` - } `xml:"device_attributes"` - ECCErrorCount struct { - VolatileCount struct { - SRAMUncorrectable string `xml:"sram_uncorrectable"` - } `xml:"volatile_count"` - } `xml:"ecc_error_count"` - FBMemoryUsage struct { - Free string `xml:"free"` - Used string `xml:"used"` - Reserved string `xml:"reserved"` - } `xml:"fb_memory_usage"` - BAR1MemoryUsage struct { - Free string `xml:"free"` - Used string `xml:"used"` - } `xml:"bar1_memory_usage"` - } -) -- cgit v1.2.3