diff options
Diffstat (limited to 'src/go/plugin/go.d/modules/nvidia_smi/charts.go')
-rw-r--r-- | src/go/plugin/go.d/modules/nvidia_smi/charts.go | 370 |
1 files changed, 370 insertions, 0 deletions
diff --git a/src/go/plugin/go.d/modules/nvidia_smi/charts.go b/src/go/plugin/go.d/modules/nvidia_smi/charts.go new file mode 100644 index 000000000..746c8eed3 --- /dev/null +++ b/src/go/plugin/go.d/modules/nvidia_smi/charts.go @@ -0,0 +1,370 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package nvidia_smi + +import ( + "fmt" + "strings" + + "github.com/netdata/netdata/go/plugins/plugin/go.d/agent/module" +) + +const ( + prioGPUPCIBandwidthUsage = module.Priority + iota + prioGPUPCIBandwidthUtilization + prioGPUFanSpeed + prioGPUUtilization + prioGPUMemUtilization + prioGPUDecoderUtilization + prioGPUEncoderUtilization + prioGPUMIGModeStatus + prioGPUMIGDevicesCount + prioGPUFBMemoryUsage + prioGPUMIGFBMemoryUsage + prioGPUBAR1MemoryUsage + prioGPUMIGBAR1MemoryUsage + prioGPUTemperatureChart + prioGPUVoltageChart + prioGPUClockFreq + prioGPUPowerDraw + prioGPUPerformanceState +) + +var ( + gpuXMLCharts = module.Charts{ + gpuPCIBandwidthUsageChartTmpl.Copy(), + gpuPCIBandwidthUtilizationChartTmpl.Copy(), + gpuFanSpeedPercChartTmpl.Copy(), + gpuUtilizationChartTmpl.Copy(), + gpuMemUtilizationChartTmpl.Copy(), + gpuDecoderUtilizationChartTmpl.Copy(), + gpuEncoderUtilizationChartTmpl.Copy(), + gpuMIGModeCurrentStatusChartTmpl.Copy(), + gpuMIGDevicesCountChartTmpl.Copy(), + gpuFrameBufferMemoryUsageChartTmpl.Copy(), + gpuBAR1MemoryUsageChartTmpl.Copy(), + gpuVoltageChartTmpl.Copy(), + gpuTemperatureChartTmpl.Copy(), + gpuClockFreqChartTmpl.Copy(), + gpuPowerDrawChartTmpl.Copy(), + gpuPerformanceStateChartTmpl.Copy(), + } + migDeviceXMLCharts = module.Charts{ + migDeviceFrameBufferMemoryUsageChartTmpl.Copy(), + migDeviceBAR1MemoryUsageChartTmpl.Copy(), + } +) + +var ( + gpuPCIBandwidthUsageChartTmpl = module.Chart{ + ID: "gpu_%s_pcie_bandwidth_usage", + Title: "PCI Express Bandwidth Usage", + Units: "B/s", + Fam: "pcie bandwidth", + Ctx: "nvidia_smi.gpu_pcie_bandwidth_usage", + Type: module.Area, + Priority: prioGPUPCIBandwidthUsage, + Dims: module.Dims{ + {ID: "gpu_%s_pcie_bandwidth_usage_rx", Name: "rx"}, + {ID: "gpu_%s_pcie_bandwidth_usage_tx", Name: "tx", Mul: -1}, + }, + } + gpuPCIBandwidthUtilizationChartTmpl = module.Chart{ + ID: "gpu_%s_pcie_bandwidth_utilization", + Title: "PCI Express Bandwidth Utilization", + Units: "percentage", + Fam: "pcie bandwidth", + Ctx: "nvidia_smi.gpu_pcie_bandwidth_utilization", + Priority: prioGPUPCIBandwidthUtilization, + Dims: module.Dims{ + {ID: "gpu_%s_pcie_bandwidth_utilization_rx", Name: "rx", Div: 100}, + {ID: "gpu_%s_pcie_bandwidth_utilization_tx", Name: "tx", Div: 100}, + }, + } + gpuFanSpeedPercChartTmpl = module.Chart{ + ID: "gpu_%s_fan_speed_perc", + Title: "Fan speed", + Units: "%", + Fam: "fan speed", + Ctx: "nvidia_smi.gpu_fan_speed_perc", + Priority: prioGPUFanSpeed, + Dims: module.Dims{ + {ID: "gpu_%s_fan_speed_perc", Name: "fan_speed"}, + }, + } + gpuUtilizationChartTmpl = module.Chart{ + ID: "gpu_%s_gpu_utilization", + Title: "GPU utilization", + Units: "%", + Fam: "gpu utilization", + Ctx: "nvidia_smi.gpu_utilization", + Priority: prioGPUUtilization, + Dims: module.Dims{ + {ID: "gpu_%s_gpu_utilization", Name: "gpu"}, + }, + } + gpuMemUtilizationChartTmpl = module.Chart{ + ID: "gpu_%s_memory_utilization", + Title: "Memory utilization", + Units: "%", + Fam: "mem utilization", + Ctx: "nvidia_smi.gpu_memory_utilization", + Priority: prioGPUMemUtilization, + Dims: module.Dims{ + {ID: "gpu_%s_mem_utilization", Name: "memory"}, + }, + } + gpuDecoderUtilizationChartTmpl = module.Chart{ + ID: "gpu_%s_decoder_utilization", + Title: "Decoder utilization", + Units: "%", + Fam: "dec utilization", + Ctx: "nvidia_smi.gpu_decoder_utilization", + Priority: prioGPUDecoderUtilization, + Dims: module.Dims{ + {ID: "gpu_%s_decoder_utilization", Name: "decoder"}, + }, + } + gpuEncoderUtilizationChartTmpl = module.Chart{ + ID: "gpu_%s_encoder_utilization", + Title: "Encoder utilization", + Units: "%", + Fam: "enc utilization", + Ctx: "nvidia_smi.gpu_encoder_utilization", + Priority: prioGPUEncoderUtilization, + Dims: module.Dims{ + {ID: "gpu_%s_encoder_utilization", Name: "encoder"}, + }, + } + gpuMIGModeCurrentStatusChartTmpl = module.Chart{ + ID: "gpu_%s_mig_mode_current_status", + Title: "MIG current mode", + Units: "status", + Fam: "mig", + Ctx: "nvidia_smi.gpu_mig_mode_current_status", + Priority: prioGPUMIGModeStatus, + Dims: module.Dims{ + {ID: "gpu_%s_mig_current_mode_enabled", Name: "enabled"}, + {ID: "gpu_%s_mig_current_mode_disabled", Name: "disabled"}, + }, + } + gpuMIGDevicesCountChartTmpl = module.Chart{ + ID: "gpu_%s_mig_devices_count", + Title: "MIG devices", + Units: "devices", + Fam: "mig", + Ctx: "nvidia_smi.gpu_mig_devices_count", + Priority: prioGPUMIGDevicesCount, + Dims: module.Dims{ + {ID: "gpu_%s_mig_devices_count", Name: "mig"}, + }, + } + gpuFrameBufferMemoryUsageChartTmpl = module.Chart{ + ID: "gpu_%s_frame_buffer_memory_usage", + Title: "Frame buffer memory usage", + Units: "B", + Fam: "fb mem usage", + Ctx: "nvidia_smi.gpu_frame_buffer_memory_usage", + Type: module.Stacked, + Priority: prioGPUFBMemoryUsage, + Dims: module.Dims{ + {ID: "gpu_%s_frame_buffer_memory_usage_free", Name: "free"}, + {ID: "gpu_%s_frame_buffer_memory_usage_used", Name: "used"}, + {ID: "gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"}, + }, + } + gpuBAR1MemoryUsageChartTmpl = module.Chart{ + ID: "gpu_%s_bar1_memory_usage", + Title: "BAR1 memory usage", + Units: "B", + Fam: "bar1 mem usage", + Ctx: "nvidia_smi.gpu_bar1_memory_usage", + Type: module.Stacked, + Priority: prioGPUBAR1MemoryUsage, + Dims: module.Dims{ + {ID: "gpu_%s_bar1_memory_usage_free", Name: "free"}, + {ID: "gpu_%s_bar1_memory_usage_used", Name: "used"}, + }, + } + gpuTemperatureChartTmpl = module.Chart{ + ID: "gpu_%s_temperature", + Title: "Temperature", + Units: "Celsius", + Fam: "temperature", + Ctx: "nvidia_smi.gpu_temperature", + Priority: prioGPUTemperatureChart, + Dims: module.Dims{ + {ID: "gpu_%s_temperature", Name: "temperature"}, + }, + } + gpuVoltageChartTmpl = module.Chart{ + ID: "gpu_%s_voltage", + Title: "Voltage", + Units: "V", + Fam: "voltage", + Ctx: "nvidia_smi.gpu_voltage", + Priority: prioGPUVoltageChart, + Dims: module.Dims{ + {ID: "gpu_%s_voltage", Name: "voltage", Div: 1000}, // mV => V + }, + } + gpuClockFreqChartTmpl = module.Chart{ + ID: "gpu_%s_clock_freq", + Title: "Clock current frequency", + Units: "MHz", + Fam: "clocks", + Ctx: "nvidia_smi.gpu_clock_freq", + Priority: prioGPUClockFreq, + Dims: module.Dims{ + {ID: "gpu_%s_graphics_clock", Name: "graphics"}, + {ID: "gpu_%s_video_clock", Name: "video"}, + {ID: "gpu_%s_sm_clock", Name: "sm"}, + {ID: "gpu_%s_mem_clock", Name: "mem"}, + }, + } + gpuPowerDrawChartTmpl = module.Chart{ + ID: "gpu_%s_power_draw", + Title: "Power draw", + Units: "Watts", + Fam: "power draw", + Ctx: "nvidia_smi.gpu_power_draw", + Priority: prioGPUPowerDraw, + Dims: module.Dims{ + {ID: "gpu_%s_power_draw", Name: "power_draw"}, + }, + } + gpuPerformanceStateChartTmpl = module.Chart{ + ID: "gpu_%s_performance_state", + Title: "Performance state", + Units: "state", + Fam: "performance state", + Ctx: "nvidia_smi.gpu_performance_state", + Priority: prioGPUPerformanceState, + Dims: module.Dims{ + {ID: "gpu_%s_performance_state_P0", Name: "P0"}, + {ID: "gpu_%s_performance_state_P1", Name: "P1"}, + {ID: "gpu_%s_performance_state_P2", Name: "P2"}, + {ID: "gpu_%s_performance_state_P3", Name: "P3"}, + {ID: "gpu_%s_performance_state_P4", Name: "P4"}, + {ID: "gpu_%s_performance_state_P5", Name: "P5"}, + {ID: "gpu_%s_performance_state_P6", Name: "P6"}, + {ID: "gpu_%s_performance_state_P7", Name: "P7"}, + {ID: "gpu_%s_performance_state_P8", Name: "P8"}, + {ID: "gpu_%s_performance_state_P9", Name: "P9"}, + {ID: "gpu_%s_performance_state_P10", Name: "P10"}, + {ID: "gpu_%s_performance_state_P11", Name: "P11"}, + {ID: "gpu_%s_performance_state_P12", Name: "P12"}, + {ID: "gpu_%s_performance_state_P13", Name: "P13"}, + {ID: "gpu_%s_performance_state_P14", Name: "P14"}, + {ID: "gpu_%s_performance_state_P15", Name: "P15"}, + }, + } +) + +func (nv *NvidiaSmi) addGPUXMLCharts(gpu gpuInfo) { + charts := gpuXMLCharts.Copy() + + if !isValidValue(gpu.Utilization.GpuUtil) { + _ = charts.Remove(gpuUtilizationChartTmpl.ID) + } + if !isValidValue(gpu.Utilization.MemoryUtil) { + _ = charts.Remove(gpuMemUtilizationChartTmpl.ID) + } + if !isValidValue(gpu.Utilization.DecoderUtil) { + _ = charts.Remove(gpuDecoderUtilizationChartTmpl.ID) + } + if !isValidValue(gpu.Utilization.EncoderUtil) { + _ = charts.Remove(gpuEncoderUtilizationChartTmpl.ID) + } + if !isValidValue(gpu.MIGMode.CurrentMIG) { + _ = charts.Remove(gpuMIGModeCurrentStatusChartTmpl.ID) + _ = charts.Remove(gpuMIGDevicesCountChartTmpl.ID) + } + if !isValidValue(gpu.FanSpeed) { + _ = charts.Remove(gpuFanSpeedPercChartTmpl.ID) + } + if (gpu.PowerReadings == nil || !isValidValue(gpu.PowerReadings.PowerDraw)) && + (gpu.GPUPowerReadings == nil || !isValidValue(gpu.GPUPowerReadings.PowerDraw)) { + _ = charts.Remove(gpuPowerDrawChartTmpl.ID) + } + if !isValidValue(gpu.Voltage.GraphicsVolt) { + _ = charts.Remove(gpuVoltageChartTmpl.ID) + } + + for _, c := range *charts { + c.ID = fmt.Sprintf(c.ID, strings.ToLower(gpu.UUID)) + c.Labels = []module.Label{ + // csv output has no 'product_brand' + {Key: "uuid", Value: gpu.UUID}, + {Key: "product_name", Value: gpu.ProductName}, + } + for _, d := range c.Dims { + d.ID = fmt.Sprintf(d.ID, gpu.UUID) + } + } + + if err := nv.Charts().Add(*charts...); err != nil { + nv.Warning(err) + } +} + +var ( + migDeviceFrameBufferMemoryUsageChartTmpl = module.Chart{ + ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage", + Title: "MIG Frame buffer memory usage", + Units: "B", + Fam: "fb mem usage", + Ctx: "nvidia_smi.gpu_mig_frame_buffer_memory_usage", + Type: module.Stacked, + Priority: prioGPUMIGFBMemoryUsage, + Dims: module.Dims{ + {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_free", Name: "free"}, + {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_used", Name: "used"}, + {ID: "mig_instance_%s_gpu_%s_frame_buffer_memory_usage_reserved", Name: "reserved"}, + }, + } + migDeviceBAR1MemoryUsageChartTmpl = module.Chart{ + ID: "mig_instance_%s_gpu_%s_bar1_memory_usage", + Title: "MIG BAR1 memory usage", + Units: "B", + Fam: "bar1 mem usage", + Ctx: "nvidia_smi.gpu_mig_bar1_memory_usage", + Type: module.Stacked, + Priority: prioGPUMIGBAR1MemoryUsage, + Dims: module.Dims{ + {ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_free", Name: "free"}, + {ID: "mig_instance_%s_gpu_%s_bar1_memory_usage_used", Name: "used"}, + }, + } +) + +func (nv *NvidiaSmi) addMIGDeviceCharts(gpu gpuInfo, mig gpuMIGDeviceInfo) { + charts := migDeviceXMLCharts.Copy() + + for _, c := range *charts { + c.ID = fmt.Sprintf(c.ID, strings.ToLower(mig.GPUInstanceID), strings.ToLower(gpu.UUID)) + c.Labels = []module.Label{ + {Key: "gpu_uuid", Value: gpu.UUID}, + {Key: "gpu_product_name", Value: gpu.ProductName}, + {Key: "gpu_instance_id", Value: mig.GPUInstanceID}, + } + for _, d := range c.Dims { + d.ID = fmt.Sprintf(d.ID, mig.GPUInstanceID, gpu.UUID) + } + } + + if err := nv.Charts().Add(*charts...); err != nil { + nv.Warning(err) + } +} + +func (nv *NvidiaSmi) removeCharts(prefix string) { + prefix = strings.ToLower(prefix) + + for _, c := range *nv.Charts() { + if strings.HasPrefix(c.ID, prefix) { + c.MarkRemove() + c.MarkNotCreated() + } + } +} |