Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r--  collectors/python.d.plugin/nvidia_smi/metadata.yaml        | 329
-rw-r--r--  collectors/python.d.plugin/nvidia_smi/metrics.csv          |  16
-rw-r--r--  collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py  |  40
3 files changed, 190 insertions, 195 deletions
diff --git a/collectors/python.d.plugin/nvidia_smi/metadata.yaml b/collectors/python.d.plugin/nvidia_smi/metadata.yaml
index fc0c90d5..9bf1e6ca 100644
--- a/collectors/python.d.plugin/nvidia_smi/metadata.yaml
+++ b/collectors/python.d.plugin/nvidia_smi/metadata.yaml
@@ -1,163 +1,166 @@
-meta:
-  plugin_name: python.d.plugin
-  module_name: nvidia_smi
-  monitored_instance:
-    name: python.d nvidia_smi
-    link: ''
-    categories: []
-    icon_filename: ''
-  related_resources:
-    integrations:
-      list: []
-  info_provided_to_referring_integrations:
-    description: ''
-  keywords: []
-  most_popular: false
-overview:
-  data_collection:
-    metrics_description: ''
-    method_description: ''
-  supported_platforms:
-    include: []
-    exclude: []
-  multi-instance: true
-  additional_permissions:
-    description: ''
-  default_behavior:
-    auto_detection:
-      description: ''
-    limits:
-      description: ''
-    performance_impact:
-      description: ''
-setup:
-  prerequisites:
-    list: []
-  configuration:
-    file:
-      name: ''
-      description: ''
-    options:
-      description: ''
-      folding:
-        title: ''
-        enabled: true
-      list: []
-    examples:
-      folding:
-        enabled: true
-        title: ''
-      list: []
-troubleshooting:
-  problems:
-    list: []
-alerts: []
-metrics:
-  folding:
-    title: Metrics
-    enabled: false
-  description: ""
-  availability: []
-  scopes:
-    - name: GPU
-      description: ""
-      labels: []
-      metrics:
-        - name: nvidia_smi.pci_bandwidth
-          description: PCI Express Bandwidth Utilization
-          unit: "KiB/s"
-          chart_type: area
-          dimensions:
-            - name: rx
-            - name: tx
-        - name: nvidia_smi.pci_bandwidth_percent
-          description: PCI Express Bandwidth Percent
-          unit: "percentage"
-          chart_type: area
-          dimensions:
-            - name: rx_percent
-            - name: tx_percent
-        - name: nvidia_smi.fan_speed
-          description: Fan Speed
-          unit: "percentage"
-          chart_type: line
-          dimensions:
-            - name: speed
-        - name: nvidia_smi.gpu_utilization
-          description: GPU Utilization
-          unit: "percentage"
-          chart_type: line
-          dimensions:
-            - name: utilization
-        - name: nvidia_smi.mem_utilization
-          description: Memory Bandwidth Utilization
-          unit: "percentage"
-          chart_type: line
-          dimensions:
-            - name: utilization
-        - name: nvidia_smi.encoder_utilization
-          description: Encoder/Decoder Utilization
-          unit: "percentage"
-          chart_type: line
-          dimensions:
-            - name: encoder
-            - name: decoder
-        - name: nvidia_smi.memory_allocated
-          description: Memory Usage
-          unit: "MiB"
-          chart_type: stacked
-          dimensions:
-            - name: free
-            - name: used
-        - name: nvidia_smi.bar1_memory_usage
-          description: Bar1 Memory Usage
-          unit: "MiB"
-          chart_type: stacked
-          dimensions:
-            - name: free
-            - name: used
-        - name: nvidia_smi.temperature
-          description: Temperature
-          unit: "celsius"
-          chart_type: line
-          dimensions:
-            - name: temp
-        - name: nvidia_smi.clocks
-          description: Clock Frequencies
-          unit: "MHz"
-          chart_type: line
-          dimensions:
-            - name: graphics
-            - name: video
-            - name: sm
-            - name: mem
-        - name: nvidia_smi.power
-          description: Power Utilization
-          unit: "Watts"
-          chart_type: line
-          dimensions:
-            - name: power
-        - name: nvidia_smi.power_state
-          description: Power State
-          unit: "state"
-          chart_type: line
-          dimensions:
-            - name: a dimension per {power_state}
-        - name: nvidia_smi.processes_mem
-          description: Memory Used by Each Process
-          unit: "MiB"
-          chart_type: stacked
-          dimensions:
-            - name: a dimension per process
-        - name: nvidia_smi.user_mem
-          description: Memory Used by Each User
-          unit: "MiB"
-          chart_type: stacked
-          dimensions:
-            - name: a dimension per user
-        - name: nvidia_smi.user_num
-          description: Number of User on GPU
-          unit: "num"
-          chart_type: line
-          dimensions:
-            - name: users
+# This collector will not appear in documentation, as the go version is preferred,
+# https://github.com/netdata/go.d.plugin/blob/master/modules/nvidia_smi/README.md
+#
+# meta:
+#   plugin_name: python.d.plugin
+#   module_name: nvidia_smi
+#   monitored_instance:
+#     name: python.d nvidia_smi
+#     link: ''
+#     categories: []
+#     icon_filename: ''
+#   related_resources:
+#     integrations:
+#       list: []
+#   info_provided_to_referring_integrations:
+#     description: ''
+#   keywords: []
+#   most_popular: false
+# overview:
+#   data_collection:
+#     metrics_description: ''
+#     method_description: ''
+#   supported_platforms:
+#     include: []
+#     exclude: []
+#   multi_instance: true
+#   additional_permissions:
+#     description: ''
+#   default_behavior:
+#     auto_detection:
+#       description: ''
+#     limits:
+#       description: ''
+#     performance_impact:
+#       description: ''
+# setup:
+#   prerequisites:
+#     list: []
+#   configuration:
+#     file:
+#       name: ''
+#       description: ''
+#     options:
+#       description: ''
+#       folding:
+#         title: ''
+#         enabled: true
+#       list: []
+#     examples:
+#       folding:
+#         enabled: true
+#         title: ''
+#       list: []
+# troubleshooting:
+#   problems:
+#     list: []
+# alerts: []
+# metrics:
+#   folding:
+#     title: Metrics
+#     enabled: false
+#   description: ""
+#   availability: []
+#   scopes:
+#     - name: GPU
+#       description: ""
+#       labels: []
+#       metrics:
+#         - name: nvidia_smi.pci_bandwidth
+#           description: PCI Express Bandwidth Utilization
+#           unit: "KiB/s"
+#           chart_type: area
+#           dimensions:
+#             - name: rx
+#             - name: tx
+#         - name: nvidia_smi.pci_bandwidth_percent
+#           description: PCI Express Bandwidth Percent
+#           unit: "percentage"
+#           chart_type: area
+#           dimensions:
+#             - name: rx_percent
+#             - name: tx_percent
+#         - name: nvidia_smi.fan_speed
+#           description: Fan Speed
+#           unit: "percentage"
+#           chart_type: line
+#           dimensions:
+#             - name: speed
+#         - name: nvidia_smi.gpu_utilization
+#           description: GPU Utilization
+#           unit: "percentage"
+#           chart_type: line
+#           dimensions:
+#             - name: utilization
+#         - name: nvidia_smi.mem_utilization
+#           description: Memory Bandwidth Utilization
+#           unit: "percentage"
+#           chart_type: line
+#           dimensions:
+#             - name: utilization
+#         - name: nvidia_smi.encoder_utilization
+#           description: Encoder/Decoder Utilization
+#           unit: "percentage"
+#           chart_type: line
+#           dimensions:
+#             - name: encoder
+#             - name: decoder
+#         - name: nvidia_smi.memory_allocated
+#           description: Memory Usage
+#           unit: "MiB"
+#           chart_type: stacked
+#           dimensions:
+#             - name: free
+#             - name: used
+#         - name: nvidia_smi.bar1_memory_usage
+#           description: Bar1 Memory Usage
+#           unit: "MiB"
+#           chart_type: stacked
+#           dimensions:
+#             - name: free
+#             - name: used
+#         - name: nvidia_smi.temperature
+#           description: Temperature
+#           unit: "celsius"
+#           chart_type: line
+#           dimensions:
+#             - name: temp
+#         - name: nvidia_smi.clocks
+#           description: Clock Frequencies
+#           unit: "MHz"
+#           chart_type: line
+#           dimensions:
+#             - name: graphics
+#             - name: video
+#             - name: sm
+#             - name: mem
+#         - name: nvidia_smi.power
+#           description: Power Utilization
+#           unit: "Watts"
+#           chart_type: line
+#           dimensions:
+#             - name: power
+#         - name: nvidia_smi.power_state
+#           description: Power State
+#           unit: "state"
+#           chart_type: line
+#           dimensions:
+#             - name: a dimension per {power_state}
+#         - name: nvidia_smi.processes_mem
+#           description: Memory Used by Each Process
+#           unit: "MiB"
+#           chart_type: stacked
+#           dimensions:
+#             - name: a dimension per process
+#         - name: nvidia_smi.user_mem
+#           description: Memory Used by Each User
+#           unit: "MiB"
+#           chart_type: stacked
+#           dimensions:
+#             - name: a dimension per user
+#         - name: nvidia_smi.user_num
+#           description: Number of User on GPU
+#           unit: "num"
+#           chart_type: line
+#           dimensions:
+#             - name: users
diff --git a/collectors/python.d.plugin/nvidia_smi/metrics.csv b/collectors/python.d.plugin/nvidia_smi/metrics.csv
deleted file mode 100644
index 683ea565..00000000
--- a/collectors/python.d.plugin/nvidia_smi/metrics.csv
+++ /dev/null
@@ -1,16 +0,0 @@
-metric,scope,dimensions,unit,description,chart_type,labels,plugin,module
-nvidia_smi.pci_bandwidth,GPU,"rx, tx",KiB/s,PCI Express Bandwidth Utilization,area,,python.d.plugin,nvidia_smi
-nvidia_smi.pci_bandwidth_percent,GPU,"rx_percent, tx_percent",percentage,PCI Express Bandwidth Percent,area,,python.d.plugin,nvidia_smi
-nvidia_smi.fan_speed,GPU,speed,percentage,Fan Speed,line,,python.d.plugin,nvidia_smi
-nvidia_smi.gpu_utilization,GPU,utilization,percentage,GPU Utilization,line,,python.d.plugin,nvidia_smi
-nvidia_smi.mem_utilization,GPU,utilization,percentage,Memory Bandwidth Utilization,line,,python.d.plugin,nvidia_smi
-nvidia_smi.encoder_utilization,GPU,"encoder, decoder",percentage,Encoder/Decoder Utilization,line,,python.d.plugin,nvidia_smi
-nvidia_smi.memory_allocated,GPU,"free, used",MiB,Memory Usage,stacked,,python.d.plugin,nvidia_smi
-nvidia_smi.bar1_memory_usage,GPU,"free, used",MiB,Bar1 Memory Usage,stacked,,python.d.plugin,nvidia_smi
-nvidia_smi.temperature,GPU,temp,celsius,Temperature,line,,python.d.plugin,nvidia_smi
-nvidia_smi.clocks,GPU,"graphics, video, sm, mem",MHz,Clock Frequencies,line,,python.d.plugin,nvidia_smi
-nvidia_smi.power,GPU,power,Watts,Power Utilization,line,,python.d.plugin,nvidia_smi
-nvidia_smi.power_state,GPU,a dimension per {power_state},state,Power State,line,,python.d.plugin,nvidia_smi
-nvidia_smi.processes_mem,GPU,a dimension per process,MiB,Memory Used by Each Process,stacked,,python.d.plugin,nvidia_smi
-nvidia_smi.user_mem,GPU,a dimension per user,MiB,Memory Used by Each User,stacked,,python.d.plugin,nvidia_smi
-nvidia_smi.user_num,GPU,users,num,Number of User on GPU,line,,python.d.plugin,nvidia_smi
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 271c9963..556a6143 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -62,20 +62,22 @@ POWER_STATES = ['P' + str(i) for i in range(0, 16)]
 
 # PCI Transfer data rate in gigabits per second (Gb/s) per generation
 PCI_SPEED = {
-    "1": 2.5,
-    "2": 5,
-    "3": 8,
-    "4": 16,
-    "5": 32
+    "1": 2.5,
+    "2": 5,
+    "3": 8,
+    "4": 16,
+    "5": 32
 }
 # PCI encoding per generation
 PCI_ENCODING = {
-    "1": 2/10,
-    "2": 2/10,
-    "3": 2/130,
-    "4": 2/130,
-    "5": 2/130
+    "1": 2 / 10,
+    "2": 2 / 10,
+    "3": 2 / 130,
+    "4": 2 / 130,
+    "5": 2 / 130
 }
+
+
 def gpu_charts(gpu):
     fam = gpu.full_name()
 
@@ -88,7 +90,8 @@ def gpu_charts(gpu):
             ]
         },
         PCI_BANDWIDTH_PERCENT: {
-            'options': [None, 'PCI Express Bandwidth Percent', 'percentage', fam, 'nvidia_smi.pci_bandwidth_percent', 'area'],
+            'options': [None, 'PCI Express Bandwidth Percent', 'percentage', fam, 'nvidia_smi.pci_bandwidth_percent',
+                        'area'],
             'lines': [
                 ['rx_util_percent', 'rx_percent'],
                 ['tx_util_percent', 'tx_percent'],
@@ -358,7 +361,8 @@ class GPU:
 
     @handle_attr_error
     def pci_link_width(self):
-        return self.root.find('pci').find('pci_gpu_link_info').find('link_widths').find('max_link_width').text.split('x')[0]
+        info = self.root.find('pci').find('pci_gpu_link_info')
+        return info.find('link_widths').find('max_link_width').text.split('x')[0]
 
     def pci_bw_max(self):
         link_gen = self.pci_link_gen()
@@ -368,7 +372,7 @@ class GPU:
         # Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s.
         # see details https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
         # return max bandwidth in kilobytes per second (kB/s)
-        return (PCI_SPEED[link_gen] * link_width * (1- PCI_ENCODING[link_gen]) - 1) * 1000 * 1000 / 8
+        return (PCI_SPEED[link_gen] * link_width * (1 - PCI_ENCODING[link_gen]) - 1) * 1000 * 1000 / 8
 
     @handle_attr_error
     def rx_util(self):
@@ -435,13 +439,18 @@ class GPU:
         return self.root.find('clocks').find('mem_clock').text.split()[0]
 
     @handle_attr_error
+    def power_readings(self):
+        elem = self.root.find('power_readings')
+        return elem if elem else self.root.find('gpu_power_readings')
+
+    @handle_attr_error
     def power_state(self):
-        return str(self.root.find('power_readings').find('power_state').text.split()[0])
+        return str(self.power_readings().find('power_state').text.split()[0])
 
     @handle_value_error
     @handle_attr_error
     def power_draw(self):
-        return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100
+        return float(self.power_readings().find('power_draw').text.split()[0]) * 100
 
     @handle_attr_error
     def processes(self):
@@ -492,7 +501,6 @@ class GPU:
             data['rx_util_percent'] = str(int(int(self.rx_util()) * 100 / self.pci_bw_max()))
             data['tx_util_percent'] = str(int(int(self.tx_util()) * 100 / self.pci_bw_max()))
 
-
         for v in POWER_STATES:
             data['power_state_' + v.lower()] = 0
         p_state = self.power_state()
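
Note on the pci_bw_max() hunk above: the formula turns the per-generation transfer rate and encoding overhead into a bandwidth ceiling in kilobytes per second. A minimal standalone sketch of the same arithmetic follows; the Gen3 x16 link used as input is an illustrative assumption, not something taken from the diff.

# Standalone sketch of the pci_bw_max() arithmetic (illustrative only, not part of the collector).
PCI_SPEED = {"1": 2.5, "2": 5, "3": 8, "4": 16, "5": 32}  # Gb/s per lane, per PCIe generation
PCI_ENCODING = {"1": 2 / 10, "2": 2 / 10, "3": 2 / 130, "4": 2 / 130, "5": 2 / 130}  # encoding overhead


def pci_bw_max_kb_s(link_gen, link_width):
    # SPEED * WIDTH * (1 - ENCODING) - 1 Gb/s, then converted from Gb/s to kB/s (x 1000 x 1000 / 8)
    return (PCI_SPEED[link_gen] * link_width * (1 - PCI_ENCODING[link_gen]) - 1) * 1000 * 1000 / 8


# Assumed example, Gen3 x16: 8 * 16 * (1 - 2/130) - 1 ~= 125.03 Gb/s ~= 15,628,846 kB/s
print(pci_bw_max_kb_s("3", 16))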
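On the new power_readings() helper: it appears to target newer nvidia-smi XML output, where the power section is emitted as gpu_power_readings rather than power_readings, so the collector now probes both tag names before reading power_state and power_draw. Below is a minimal sketch of the same fallback against a made-up XML sample; it uses an explicit None check instead of the collector's truthiness test.

# Standalone sketch of the power_readings tag fallback; the XML sample is invented for illustration.
import xml.etree.ElementTree as ET

SAMPLE = """
<gpu>
  <gpu_power_readings>
    <power_state>P8</power_state>
    <power_draw>25.50 W</power_draw>
  </gpu_power_readings>
</gpu>
"""

root = ET.fromstring(SAMPLE)
readings = root.find('power_readings')          # older tag name
if readings is None:
    readings = root.find('gpu_power_readings')  # newer tag name

print(readings.find('power_state').text.split()[0])        # P8
print(float(readings.find('power_draw').text.split()[0]))  # 25.5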