author     Daniel Baumann <daniel.baumann@progress-linux.org>  2023-08-10 09:18:49 +0000
committer  Daniel Baumann <daniel.baumann@progress-linux.org>  2023-08-10 09:18:49 +0000
commit     dd814a7c1a8de056a79f7238578b09236edd5506 (patch)
tree       429e7eed5a634a4efe9a6877ce66da8e64aa1782 /collectors/python.d.plugin/nvidia_smi
parent     Adding upstream version 1.41.0. (diff)
download   netdata-dd814a7c1a8de056a79f7238578b09236edd5506.tar.xz
           netdata-dd814a7c1a8de056a79f7238578b09236edd5506.zip
Adding upstream version 1.42.0. (upstream/1.42.0)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r--  collectors/python.d.plugin/nvidia_smi/metadata.yaml        329
-rw-r--r--  collectors/python.d.plugin/nvidia_smi/metrics.csv           16
-rw-r--r--  collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py   40
3 files changed, 190 insertions(+), 195 deletions(-)
diff --git a/collectors/python.d.plugin/nvidia_smi/metadata.yaml b/collectors/python.d.plugin/nvidia_smi/metadata.yaml
index fc0c90d5f..9bf1e6ca7 100644
--- a/collectors/python.d.plugin/nvidia_smi/metadata.yaml
+++ b/collectors/python.d.plugin/nvidia_smi/metadata.yaml
@@ -1,163 +1,166 @@
-meta:
- plugin_name: python.d.plugin
- module_name: nvidia_smi
- monitored_instance:
- name: python.d nvidia_smi
- link: ''
- categories: []
- icon_filename: ''
- related_resources:
- integrations:
- list: []
- info_provided_to_referring_integrations:
- description: ''
- keywords: []
- most_popular: false
-overview:
- data_collection:
- metrics_description: ''
- method_description: ''
- supported_platforms:
- include: []
- exclude: []
- multi-instance: true
- additional_permissions:
- description: ''
- default_behavior:
- auto_detection:
- description: ''
- limits:
- description: ''
- performance_impact:
- description: ''
-setup:
- prerequisites:
- list: []
- configuration:
- file:
- name: ''
- description: ''
- options:
- description: ''
- folding:
- title: ''
- enabled: true
- list: []
- examples:
- folding:
- enabled: true
- title: ''
- list: []
-troubleshooting:
- problems:
- list: []
-alerts: []
-metrics:
- folding:
- title: Metrics
- enabled: false
- description: ""
- availability: []
- scopes:
- - name: GPU
- description: ""
- labels: []
- metrics:
- - name: nvidia_smi.pci_bandwidth
- description: PCI Express Bandwidth Utilization
- unit: "KiB/s"
- chart_type: area
- dimensions:
- - name: rx
- - name: tx
- - name: nvidia_smi.pci_bandwidth_percent
- description: PCI Express Bandwidth Percent
- unit: "percentage"
- chart_type: area
- dimensions:
- - name: rx_percent
- - name: tx_percent
- - name: nvidia_smi.fan_speed
- description: Fan Speed
- unit: "percentage"
- chart_type: line
- dimensions:
- - name: speed
- - name: nvidia_smi.gpu_utilization
- description: GPU Utilization
- unit: "percentage"
- chart_type: line
- dimensions:
- - name: utilization
- - name: nvidia_smi.mem_utilization
- description: Memory Bandwidth Utilization
- unit: "percentage"
- chart_type: line
- dimensions:
- - name: utilization
- - name: nvidia_smi.encoder_utilization
- description: Encoder/Decoder Utilization
- unit: "percentage"
- chart_type: line
- dimensions:
- - name: encoder
- - name: decoder
- - name: nvidia_smi.memory_allocated
- description: Memory Usage
- unit: "MiB"
- chart_type: stacked
- dimensions:
- - name: free
- - name: used
- - name: nvidia_smi.bar1_memory_usage
- description: Bar1 Memory Usage
- unit: "MiB"
- chart_type: stacked
- dimensions:
- - name: free
- - name: used
- - name: nvidia_smi.temperature
- description: Temperature
- unit: "celsius"
- chart_type: line
- dimensions:
- - name: temp
- - name: nvidia_smi.clocks
- description: Clock Frequencies
- unit: "MHz"
- chart_type: line
- dimensions:
- - name: graphics
- - name: video
- - name: sm
- - name: mem
- - name: nvidia_smi.power
- description: Power Utilization
- unit: "Watts"
- chart_type: line
- dimensions:
- - name: power
- - name: nvidia_smi.power_state
- description: Power State
- unit: "state"
- chart_type: line
- dimensions:
- - name: a dimension per {power_state}
- - name: nvidia_smi.processes_mem
- description: Memory Used by Each Process
- unit: "MiB"
- chart_type: stacked
- dimensions:
- - name: a dimension per process
- - name: nvidia_smi.user_mem
- description: Memory Used by Each User
- unit: "MiB"
- chart_type: stacked
- dimensions:
- - name: a dimension per user
- - name: nvidia_smi.user_num
- description: Number of User on GPU
- unit: "num"
- chart_type: line
- dimensions:
- - name: users
+# This collector will not appear in documentation, as the go version is preferred,
+# https://github.com/netdata/go.d.plugin/blob/master/modules/nvidia_smi/README.md
+#
+# meta:
+# plugin_name: python.d.plugin
+# module_name: nvidia_smi
+# monitored_instance:
+# name: python.d nvidia_smi
+# link: ''
+# categories: []
+# icon_filename: ''
+# related_resources:
+# integrations:
+# list: []
+# info_provided_to_referring_integrations:
+# description: ''
+# keywords: []
+# most_popular: false
+# overview:
+# data_collection:
+# metrics_description: ''
+# method_description: ''
+# supported_platforms:
+# include: []
+# exclude: []
+# multi_instance: true
+# additional_permissions:
+# description: ''
+# default_behavior:
+# auto_detection:
+# description: ''
+# limits:
+# description: ''
+# performance_impact:
+# description: ''
+# setup:
+# prerequisites:
+# list: []
+# configuration:
+# file:
+# name: ''
+# description: ''
+# options:
+# description: ''
+# folding:
+# title: ''
+# enabled: true
+# list: []
+# examples:
+# folding:
+# enabled: true
+# title: ''
+# list: []
+# troubleshooting:
+# problems:
+# list: []
+# alerts: []
+# metrics:
+# folding:
+# title: Metrics
+# enabled: false
+# description: ""
+# availability: []
+# scopes:
+# - name: GPU
+# description: ""
+# labels: []
+# metrics:
+# - name: nvidia_smi.pci_bandwidth
+# description: PCI Express Bandwidth Utilization
+# unit: "KiB/s"
+# chart_type: area
+# dimensions:
+# - name: rx
+# - name: tx
+# - name: nvidia_smi.pci_bandwidth_percent
+# description: PCI Express Bandwidth Percent
+# unit: "percentage"
+# chart_type: area
+# dimensions:
+# - name: rx_percent
+# - name: tx_percent
+# - name: nvidia_smi.fan_speed
+# description: Fan Speed
+# unit: "percentage"
+# chart_type: line
+# dimensions:
+# - name: speed
+# - name: nvidia_smi.gpu_utilization
+# description: GPU Utilization
+# unit: "percentage"
+# chart_type: line
+# dimensions:
+# - name: utilization
+# - name: nvidia_smi.mem_utilization
+# description: Memory Bandwidth Utilization
+# unit: "percentage"
+# chart_type: line
+# dimensions:
+# - name: utilization
+# - name: nvidia_smi.encoder_utilization
+# description: Encoder/Decoder Utilization
+# unit: "percentage"
+# chart_type: line
+# dimensions:
+# - name: encoder
+# - name: decoder
+# - name: nvidia_smi.memory_allocated
+# description: Memory Usage
+# unit: "MiB"
+# chart_type: stacked
+# dimensions:
+# - name: free
+# - name: used
+# - name: nvidia_smi.bar1_memory_usage
+# description: Bar1 Memory Usage
+# unit: "MiB"
+# chart_type: stacked
+# dimensions:
+# - name: free
+# - name: used
+# - name: nvidia_smi.temperature
+# description: Temperature
+# unit: "celsius"
+# chart_type: line
+# dimensions:
+# - name: temp
+# - name: nvidia_smi.clocks
+# description: Clock Frequencies
+# unit: "MHz"
+# chart_type: line
+# dimensions:
+# - name: graphics
+# - name: video
+# - name: sm
+# - name: mem
+# - name: nvidia_smi.power
+# description: Power Utilization
+# unit: "Watts"
+# chart_type: line
+# dimensions:
+# - name: power
+# - name: nvidia_smi.power_state
+# description: Power State
+# unit: "state"
+# chart_type: line
+# dimensions:
+# - name: a dimension per {power_state}
+# - name: nvidia_smi.processes_mem
+# description: Memory Used by Each Process
+# unit: "MiB"
+# chart_type: stacked
+# dimensions:
+# - name: a dimension per process
+# - name: nvidia_smi.user_mem
+# description: Memory Used by Each User
+# unit: "MiB"
+# chart_type: stacked
+# dimensions:
+# - name: a dimension per user
+# - name: nvidia_smi.user_num
+# description: Number of User on GPU
+# unit: "num"
+# chart_type: line
+# dimensions:
+# - name: users
diff --git a/collectors/python.d.plugin/nvidia_smi/metrics.csv b/collectors/python.d.plugin/nvidia_smi/metrics.csv
deleted file mode 100644
index 683ea5650..000000000
--- a/collectors/python.d.plugin/nvidia_smi/metrics.csv
+++ /dev/null
@@ -1,16 +0,0 @@
-metric,scope,dimensions,unit,description,chart_type,labels,plugin,module
-nvidia_smi.pci_bandwidth,GPU,"rx, tx",KiB/s,PCI Express Bandwidth Utilization,area,,python.d.plugin,nvidia_smi
-nvidia_smi.pci_bandwidth_percent,GPU,"rx_percent, tx_percent",percentage,PCI Express Bandwidth Percent,area,,python.d.plugin,nvidia_smi
-nvidia_smi.fan_speed,GPU,speed,percentage,Fan Speed,line,,python.d.plugin,nvidia_smi
-nvidia_smi.gpu_utilization,GPU,utilization,percentage,GPU Utilization,line,,python.d.plugin,nvidia_smi
-nvidia_smi.mem_utilization,GPU,utilization,percentage,Memory Bandwidth Utilization,line,,python.d.plugin,nvidia_smi
-nvidia_smi.encoder_utilization,GPU,"encoder, decoder",percentage,Encoder/Decoder Utilization,line,,python.d.plugin,nvidia_smi
-nvidia_smi.memory_allocated,GPU,"free, used",MiB,Memory Usage,stacked,,python.d.plugin,nvidia_smi
-nvidia_smi.bar1_memory_usage,GPU,"free, used",MiB,Bar1 Memory Usage,stacked,,python.d.plugin,nvidia_smi
-nvidia_smi.temperature,GPU,temp,celsius,Temperature,line,,python.d.plugin,nvidia_smi
-nvidia_smi.clocks,GPU,"graphics, video, sm, mem",MHz,Clock Frequencies,line,,python.d.plugin,nvidia_smi
-nvidia_smi.power,GPU,power,Watts,Power Utilization,line,,python.d.plugin,nvidia_smi
-nvidia_smi.power_state,GPU,a dimension per {power_state},state,Power State,line,,python.d.plugin,nvidia_smi
-nvidia_smi.processes_mem,GPU,a dimension per process,MiB,Memory Used by Each Process,stacked,,python.d.plugin,nvidia_smi
-nvidia_smi.user_mem,GPU,a dimension per user,MiB,Memory Used by Each User,stacked,,python.d.plugin,nvidia_smi
-nvidia_smi.user_num,GPU,users,num,Number of User on GPU,line,,python.d.plugin,nvidia_smi
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 271c99638..556a61435 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -62,20 +62,22 @@ POWER_STATES = ['P' + str(i) for i in range(0, 16)]
# PCI Transfer data rate in gigabits per second (Gb/s) per generation
PCI_SPEED = {
- "1": 2.5,
- "2": 5,
- "3": 8,
- "4": 16,
- "5": 32
+ "1": 2.5,
+ "2": 5,
+ "3": 8,
+ "4": 16,
+ "5": 32
}
# PCI encoding per generation
PCI_ENCODING = {
- "1": 2/10,
- "2": 2/10,
- "3": 2/130,
- "4": 2/130,
- "5": 2/130
+ "1": 2 / 10,
+ "2": 2 / 10,
+ "3": 2 / 130,
+ "4": 2 / 130,
+ "5": 2 / 130
}
+
+
def gpu_charts(gpu):
fam = gpu.full_name()
@@ -88,7 +90,8 @@ def gpu_charts(gpu):
]
},
PCI_BANDWIDTH_PERCENT: {
- 'options': [None, 'PCI Express Bandwidth Percent', 'percentage', fam, 'nvidia_smi.pci_bandwidth_percent', 'area'],
+ 'options': [None, 'PCI Express Bandwidth Percent', 'percentage', fam, 'nvidia_smi.pci_bandwidth_percent',
+ 'area'],
'lines': [
['rx_util_percent', 'rx_percent'],
['tx_util_percent', 'tx_percent'],
@@ -358,7 +361,8 @@ class GPU:
@handle_attr_error
def pci_link_width(self):
- return self.root.find('pci').find('pci_gpu_link_info').find('link_widths').find('max_link_width').text.split('x')[0]
+ info = self.root.find('pci').find('pci_gpu_link_info')
+ return info.find('link_widths').find('max_link_width').text.split('x')[0]
def pci_bw_max(self):
link_gen = self.pci_link_gen()
@@ -368,7 +372,7 @@ class GPU:
# Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s.
# see details https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
# return max bandwidth in kilobytes per second (kB/s)
- return (PCI_SPEED[link_gen] * link_width * (1- PCI_ENCODING[link_gen]) - 1) * 1000 * 1000 / 8
+ return (PCI_SPEED[link_gen] * link_width * (1 - PCI_ENCODING[link_gen]) - 1) * 1000 * 1000 / 8
@handle_attr_error
def rx_util(self):
@@ -435,13 +439,18 @@ class GPU:
return self.root.find('clocks').find('mem_clock').text.split()[0]
@handle_attr_error
+ def power_readings(self):
+ elem = self.root.find('power_readings')
+ return elem if elem else self.root.find('gpu_power_readings')
+
+ @handle_attr_error
def power_state(self):
- return str(self.root.find('power_readings').find('power_state').text.split()[0])
+ return str(self.power_readings().find('power_state').text.split()[0])
@handle_value_error
@handle_attr_error
def power_draw(self):
- return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100
+ return float(self.power_readings().find('power_draw').text.split()[0]) * 100
@handle_attr_error
def processes(self):
@@ -492,7 +501,6 @@ class GPU:
data['rx_util_percent'] = str(int(int(self.rx_util()) * 100 / self.pci_bw_max()))
data['tx_util_percent'] = str(int(int(self.tx_util()) * 100 / self.pci_bw_max()))
-
for v in POWER_STATES:
data['power_state_' + v.lower()] = 0
p_state = self.power_state()
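
The pci_bw_max() comment in the hunk above gives the formula Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1 Gb/s, returned in kilobytes per second. A minimal standalone sketch of that arithmetic, using the PCI_SPEED and PCI_ENCODING tables from the same file; the helper name and the gen3 x16 example below are illustrative, not part of the commit:

    # PCIe raw rate in Gb/s per generation and encoding overhead
    # (8b/10b for gen 1-2, 128b/130b for gen 3+), as in the module.
    PCI_SPEED = {"1": 2.5, "2": 5, "3": 8, "4": 16, "5": 32}
    PCI_ENCODING = {"1": 2 / 10, "2": 2 / 10, "3": 2 / 130, "4": 2 / 130, "5": 2 / 130}


    def pci_bw_max_kb_s(link_gen, link_width):
        """Max bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1 Gb/s,
        converted to kB/s (Gb/s * 1000 * 1000 / 8)."""
        gbps = PCI_SPEED[link_gen] * link_width * (1 - PCI_ENCODING[link_gen]) - 1
        return gbps * 1000 * 1000 / 8


    # Example: a gen3 x16 link works out to roughly 15.6 GB/s.
    print(pci_bw_max_kb_s("3", 16))  # ~15628846 kB/s

The collector then divides the raw rx/tx counters from nvidia-smi by this ceiling to produce the rx_util_percent/tx_util_percent dimensions shown in the last hunk.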