author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-02-06 16:11:34 +0000
---|---|---
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-02-06 16:11:34 +0000
commit | d079b656b4719739b2247dcd9d46e9bec793095a |
tree | d2c950c70a776bcf697c963151c5bd959f8a9f03 /collectors/python.d.plugin/nvidia_smi |
parent | Releasing debian version 1.37.1-2. |
Merging upstream version 1.38.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r-- | collectors/python.d.plugin/nvidia_smi/README.md | 11
-rw-r--r-- | collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 53
2 files changed, 59 insertions, 5 deletions
````diff
diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md
index bb4169441..ce5473c26 100644
--- a/collectors/python.d.plugin/nvidia_smi/README.md
+++ b/collectors/python.d.plugin/nvidia_smi/README.md
@@ -1,14 +1,17 @@
 <!--
 title: "Nvidia GPU monitoring with Netdata"
-custom_edit_url: https://github.com/netdata/netdata/edit/master/collectors/python.d.plugin/nvidia_smi/README.md
-sidebar_label: "Nvidia GPUs"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/python.d.plugin/nvidia_smi/README.md"
+sidebar_label: "nvidia_smi-python.d.plugin"
+learn_status: "Published"
+learn_topic_type: "References"
+learn_rel_path: "References/Collectors references/Devices"
 -->
 
 # Nvidia GPU monitoring with Netdata
 
 Monitors performance metrics (memory usage, fan speed, pcie bandwidth utilization, temperature, etc.) using `nvidia-smi` cli tool.
 
-> **Warning**: this collector does not work when the Netdata Agent is [running in a container](https://learn.netdata.cloud/docs/agent/packaging/docker).
+> **Warning**: this collector does not work when the Netdata Agent is [running in a container](https://github.com/netdata/netdata/blob/master/packaging/docker/README.md).
 
 ## Requirements and Notes
@@ -48,7 +51,7 @@ It produces the following charts:
 ## Configuration
 
 Edit the `python.d/nvidia_smi.conf` configuration file using `edit-config` from the Netdata [config
-directory](/docs/configure/nodes.md), which is typically at `/etc/netdata`.
+directory](https://github.com/netdata/netdata/blob/master/docs/configure/nodes.md), which is typically at `/etc/netdata`.
 
 ```bash
 cd /etc/netdata # Replace this path with your Netdata config directory, if different
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 23e90e658..6affae7b8 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -22,6 +22,7 @@ EMPTY_ROW_LIMIT = 500
 POLLER_BREAK_ROW = '</nvidia_smi_log>'
 
 PCI_BANDWIDTH = 'pci_bandwidth'
+PCI_BANDWIDTH_PERCENT = 'pci_bandwidth_percent'
 FAN_SPEED = 'fan_speed'
 GPU_UTIL = 'gpu_utilization'
 MEM_UTIL = 'mem_utilization'
@@ -38,6 +39,7 @@ USER_NUM = 'user_num'
 
 ORDER = [
     PCI_BANDWIDTH,
+    PCI_BANDWIDTH_PERCENT,
     FAN_SPEED,
     GPU_UTIL,
     MEM_UTIL,
@@ -56,7 +58,22 @@ ORDER = [
 # https://docs.nvidia.com/gameworks/content/gameworkslibrary/coresdk/nvapi/group__gpupstate.html
 POWER_STATES = ['P' + str(i) for i in range(0, 16)]
 
-
+# PCI Transfer data rate in gigabits per second (Gb/s) per generation
+PCI_SPEED = {
+    "1": 2.5,
+    "2": 5,
+    "3": 8,
+    "4": 16,
+    "5": 32
+}
+# PCI encoding per generation
+PCI_ENCODING = {
+    "1": 2/10,
+    "2": 2/10,
+    "3": 2/130,
+    "4": 2/130,
+    "5": 2/130
+}
 
 def gpu_charts(gpu):
     fam = gpu.full_name()
@@ -68,6 +85,13 @@ def gpu_charts(gpu):
             ['tx_util', 'tx', 'absolute', 1, -1],
         ]
     },
+    PCI_BANDWIDTH_PERCENT: {
+        'options': [None, 'PCI Express Bandwidth Percent', 'percentage', fam, 'nvidia_smi.pci_bandwidth_percent', 'area'],
+        'lines': [
+            ['rx_util_percent', 'rx_percent'],
+            ['tx_util_percent', 'tx_percent'],
+        ]
+    },
     FAN_SPEED: {
         'options': [None, 'Fan Speed', 'percentage', fam, 'nvidia_smi.fan_speed', 'line'],
         'lines': [
@@ -327,6 +351,24 @@ class GPU:
         return 'gpu{0} {1}'.format(self.num, self.name())
 
     @handle_attr_error
+    def pci_link_gen(self):
+        return self.root.find('pci').find('pci_gpu_link_info').find('pcie_gen').find('max_link_gen').text
+
+    @handle_attr_error
+    def pci_link_width(self):
+        return self.root.find('pci').find('pci_gpu_link_info').find('link_widths').find('max_link_width').text.split('x')[0]
+
+    def pci_bw_max(self):
+        link_gen = self.pci_link_gen()
+        link_width = int(self.pci_link_width())
+        if link_gen not in PCI_SPEED or link_gen not in PCI_ENCODING or not link_width:
+            return None
+        # Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s.
+        # see details https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
+        # return max bandwidth in kilobytes per second (kB/s)
+        return (PCI_SPEED[link_gen] * link_width * (1 - PCI_ENCODING[link_gen]) - 1) * 1000 * 1000 / 8
+
+    @handle_attr_error
     def rx_util(self):
         return self.root.find('pci').find('rx_util').text.split()[0]
@@ -439,6 +481,15 @@ class GPU:
             'power_draw': self.power_draw(),
         }
 
+        pci_bw_max = self.pci_bw_max()
+        if not pci_bw_max:
+            data['rx_util_percent'] = 0
+            data['tx_util_percent'] = 0
+        else:
+            data['rx_util_percent'] = str(int(int(self.rx_util()) * 100 / self.pci_bw_max()))
+            data['tx_util_percent'] = str(int(int(self.tx_util()) * 100 / self.pci_bw_max()))
+
+
         for v in POWER_STATES:
             data['power_state_' + v.lower()] = 0
         p_state = self.power_state()
````
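For readers who want to sanity-check the new chart's math, here is a minimal standalone sketch of the patch's `pci_bw_max()` arithmetic with a worked Gen3 x16 example. The function name `pcie_max_bw_kbs` and the sample utilization value are illustrative, not part of the patch:

```python
# Minimal sketch of the patch's bandwidth-ceiling arithmetic (illustrative only).

# Per-lane transfer rate in Gb/s and line-encoding overhead per PCIe generation,
# mirroring the PCI_SPEED and PCI_ENCODING tables added by the patch.
PCI_SPEED = {"1": 2.5, "2": 5, "3": 8, "4": 16, "5": 32}
PCI_ENCODING = {"1": 2 / 10, "2": 2 / 10, "3": 2 / 130, "4": 2 / 130, "5": 2 / 130}


def pcie_max_bw_kbs(link_gen, link_width):
    """Max PCIe bandwidth in kB/s: SPEED * WIDTH * (1 - ENCODING) - 1 Gb/s."""
    if link_gen not in PCI_SPEED or not link_width:
        return None
    gbs = PCI_SPEED[link_gen] * link_width * (1 - PCI_ENCODING[link_gen]) - 1
    return gbs * 1000 * 1000 / 8  # Gb/s -> kB/s


# Worked example for a Gen3 x16 link:
# 8 Gb/s * 16 lanes * (1 - 2/130) - 1 ~= 125.03 Gb/s ~= 15.63 GB/s.
bw = pcie_max_bw_kbs("3", 16)
print(round(bw))  # 15628846 (kB/s)

# The new chart divides rx_util/tx_util (reported by nvidia-smi in kB/s)
# by this ceiling; e.g. a hypothetical rx_util sample of 1562884 kB/s:
print(int(1562884 * 100 / bw))  # 9 (truncated toward zero, as in the patch)
```

The 1 Gb/s subtracted at the end accounts for protocol overhead, per the NVIDIA article linked in the patch; the `2/10` versus `2/130` encoding entries correspond to the 8b/10b line encoding of PCIe Gen1/2 and the 128b/130b encoding of Gen3 and later.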