author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-02-06 16:11:34 +0000
---|---|---
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2023-02-06 16:11:34 +0000
commit | d079b656b4719739b2247dcd9d46e9bec793095a |
tree | d2c950c70a776bcf697c963151c5bd959f8a9f03 /collectors/python.d.plugin/nvidia_smi |
parent | Releasing debian version 1.37.1-2. |
Merging upstream version 1.38.0.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r-- | collectors/python.d.plugin/nvidia_smi/README.md | 11
-rw-r--r-- | collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 53
2 files changed, 59 insertions, 5 deletions
````diff
diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md
index bb4169441..ce5473c26 100644
--- a/collectors/python.d.plugin/nvidia_smi/README.md
+++ b/collectors/python.d.plugin/nvidia_smi/README.md
@@ -1,14 +1,17 @@
 <!--
 title: "Nvidia GPU monitoring with Netdata"
-custom_edit_url: https://github.com/netdata/netdata/edit/master/collectors/python.d.plugin/nvidia_smi/README.md
-sidebar_label: "Nvidia GPUs"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/collectors/python.d.plugin/nvidia_smi/README.md"
+sidebar_label: "nvidia_smi-python.d.plugin"
+learn_status: "Published"
+learn_topic_type: "References"
+learn_rel_path: "References/Collectors references/Devices"
 -->
 
 # Nvidia GPU monitoring with Netdata
 
 Monitors performance metrics (memory usage, fan speed, pcie bandwidth utilization, temperature, etc.) using `nvidia-smi` cli tool.
 
-> **Warning**: this collector does not work when the Netdata Agent is [running in a container](https://learn.netdata.cloud/docs/agent/packaging/docker).
+> **Warning**: this collector does not work when the Netdata Agent is [running in a container](https://github.com/netdata/netdata/blob/master/packaging/docker/README.md).
 
 ## Requirements and Notes
@@ -48,7 +51,7 @@ It produces the following charts:
 ## Configuration
 
 Edit the `python.d/nvidia_smi.conf` configuration file using `edit-config` from the Netdata [config
-directory](/docs/configure/nodes.md), which is typically at `/etc/netdata`.
+directory](https://github.com/netdata/netdata/blob/master/docs/configure/nodes.md), which is typically at `/etc/netdata`.
 
 ```bash
 cd /etc/netdata # Replace this path with your Netdata config directory, if different
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 23e90e658..6affae7b8 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -22,6 +22,7 @@ EMPTY_ROW_LIMIT = 500
 POLLER_BREAK_ROW = '</nvidia_smi_log>'
 
 PCI_BANDWIDTH = 'pci_bandwidth'
+PCI_BANDWIDTH_PERCENT = 'pci_bandwidth_percent'
 FAN_SPEED = 'fan_speed'
 GPU_UTIL = 'gpu_utilization'
 MEM_UTIL = 'mem_utilization'
@@ -38,6 +39,7 @@ USER_NUM = 'user_num'
 
 ORDER = [
     PCI_BANDWIDTH,
+    PCI_BANDWIDTH_PERCENT,
     FAN_SPEED,
     GPU_UTIL,
     MEM_UTIL,
@@ -56,7 +58,22 @@ ORDER = [
 # https://docs.nvidia.com/gameworks/content/gameworkslibrary/coresdk/nvapi/group__gpupstate.html
 POWER_STATES = ['P' + str(i) for i in range(0, 16)]
 
-
+# PCI Transfer data rate in gigabits per second (Gb/s) per generation
+PCI_SPEED = {
+    "1": 2.5,
+    "2": 5,
+    "3": 8,
+    "4": 16,
+    "5": 32
+}
+# PCI encoding per generation
+PCI_ENCODING = {
+    "1": 2/10,
+    "2": 2/10,
+    "3": 2/130,
+    "4": 2/130,
+    "5": 2/130
+}
 
 def gpu_charts(gpu):
     fam = gpu.full_name()
@@ -68,6 +85,13 @@ def gpu_charts(gpu):
             ['tx_util', 'tx', 'absolute', 1, -1],
         ]
     },
+    PCI_BANDWIDTH_PERCENT: {
+        'options': [None, 'PCI Express Bandwidth Percent', 'percentage', fam, 'nvidia_smi.pci_bandwidth_percent', 'area'],
+        'lines': [
+            ['rx_util_percent', 'rx_percent'],
+            ['tx_util_percent', 'tx_percent'],
+        ]
+    },
     FAN_SPEED: {
         'options': [None, 'Fan Speed', 'percentage', fam, 'nvidia_smi.fan_speed', 'line'],
         'lines': [
@@ -327,6 +351,24 @@ class GPU:
         return 'gpu{0} {1}'.format(self.num, self.name())
 
     @handle_attr_error
+    def pci_link_gen(self):
+        return self.root.find('pci').find('pci_gpu_link_info').find('pcie_gen').find('max_link_gen').text
+
+    @handle_attr_error
+    def pci_link_width(self):
+        return self.root.find('pci').find('pci_gpu_link_info').find('link_widths').find('max_link_width').text.split('x')[0]
+
+    def pci_bw_max(self):
+        link_gen = self.pci_link_gen()
+        link_width = int(self.pci_link_width())
+        if link_gen not in PCI_SPEED or link_gen not in PCI_ENCODING or not link_width:
+            return None
+        # Maximum PCIe Bandwidth = SPEED * WIDTH * (1 - ENCODING) - 1Gb/s.
+        # see details https://enterprise-support.nvidia.com/s/article/understanding-pcie-configuration-for-maximum-performance
+        # return max bandwidth in kilobytes per second (kB/s)
+        return (PCI_SPEED[link_gen] * link_width * (1 - PCI_ENCODING[link_gen]) - 1) * 1000 * 1000 / 8
+
+    @handle_attr_error
     def rx_util(self):
         return self.root.find('pci').find('rx_util').text.split()[0]
@@ -439,6 +481,15 @@ class GPU:
             'power_draw': self.power_draw(),
         }
 
+        pci_bw_max = self.pci_bw_max()
+        if not pci_bw_max:
+            data['rx_util_percent'] = 0
+            data['tx_util_percent'] = 0
+        else:
+            data['rx_util_percent'] = str(int(int(self.rx_util()) * 100 / self.pci_bw_max()))
+            data['tx_util_percent'] = str(int(int(self.tx_util()) * 100 / self.pci_bw_max()))
+
+
         for v in POWER_STATES:
             data['power_state_' + v.lower()] = 0
         p_state = self.power_state()
````
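For readers who want to sanity-check the new chart's math, here is a minimal standalone sketch of the patch's `pci_bw_max()` arithmetic with a worked Gen3 x16 example. The function name `pcie_max_bw_kbs` and the sample utilization value are illustrative, not part of the patch:

```python
# Minimal sketch of the patch's bandwidth-ceiling arithmetic (illustrative only).

# Per-lane transfer rate in Gb/s and line-encoding overhead per PCIe generation,
# mirroring the PCI_SPEED and PCI_ENCODING tables added by the patch.
PCI_SPEED = {"1": 2.5, "2": 5, "3": 8, "4": 16, "5": 32}
PCI_ENCODING = {"1": 2 / 10, "2": 2 / 10, "3": 2 / 130, "4": 2 / 130, "5": 2 / 130}


def pcie_max_bw_kbs(link_gen, link_width):
    """Max PCIe bandwidth in kB/s: SPEED * WIDTH * (1 - ENCODING) - 1 Gb/s."""
    if link_gen not in PCI_SPEED or not link_width:
        return None
    gbs = PCI_SPEED[link_gen] * link_width * (1 - PCI_ENCODING[link_gen]) - 1
    return gbs * 1000 * 1000 / 8  # Gb/s -> kB/s


# Worked example for a Gen3 x16 link:
# 8 Gb/s * 16 lanes * (1 - 2/130) - 1 ~= 125.03 Gb/s ~= 15.63 GB/s.
bw = pcie_max_bw_kbs("3", 16)
print(round(bw))  # 15628846 (kB/s)

# The new chart divides rx_util/tx_util (reported by nvidia-smi in kB/s)
# by this ceiling; e.g. a hypothetical rx_util sample of 1562884 kB/s:
print(int(1562884 * 100 / bw))  # 9 (truncated toward zero, as in the patch)
```

The 1 Gb/s subtracted at the end accounts for protocol overhead, per the NVIDIA article linked in the patch; the `2/10` versus `2/130` encoding entries correspond to the 8b/10b line encoding of PCIe Gen1/2 and the 128b/130b encoding of Gen3 and later.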