From a8220ab2d293bb7f4b014b79d16b2fb05090fa93 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Feb 2021 12:45:55 +0100
Subject: Adding upstream version 1.29.0.

Signed-off-by: Daniel Baumann
---
 collectors/python.d.plugin/nvidia_smi/README.md    |  58 ++++---
 .../python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 175 ++++++++++++++++++---
 .../python.d.plugin/nvidia_smi/nvidia_smi.conf     |   4 +-
 3 files changed, 192 insertions(+), 45 deletions(-)

(limited to 'collectors/python.d.plugin/nvidia_smi')

diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md
index 71e3e2889..9bfb2094b 100644
--- a/collectors/python.d.plugin/nvidia_smi/README.md
+++ b/collectors/python.d.plugin/nvidia_smi/README.md
@@ -1,42 +1,58 @@
-# nvidia_smi
+
-This module monitors the `nvidia-smi` cli tool.
+# Nvidia GPU monitoring with Netdata
-**Requirements and Notes:**
+Monitors performance metrics (memory usage, fan speed, PCIe bandwidth utilization, temperature, etc.) using the `nvidia-smi` CLI tool.
-- You must have the `nvidia-smi` tool installed and your NVIDIA GPU(s) must support the tool. Mostly the newer high end models used for AI / ML and Crypto or Pro range, read more about [nvidia_smi](https://developer.nvidia.com/nvidia-system-management-interface).
-- You must enable this plugin as its disabled by default due to minor performance issues.
+## Requirements and Notes
+- You must have the `nvidia-smi` tool installed and your NVIDIA GPU(s) must support the tool. These are mostly the newer high-end models used for AI/ML, crypto, or the Pro range; read more about [nvidia_smi](https://developer.nvidia.com/nvidia-system-management-interface).
+- You must enable this plugin, as it's disabled by default due to minor performance issues.
 - On some systems when the GPU is idle the `nvidia-smi` tool unloads and there is added latency again when it is next queried. If you are running GPUs under constant workload this isn't likely to be an issue.
- - Currently the `nvidia-smi` tool is being queried via cli. Updating the plugin to use the nvidia c/c++ API directly should resolve this issue. See discussion here:
- - Contributions are welcome.
- - Make sure `netdata` user can execute `/usr/bin/nvidia-smi` or wherever your binary is.
-
+- If the `nvidia-smi` process [is not killed after netdata restart](https://github.com/netdata/netdata/issues/7143), you need to turn off `loop_mode`.
 - `poll_seconds` is how often in seconds the tool is polled for as an integer.
-It produces:
+## Charts
+
+It produces the following charts:
-1. Per GPU
+
+- PCI Express Bandwidth Utilization in `KiB/s`
+- Fan Speed in `percentage`
+- GPU Utilization in `percentage`
+- Memory Bandwidth Utilization in `percentage`
+- Encoder/Decoder Utilization in `percentage`
+- Memory Usage in `MiB`
+- Temperature in `celsius`
+- Clock Frequencies in `MHz`
+- Power Utilization in `Watts`
+- Memory Used by Each Process in `MiB`
+- Memory Used by Each User in `MiB`
+- Number of Users on GPU in `num`
- - GPU utilization
- - memory allocation
- - memory utilization
- - fan speed
- - power usage
- - temperature
- - clock speed
- - PCI bandwidth
+## Configuration
-## configuration
+Edit the `python.d/nvidia_smi.conf` configuration file using `edit-config` from the Netdata [config
+directory](/docs/configure/nodes.md), which is typically at `/etc/netdata`.
+
+```bash
+cd /etc/netdata # Replace this path with your Netdata config directory, if different
+sudo ./edit-config python.d/nvidia_smi.conf
+```
 
 Sample:
 
 ```yaml
-poll_seconds: 1
+loop_mode : yes
+poll_seconds : 1
+exclude_zero_memory_users : yes
 ```
 
 [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fnvidia_smi%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>)
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 0bea268ef..9c69586dd 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -2,21 +2,22 @@
 # Description: nvidia-smi netdata python.d module
 # Original Author: Steven Noonan (tycho)
 # Author: Ilya Mashchenko (ilyam8)
+# User Memory Stat Author: Guido Scatena (scatenag)
 
 import subprocess
 import threading
+import os
+import pwd
+
 import xml.etree.ElementTree as et
 
-from bases.collection import find_binary
 from bases.FrameworkServices.SimpleService import SimpleService
+from bases.collection import find_binary
 
 disabled_by_default = True
-
 NVIDIA_SMI = 'nvidia-smi'
-BAD_VALUE = 'N/A'
-
 EMPTY_ROW = ''
 EMPTY_ROW_LIMIT = 500
 POLLER_BREAK_ROW = ''
@@ -31,6 +32,8 @@ TEMPERATURE = 'temperature'
 CLOCKS = 'clocks'
 POWER = 'power'
 PROCESSES_MEM = 'processes_mem'
+USER_MEM = 'user_mem'
+USER_NUM = 'user_num'
 
 ORDER = [
     PCI_BANDWIDTH,
@@ -43,6 +46,8 @@ ORDER = [
     CLOCKS,
     POWER,
     PROCESSES_MEM,
+    USER_MEM,
+    USER_NUM,
 ]
 
 def gpu_charts(gpu):
@@ -76,7 +81,8 @@ def gpu_charts(gpu):
         ]
     },
     ENCODER_UTIL: {
-        'options': [None, 'Encoder/Decoder Utilization', 'percentage', fam, 'nvidia_smi.encoder_utilization', 'line'],
+        'options': [None, 'Encoder/Decoder Utilization', 'percentage', fam, 'nvidia_smi.encoder_utilization',
+                    'line'],
         'lines': [
             ['encoder_util', 'encoder'],
             ['decoder_util', 'decoder'],
@@ -114,6 +120,16 @@
         'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
         'lines': []
     },
+    USER_MEM: {
+        'options': [None, 'Memory Used by Each User', 'MiB', fam, 'nvidia_smi.user_mem', 'stacked'],
+        'lines': []
+    },
+    USER_NUM: {
+        'options': [None, 'Number of Users on GPU', 'num', fam, 'nvidia_smi.user_num', 'line'],
+        'lines': [
+            ['user_num', 'users'],
+        ]
+    },
     }
 
     idx = gpu.num
@@ -212,6 +228,7 @@ def handle_attr_error(method):
             return method(*args, **kwargs)
         except AttributeError:
             return None
+
     return on_call
 
 
@@ -221,13 +238,66 @@ def handle_value_error(method):
             return method(*args, **kwargs)
         except ValueError:
             return None
+
     return on_call
 
 
+HOST_PREFIX = os.getenv('NETDATA_HOST_PREFIX')
+ETC_PASSWD_PATH = '/etc/passwd'
+PROC_PATH = '/proc'
+
+IS_INSIDE_DOCKER = False
+
+if HOST_PREFIX:
+    ETC_PASSWD_PATH = os.path.join(HOST_PREFIX, ETC_PASSWD_PATH[1:])
+    PROC_PATH = os.path.join(HOST_PREFIX, PROC_PATH[1:])
+    IS_INSIDE_DOCKER = True
+
+
+def read_passwd_file():
+    data = dict()
+    with open(ETC_PASSWD_PATH, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith("#"):
+                continue
+            fields = line.split(":")
+            # name, passwd, uid, gid, comment, home_dir, shell
+            if len(fields) != 7:
+                continue
+            # uid, gid
+            fields[2], fields[3] = int(fields[2]), int(fields[3])
+            data[fields[2]] = fields
+    return data
+
+
+def read_passwd_file_safe():
+    try:
+        if IS_INSIDE_DOCKER:
+            return read_passwd_file()
+        return dict((k[2], k) for k in pwd.getpwall())
+    except (OSError, IOError):
+        return dict()
+
+
+def get_username_by_pid_safe(pid, passwd_file):
+    path = os.path.join(PROC_PATH, pid)
+    try:
+        uid = os.stat(path).st_uid
+    except (OSError, IOError):
+        return ''
+
+    try:
+        return passwd_file[uid][0]
+    except KeyError:
+        return str(uid)
+
+
 class GPU:
-    def __init__(self, num, root):
+    def __init__(self, num, root, exclude_zero_memory_users=False):
         self.num = num
         self.root = root
+        self.exclude_zero_memory_users = exclude_zero_memory_users
 
     def id(self):
         return self.root.get('id')
@@ -301,15 +371,22 @@ class GPU:
 
     @handle_attr_error
     def processes(self):
-        p_nodes = self.root.find('processes').findall('process_info')
-        ps = []
-        for p in p_nodes:
-            ps.append({
-                'pid': p.find('pid').text,
-                'process_name': p.find('process_name').text,
-                'used_memory': int(p.find('used_memory').text.split()[0]),
+        processes_info = self.root.find('processes').findall('process_info')
+        if not processes_info:
+            return list()
+
+        passwd_file = read_passwd_file_safe()
+        processes = list()
+
+        for info in processes_info:
+            pid = info.find('pid').text
+            processes.append({
+                'pid': int(pid),
+                'process_name': info.find('process_name').text,
+                'used_memory': int(info.find('used_memory').text.split()[0]),
+                'username': get_username_by_pid_safe(pid, passwd_file),
             })
-        return ps
+        return processes
 
     def data(self):
         data = {
@@ -330,11 +407,21 @@ class GPU:
             'power_draw': self.power_draw(),
         }
         processes = self.processes() or []
-        data.update({'process_mem_{0}'.format(p['pid']): p['used_memory'] for p in processes})
-
-        return dict(
-            ('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE
-        )
+        users = set()
+        for p in processes:
+            data['process_mem_{0}'.format(p['pid'])] = p['used_memory']
+            if p['username']:
+                if self.exclude_zero_memory_users and p['used_memory'] == 0:
+                    continue
+                users.add(p['username'])
+                key = 'user_mem_{0}'.format(p['username'])
+                if key in data:
+                    data[key] += p['used_memory']
+                else:
+                    data[key] = p['used_memory']
+        data['user_num'] = len(users)
+
+        return dict(('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items())
 
 
 class Service(SimpleService):
@@ -342,10 +429,12 @@ class Service(SimpleService):
         super(Service, self).__init__(configuration=configuration, name=name)
         self.order = list()
         self.definitions = dict()
+        self.loop_mode = configuration.get('loop_mode', True)
         poll = int(configuration.get('poll_seconds', 1))
+        self.exclude_zero_memory_users = configuration.get('exclude_zero_memory_users', False)
         self.poller = NvidiaSMIPoller(poll)
 
-    def get_data(self):
+    def get_data_loop_mode(self):
         if not self.poller.is_started():
             self.poller.start()
 
@@ -353,7 +442,17 @@
             self.debug('poller is off')
             return None
 
-        last_data = self.poller.data()
+        return self.poller.data()
+
+    def get_data_normal_mode(self):
+        return self.poller.run_once()
+
+    def get_data(self):
+        if self.loop_mode:
+            last_data = self.get_data_loop_mode()
+        else:
+            last_data = self.get_data_normal_mode()
+
         if not last_data:
             return None
 
@@ -363,9 +462,13 @@
 
         data = dict()
         for idx, root in enumerate(parsed.findall('gpu')):
-            gpu = GPU(idx, root)
-            data.update(gpu.data())
+            gpu = GPU(idx, root, self.exclude_zero_memory_users)
+            gpu_data = gpu.data()
+            # self.debug(gpu_data)
+            gpu_data = dict((k, v) for k, v in gpu_data.items() if is_gpu_data_value_valid(v))
+            data.update(gpu_data)
             self.update_processes_mem_chart(gpu)
+            self.update_processes_user_mem_chart(gpu)
 
         return data or None
 
@@ -384,6 +487,24 @@
             if dim.id not in active_dim_ids:
                 chart.del_dimension(dim.id, hide=False)
 
+    def update_processes_user_mem_chart(self, gpu):
+        ps = gpu.processes()
+        if not ps:
+            return
+        chart = self.charts['gpu{0}_{1}'.format(gpu.num, USER_MEM)]
+        active_dim_ids = []
+        for p in ps:
+            if not p.get('username'):
+                continue
+            dim_id = 'gpu{0}_user_mem_{1}'.format(gpu.num, p['username'])
+            active_dim_ids.append(dim_id)
+            if dim_id not in chart:
+                chart.add_dimension([dim_id, '{0}'.format(p['username'])])
+
+        for dim in chart:
+            if dim.id not in active_dim_ids:
+                chart.del_dimension(dim.id, hide=False)
+
     def check(self):
         if not self.poller.has_smi():
             self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
@@ -419,3 +540,11 @@
             order, charts = gpu_charts(GPU(idx, root))
             self.order.extend(order)
             self.definitions.update(charts)
+
+
+def is_gpu_data_value_valid(value):
+    try:
+        int(value)
+    except (TypeError, ValueError):
+        return False
+    return True
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
index 53e544a5d..3d2a30d41 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
@@ -61,6 +61,8 @@
 #
 # Additionally to the above, example also supports the following:
 #
-# poll_seconds: SECONDS # default is 1. Sets the frequency of seconds the nvidia-smi tool is polled.
+# loop_mode: yes/no # default is yes. If set to yes, `nvidia-smi` is executed in a separate thread using the `-l` option.
+# poll_seconds: SECONDS # default is 1. Sets the frequency of seconds the nvidia-smi tool is polled in loop mode.
+# exclude_zero_memory_users: yes/no # default is no. Whether to exclude metrics for users whose processes use 0 MiB of memory.
 #
 # ----------------------------------------------------------------------
--
cgit v1.2.3
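Editor's note: the sketch below is not part of the patch. It is a standalone illustration of the per-user aggregation that this patch adds to `GPU.data()`: process memory is summed per username (the collector itself derives the username from `/proc/<pid>` ownership via `get_username_by_pid_safe()`), and the number of distinct users becomes the `user_num` metric. The sample process list and the helper name `aggregate_user_memory` are assumptions made for the example; the condensed `dict.get()` accumulation is equivalent to the if/else form used in the patch.

```python
# Standalone sketch of the per-user memory aggregation introduced in GPU.data().
# In the real collector, each entry comes from parsing the XML output of nvidia-smi.


def aggregate_user_memory(processes, exclude_zero_memory_users=False):
    """Return per-process and per-user memory sums plus the count of distinct users."""
    data = {}
    users = set()
    for p in processes:
        # Per-process memory is always reported.
        data['process_mem_{0}'.format(p['pid'])] = p['used_memory']
        if p['username']:
            # Optionally skip users whose process holds no memory.
            if exclude_zero_memory_users and p['used_memory'] == 0:
                continue
            users.add(p['username'])
            key = 'user_mem_{0}'.format(p['username'])
            data[key] = data.get(key, 0) + p['used_memory']
    data['user_num'] = len(users)
    return data


if __name__ == '__main__':
    # Hypothetical sample data for illustration only.
    sample = [
        {'pid': 1001, 'process_name': 'python', 'used_memory': 1500, 'username': 'alice'},
        {'pid': 1002, 'process_name': 'ffmpeg', 'used_memory': 300, 'username': 'bob'},
        {'pid': 1003, 'process_name': 'python', 'used_memory': 0, 'username': 'bob'},
    ]
    print(aggregate_user_memory(sample, exclude_zero_memory_users=True))
    # Expected keys include: process_mem_1001=1500, user_mem_alice=1500,
    # user_mem_bob=300 (the 0 MiB process is excluded), user_num=2
```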