From a8220ab2d293bb7f4b014b79d16b2fb05090fa93 Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Sun, 7 Feb 2021 12:45:55 +0100
Subject: Adding upstream version 1.29.0.

Signed-off-by: Daniel Baumann
---
 collectors/python.d.plugin/nvidia_smi/README.md    |  58 ++++---
 .../python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 175 ++++++++++++++++++---
 .../python.d.plugin/nvidia_smi/nvidia_smi.conf     |   4 +-
 3 files changed, 192 insertions(+), 45 deletions(-)

(limited to 'collectors/python.d.plugin/nvidia_smi')

diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md
index 71e3e2889..9bfb2094b 100644
--- a/collectors/python.d.plugin/nvidia_smi/README.md
+++ b/collectors/python.d.plugin/nvidia_smi/README.md
@@ -1,42 +1,58 @@
-# nvidia_smi
+
-This module monitors the `nvidia-smi` cli tool.
+# Nvidia GPU monitoring with Netdata
-**Requirements and Notes:**
+Monitors performance metrics (memory usage, fan speed, PCIe bandwidth utilization, temperature, etc.) using the `nvidia-smi` CLI tool.
-- You must have the `nvidia-smi` tool installed and your NVIDIA GPU(s) must support the tool. Mostly the newer high end models used for AI / ML and Crypto or Pro range, read more about [nvidia_smi](https://developer.nvidia.com/nvidia-system-management-interface).
-- You must enable this plugin as its disabled by default due to minor performance issues.
+## Requirements and Notes
+- You must have the `nvidia-smi` tool installed and your NVIDIA GPU(s) must support the tool. These are mostly the newer high-end models used for AI/ML, crypto, or the Pro range; read more about [nvidia_smi](https://developer.nvidia.com/nvidia-system-management-interface).
+- You must enable this plugin, as it's disabled by default due to minor performance issues.
 - On some systems when the GPU is idle the `nvidia-smi` tool unloads and there is added latency again when it is next queried. If you are running GPUs under constant workload this isn't likely to be an issue.
- - Currently the `nvidia-smi` tool is being queried via cli. Updating the plugin to use the nvidia c/c++ API directly should resolve this issue. See discussion here:
- - Contributions are welcome.
- - Make sure `netdata` user can execute `/usr/bin/nvidia-smi` or wherever your binary is.
-
+- If the `nvidia-smi` process [is not killed after netdata restart](https://github.com/netdata/netdata/issues/7143), you need to turn off `loop_mode`.
 - `poll_seconds` is how often in seconds the tool is polled for as an integer.
-It produces:
+## Charts
+
+It produces the following charts:
-1. Per GPU
+
+- PCI Express Bandwidth Utilization in `KiB/s`
+- Fan Speed in `percentage`
+- GPU Utilization in `percentage`
+- Memory Bandwidth Utilization in `percentage`
+- Encoder/Decoder Utilization in `percentage`
+- Memory Usage in `MiB`
+- Temperature in `celsius`
+- Clock Frequencies in `MHz`
+- Power Utilization in `Watts`
+- Memory Used by Each Process in `MiB`
+- Memory Used by Each User in `MiB`
+- Number of Users on GPU in `num`
- - GPU utilization
- - memory allocation
- - memory utilization
- - fan speed
- - power usage
- - temperature
- - clock speed
- - PCI bandwidth
+## Configuration
-## configuration
+Edit the `python.d/nvidia_smi.conf` configuration file using `edit-config` from the Netdata [config
+directory](/docs/configure/nodes.md), which is typically at `/etc/netdata`.
+
+```bash
+cd /etc/netdata # Replace this path with your Netdata config directory, if different
+sudo ./edit-config python.d/nvidia_smi.conf
+```
 
 Sample:
 
 ```yaml
-poll_seconds: 1
+loop_mode : yes
+poll_seconds : 1
+exclude_zero_memory_users : yes
 ```
 
 [![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fnvidia_smi%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>)
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 0bea268ef..9c69586dd 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -2,21 +2,22 @@
 # Description: nvidia-smi netdata python.d module
 # Original Author: Steven Noonan (tycho)
 # Author: Ilya Mashchenko (ilyam8)
+# User Memory Stat Author: Guido Scatena (scatenag)
 
 import subprocess
 import threading
+import os
+import pwd
+
 import xml.etree.ElementTree as et
 
-from bases.collection import find_binary
 from bases.FrameworkServices.SimpleService import SimpleService
+from bases.collection import find_binary
 
 disabled_by_default = True
-
 NVIDIA_SMI = 'nvidia-smi'
-BAD_VALUE = 'N/A'
-
 EMPTY_ROW = ''
 EMPTY_ROW_LIMIT = 500
 POLLER_BREAK_ROW = ''
@@ -31,6 +32,8 @@ TEMPERATURE = 'temperature'
 CLOCKS = 'clocks'
 POWER = 'power'
 PROCESSES_MEM = 'processes_mem'
+USER_MEM = 'user_mem'
+USER_NUM = 'user_num'
 
 ORDER = [
     PCI_BANDWIDTH,
@@ -43,6 +46,8 @@ ORDER = [
     CLOCKS,
     POWER,
     PROCESSES_MEM,
+    USER_MEM,
+    USER_NUM,
 ]
 
 def gpu_charts(gpu):
@@ -76,7 +81,8 @@ def gpu_charts(gpu):
         ]
     },
     ENCODER_UTIL: {
-        'options': [None, 'Encoder/Decoder Utilization', 'percentage', fam, 'nvidia_smi.encoder_utilization', 'line'],
+        'options': [None, 'Encoder/Decoder Utilization', 'percentage', fam, 'nvidia_smi.encoder_utilization',
+                    'line'],
         'lines': [
             ['encoder_util', 'encoder'],
             ['decoder_util', 'decoder'],
@@ -114,6 +120,16 @@
         'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
         'lines': []
     },
+    USER_MEM: {
+        'options': [None, 'Memory Used by Each User', 'MiB', fam, 'nvidia_smi.user_mem', 'stacked'],
+        'lines': []
+    },
+    USER_NUM: {
+        'options': [None, 'Number of Users on GPU', 'num', fam, 'nvidia_smi.user_num', 'line'],
+        'lines': [
+            ['user_num', 'users'],
+        ]
+    },
     }
 
     idx = gpu.num
@@ -212,6 +228,7 @@ def handle_attr_error(method):
             return method(*args, **kwargs)
         except AttributeError:
             return None
+
     return on_call
 
 
@@ -221,13 +238,66 @@ def handle_value_error(method):
             return method(*args, **kwargs)
         except ValueError:
             return None
+
     return on_call
 
 
+HOST_PREFIX = os.getenv('NETDATA_HOST_PREFIX')
+ETC_PASSWD_PATH = '/etc/passwd'
+PROC_PATH = '/proc'
+
+IS_INSIDE_DOCKER = False
+
+if HOST_PREFIX:
+    ETC_PASSWD_PATH = os.path.join(HOST_PREFIX, ETC_PASSWD_PATH[1:])
+    PROC_PATH = os.path.join(HOST_PREFIX, PROC_PATH[1:])
+    IS_INSIDE_DOCKER = True
+
+
+def read_passwd_file():
+    data = dict()
+    with open(ETC_PASSWD_PATH, 'r') as f:
+        for line in f:
+            line = line.strip()
+            if line.startswith("#"):
+                continue
+            fields = line.split(":")
+            # name, passwd, uid, gid, comment, home_dir, shell
+            if len(fields) != 7:
+                continue
+            # uid, gid
+            fields[2], fields[3] = int(fields[2]), int(fields[3])
+            data[fields[2]] = fields
+    return data
+
+
+def read_passwd_file_safe():
+    try:
+        if IS_INSIDE_DOCKER:
+            return read_passwd_file()
+        return dict((k[2], k) for k in pwd.getpwall())
+    except (OSError, IOError):
+        return dict()
+
+
+def get_username_by_pid_safe(pid, passwd_file):
+    path = os.path.join(PROC_PATH, pid)
+    try:
+        uid = os.stat(path).st_uid
+    except (OSError, IOError):
+        return ''
+
+    try:
+        return passwd_file[uid][0]
+    except KeyError:
+        return str(uid)
+
+
 class GPU:
-    def __init__(self, num, root):
+    def __init__(self, num, root, exclude_zero_memory_users=False):
         self.num = num
         self.root = root
+        self.exclude_zero_memory_users = exclude_zero_memory_users
 
     def id(self):
         return self.root.get('id')
@@ -301,15 +371,22 @@ class GPU:
 
     @handle_attr_error
     def processes(self):
-        p_nodes = self.root.find('processes').findall('process_info')
-        ps = []
-        for p in p_nodes:
-            ps.append({
-                'pid': p.find('pid').text,
-                'process_name': p.find('process_name').text,
-                'used_memory': int(p.find('used_memory').text.split()[0]),
+        processes_info = self.root.find('processes').findall('process_info')
+        if not processes_info:
+            return list()
+
+        passwd_file = read_passwd_file_safe()
+        processes = list()
+
+        for info in processes_info:
+            pid = info.find('pid').text
+            processes.append({
+                'pid': int(pid),
+                'process_name': info.find('process_name').text,
+                'used_memory': int(info.find('used_memory').text.split()[0]),
+                'username': get_username_by_pid_safe(pid, passwd_file),
             })
-        return ps
+        return processes
 
     def data(self):
         data = {
@@ -330,11 +407,21 @@ class GPU:
             'power_draw': self.power_draw(),
         }
         processes = self.processes() or []
-        data.update({'process_mem_{0}'.format(p['pid']): p['used_memory'] for p in processes})
-
-        return dict(
-            ('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE
-        )
+        users = set()
+        for p in processes:
+            data['process_mem_{0}'.format(p['pid'])] = p['used_memory']
+            if p['username']:
+                if self.exclude_zero_memory_users and p['used_memory'] == 0:
+                    continue
+                users.add(p['username'])
+                key = 'user_mem_{0}'.format(p['username'])
+                if key in data:
+                    data[key] += p['used_memory']
+                else:
+                    data[key] = p['used_memory']
+        data['user_num'] = len(users)
+
+        return dict(('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items())
 
 
 class Service(SimpleService):
@@ -342,10 +429,12 @@ class Service(SimpleService):
         super(Service, self).__init__(configuration=configuration, name=name)
         self.order = list()
         self.definitions = dict()
+        self.loop_mode = configuration.get('loop_mode', True)
         poll = int(configuration.get('poll_seconds', 1))
+        self.exclude_zero_memory_users = configuration.get('exclude_zero_memory_users', False)
         self.poller = NvidiaSMIPoller(poll)
 
-    def get_data(self):
+    def get_data_loop_mode(self):
         if not self.poller.is_started():
             self.poller.start()
 
@@ -353,7 +442,17 @@
             self.debug('poller is off')
             return None
 
-        last_data = self.poller.data()
+        return self.poller.data()
+
+    def get_data_normal_mode(self):
+        return self.poller.run_once()
+
+    def get_data(self):
+        if self.loop_mode:
+            last_data = self.get_data_loop_mode()
+        else:
+            last_data = self.get_data_normal_mode()
+
         if not last_data:
             return None
 
@@ -363,9 +462,13 @@
 
         data = dict()
         for idx, root in enumerate(parsed.findall('gpu')):
-            gpu = GPU(idx, root)
-            data.update(gpu.data())
+            gpu = GPU(idx, root, self.exclude_zero_memory_users)
+            gpu_data = gpu.data()
+            # self.debug(gpu_data)
+            gpu_data = dict((k, v) for k, v in gpu_data.items() if is_gpu_data_value_valid(v))
+            data.update(gpu_data)
             self.update_processes_mem_chart(gpu)
+            self.update_processes_user_mem_chart(gpu)
 
         return data or None
 
@@ -384,6 +487,24 @@
             if dim.id not in active_dim_ids:
                 chart.del_dimension(dim.id, hide=False)
 
+    def update_processes_user_mem_chart(self, gpu):
+        ps = gpu.processes()
+        if not ps:
+            return
+        chart = self.charts['gpu{0}_{1}'.format(gpu.num, USER_MEM)]
+        active_dim_ids = []
+        for p in ps:
+            if not p.get('username'):
+                continue
+            dim_id = 'gpu{0}_user_mem_{1}'.format(gpu.num, p['username'])
+            active_dim_ids.append(dim_id)
+            if dim_id not in chart:
+                chart.add_dimension([dim_id, '{0}'.format(p['username'])])
+
+        for dim in chart:
+            if dim.id not in active_dim_ids:
+                chart.del_dimension(dim.id, hide=False)
+
     def check(self):
         if not self.poller.has_smi():
             self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
@@ -419,3 +540,11 @@
             order, charts = gpu_charts(GPU(idx, root))
             self.order.extend(order)
             self.definitions.update(charts)
+
+
+def is_gpu_data_value_valid(value):
+    try:
+        int(value)
+    except (TypeError, ValueError):
+        return False
+    return True
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
index 53e544a5d..3d2a30d41 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
@@ -61,6 +61,8 @@
 #
 # Additionally to the above, example also supports the following:
 #
-# poll_seconds: SECONDS # default is 1. Sets the frequency of seconds the nvidia-smi tool is polled.
+# loop_mode: yes/no # default is yes. If set to yes, `nvidia-smi` is executed in a separate thread using the `-l` option.
+# poll_seconds: SECONDS # default is 1. Sets the frequency of seconds the nvidia-smi tool is polled in loop mode.
+# exclude_zero_memory_users: yes/no # default is no. Whether to exclude metrics for users whose processes use 0 MiB of memory.
 #
 # ----------------------------------------------------------------------
--
cgit v1.2.3
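Editor's note: the sketch below is not part of the patch. It is a standalone illustration of the per-user aggregation that this patch adds to `GPU.data()`: process memory is summed per username (the collector itself derives the username from `/proc/<pid>` ownership via `get_username_by_pid_safe()`), and the number of distinct users becomes the `user_num` metric. The sample process list and the helper name `aggregate_user_memory` are assumptions made for the example; the condensed `dict.get()` accumulation is equivalent to the if/else form used in the patch.

```python
# Standalone sketch of the per-user memory aggregation introduced in GPU.data().
# In the real collector, each entry comes from parsing the XML output of nvidia-smi.


def aggregate_user_memory(processes, exclude_zero_memory_users=False):
    """Return per-process and per-user memory sums plus the count of distinct users."""
    data = {}
    users = set()
    for p in processes:
        # Per-process memory is always reported.
        data['process_mem_{0}'.format(p['pid'])] = p['used_memory']
        if p['username']:
            # Optionally skip users whose process holds no memory.
            if exclude_zero_memory_users and p['used_memory'] == 0:
                continue
            users.add(p['username'])
            key = 'user_mem_{0}'.format(p['username'])
            data[key] = data.get(key, 0) + p['used_memory']
    data['user_num'] = len(users)
    return data


if __name__ == '__main__':
    # Hypothetical sample data for illustration only.
    sample = [
        {'pid': 1001, 'process_name': 'python', 'used_memory': 1500, 'username': 'alice'},
        {'pid': 1002, 'process_name': 'ffmpeg', 'used_memory': 300, 'username': 'bob'},
        {'pid': 1003, 'process_name': 'python', 'used_memory': 0, 'username': 'bob'},
    ]
    print(aggregate_user_memory(sample, exclude_zero_memory_users=True))
    # Expected keys include: process_mem_1001=1500, user_mem_alice=1500,
    # user_mem_bob=300 (the 0 MiB process is excluded), user_num=2
```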