Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r--   collectors/python.d.plugin/nvidia_smi/Makefile.inc        |  13
-rw-r--r--   collectors/python.d.plugin/nvidia_smi/README.md           |  58
-rw-r--r--   collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 550
-rw-r--r--   collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf     |  68
4 files changed, 689 insertions, 0 deletions
diff --git a/collectors/python.d.plugin/nvidia_smi/Makefile.inc b/collectors/python.d.plugin/nvidia_smi/Makefile.inc
new file mode 100644
index 0000000..52fb25a
--- /dev/null
+++ b/collectors/python.d.plugin/nvidia_smi/Makefile.inc
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# THIS IS NOT A COMPLETE Makefile
+# IT IS INCLUDED BY ITS PARENT'S Makefile.am
+# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
+
+# install these files
+dist_python_DATA += nvidia_smi/nvidia_smi.chart.py
+dist_pythonconfig_DATA += nvidia_smi/nvidia_smi.conf
+
+# do not install these files, but include them in the distribution
+dist_noinst_DATA += nvidia_smi/README.md nvidia_smi/Makefile.inc
+
diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md
new file mode 100644
index 0000000..9bfb209
--- /dev/null
+++ b/collectors/python.d.plugin/nvidia_smi/README.md
@@ -0,0 +1,58 @@
+<!--
+title: "Nvidia GPU monitoring with Netdata"
+custom_edit_url: https://github.com/netdata/netdata/edit/master/collectors/python.d.plugin/nvidia_smi/README.md
+sidebar_label: "Nvidia GPUs"
+-->
+
+# Nvidia GPU monitoring with Netdata
+
+Monitors performance metrics (memory usage, fan speed, PCIe bandwidth utilization, temperature, etc.) using the `nvidia-smi` CLI tool.
+
+## Requirements and Notes
+
+- You must have the `nvidia-smi` tool installed and your NVIDIA GPU(s) must support it. Support mostly covers the newer high-end models used for AI/ML and crypto workloads, plus the professional range; read more about [nvidia_smi](https://developer.nvidia.com/nvidia-system-management-interface).
+- You must enable this plugin, as it is disabled by default due to minor performance issues.
+- On some systems, when the GPU is idle the `nvidia-smi` tool unloads, so there is added latency the next time it is queried. If you are running GPUs under constant workload, this is unlikely to be an issue.
+- Currently the `nvidia-smi` tool is queried via the CLI. Updating the plugin to use the NVIDIA C/C++ API directly should resolve this issue. See the discussion here: <https://github.com/netdata/netdata/pull/4357>
+- Contributions are welcome.
+- Make sure the `netdata` user can execute `/usr/bin/nvidia-smi`, or wherever your binary is.
+- If the `nvidia-smi` process [is not killed after a Netdata restart](https://github.com/netdata/netdata/issues/7143), you need to turn off `loop_mode`.
+- `poll_seconds` is how often, in seconds, the tool is polled; it must be an integer.
+
+## Charts
+
+It produces the following charts:
+
+- PCI Express Bandwidth Utilization in `KiB/s`
+- Fan Speed in `percentage`
+- GPU Utilization in `percentage`
+- Memory Bandwidth Utilization in `percentage`
+- Encoder/Decoder Utilization in `percentage`
+- Memory Usage in `MiB`
+- Temperature in `celsius`
+- Clock Frequencies in `MHz`
+- Power Utilization in `Watts`
+- Memory Used by Each Process in `MiB`
+- Memory Used by Each User in `MiB`
+- Number of Users on GPU in `num`
+
+## Configuration
+
+Edit the `python.d/nvidia_smi.conf` configuration file using `edit-config` from the Netdata [config directory](/docs/configure/nodes.md), which is typically at `/etc/netdata`.
+ +```bash +cd /etc/netdata # Replace this path with your Netdata config directory, if different +sudo ./edit-config python.d/nvidia_smi.conf +``` + +Sample: + +```yaml +loop_mode : yes +poll_seconds : 1 +exclude_zero_memory_users : yes +``` + +[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fnvidia_smi%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)](<>) diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py new file mode 100644 index 0000000..9c69586 --- /dev/null +++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py @@ -0,0 +1,550 @@ +# -*- coding: utf-8 -*- +# Description: nvidia-smi netdata python.d module +# Original Author: Steven Noonan (tycho) +# Author: Ilya Mashchenko (ilyam8) +# User Memory Stat Author: Guido Scatena (scatenag) + +import subprocess +import threading +import os +import pwd + +import xml.etree.ElementTree as et + +from bases.FrameworkServices.SimpleService import SimpleService +from bases.collection import find_binary + +disabled_by_default = True + +NVIDIA_SMI = 'nvidia-smi' + +EMPTY_ROW = '' +EMPTY_ROW_LIMIT = 500 +POLLER_BREAK_ROW = '</nvidia_smi_log>' + +PCI_BANDWIDTH = 'pci_bandwidth' +FAN_SPEED = 'fan_speed' +GPU_UTIL = 'gpu_utilization' +MEM_UTIL = 'mem_utilization' +ENCODER_UTIL = 'encoder_utilization' +MEM_USAGE = 'mem_usage' +TEMPERATURE = 'temperature' +CLOCKS = 'clocks' +POWER = 'power' +PROCESSES_MEM = 'processes_mem' +USER_MEM = 'user_mem' +USER_NUM = 'user_num' + +ORDER = [ + PCI_BANDWIDTH, + FAN_SPEED, + GPU_UTIL, + MEM_UTIL, + ENCODER_UTIL, + MEM_USAGE, + TEMPERATURE, + CLOCKS, + POWER, + PROCESSES_MEM, + USER_MEM, + USER_NUM, +] + + +def gpu_charts(gpu): + fam = gpu.full_name() + + charts = { + PCI_BANDWIDTH: { + 'options': [None, 'PCI Express Bandwidth Utilization', 'KiB/s', fam, 'nvidia_smi.pci_bandwidth', 'area'], + 'lines': [ + ['rx_util', 'rx', 'absolute', 1, 1], + ['tx_util', 'tx', 'absolute', 1, -1], + ] + }, + FAN_SPEED: { + 'options': [None, 'Fan Speed', 'percentage', fam, 'nvidia_smi.fan_speed', 'line'], + 'lines': [ + ['fan_speed', 'speed'], + ] + }, + GPU_UTIL: { + 'options': [None, 'GPU Utilization', 'percentage', fam, 'nvidia_smi.gpu_utilization', 'line'], + 'lines': [ + ['gpu_util', 'utilization'], + ] + }, + MEM_UTIL: { + 'options': [None, 'Memory Bandwidth Utilization', 'percentage', fam, 'nvidia_smi.mem_utilization', 'line'], + 'lines': [ + ['memory_util', 'utilization'], + ] + }, + ENCODER_UTIL: { + 'options': [None, 'Encoder/Decoder Utilization', 'percentage', fam, 'nvidia_smi.encoder_utilization', + 'line'], + 'lines': [ + ['encoder_util', 'encoder'], + ['decoder_util', 'decoder'], + ] + }, + MEM_USAGE: { + 'options': [None, 'Memory Usage', 'MiB', fam, 'nvidia_smi.memory_allocated', 'stacked'], + 'lines': [ + ['fb_memory_free', 'free'], + ['fb_memory_used', 'used'], + ] + }, + TEMPERATURE: { + 'options': [None, 'Temperature', 'celsius', fam, 'nvidia_smi.temperature', 'line'], + 'lines': [ + ['gpu_temp', 'temp'], + ] + }, + CLOCKS: { + 'options': [None, 'Clock Frequencies', 'MHz', fam, 'nvidia_smi.clocks', 'line'], + 'lines': [ + ['graphics_clock', 'graphics'], + ['video_clock', 'video'], + ['sm_clock', 'sm'], + ['mem_clock', 'mem'], + ] + }, + POWER: { + 'options': [None, 'Power Utilization', 'Watts', fam, 'nvidia_smi.power', 'line'], + 'lines': [ + 
['power_draw', 'power', 'absolute', 1, 100], + ] + }, + PROCESSES_MEM: { + 'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'], + 'lines': [] + }, + USER_MEM: { + 'options': [None, 'Memory Used by Each User', 'MiB', fam, 'nvidia_smi.user_mem', 'stacked'], + 'lines': [] + }, + USER_NUM: { + 'options': [None, 'Number of User on GPU', 'num', fam, 'nvidia_smi.user_num', 'line'], + 'lines': [ + ['user_num', 'users'], + ] + }, + } + + idx = gpu.num + + order = ['gpu{0}_{1}'.format(idx, v) for v in ORDER] + charts = dict(('gpu{0}_{1}'.format(idx, k), v) for k, v in charts.items()) + + for chart in charts.values(): + for line in chart['lines']: + line[0] = 'gpu{0}_{1}'.format(idx, line[0]) + + return order, charts + + +class NvidiaSMI: + def __init__(self): + self.command = find_binary(NVIDIA_SMI) + self.active_proc = None + + def run_once(self): + proc = subprocess.Popen([self.command, '-x', '-q'], stdout=subprocess.PIPE) + stdout, _ = proc.communicate() + return stdout + + def run_loop(self, interval): + if self.active_proc: + self.kill() + proc = subprocess.Popen([self.command, '-x', '-q', '-l', str(interval)], stdout=subprocess.PIPE) + self.active_proc = proc + return proc.stdout + + def kill(self): + if self.active_proc: + self.active_proc.kill() + self.active_proc = None + + +class NvidiaSMIPoller(threading.Thread): + def __init__(self, poll_interval): + threading.Thread.__init__(self) + self.daemon = True + + self.smi = NvidiaSMI() + self.interval = poll_interval + + self.lock = threading.RLock() + self.last_data = str() + self.exit = False + self.empty_rows = 0 + self.rows = list() + + def has_smi(self): + return bool(self.smi.command) + + def run_once(self): + return self.smi.run_once() + + def run(self): + out = self.smi.run_loop(self.interval) + + for row in out: + if self.exit or self.empty_rows > EMPTY_ROW_LIMIT: + break + self.process_row(row) + self.smi.kill() + + def process_row(self, row): + row = row.decode() + self.empty_rows += (row == EMPTY_ROW) + self.rows.append(row) + + if POLLER_BREAK_ROW in row: + self.lock.acquire() + self.last_data = '\n'.join(self.rows) + self.lock.release() + + self.rows = list() + self.empty_rows = 0 + + def is_started(self): + return self.ident is not None + + def shutdown(self): + self.exit = True + + def data(self): + self.lock.acquire() + data = self.last_data + self.lock.release() + return data + + +def handle_attr_error(method): + def on_call(*args, **kwargs): + try: + return method(*args, **kwargs) + except AttributeError: + return None + + return on_call + + +def handle_value_error(method): + def on_call(*args, **kwargs): + try: + return method(*args, **kwargs) + except ValueError: + return None + + return on_call + + +HOST_PREFIX = os.getenv('NETDATA_HOST_PREFIX') +ETC_PASSWD_PATH = '/etc/passwd' +PROC_PATH = '/proc' + +IS_INSIDE_DOCKER = False + +if HOST_PREFIX: + ETC_PASSWD_PATH = os.path.join(HOST_PREFIX, ETC_PASSWD_PATH[1:]) + PROC_PATH = os.path.join(HOST_PREFIX, PROC_PATH[1:]) + IS_INSIDE_DOCKER = True + + +def read_passwd_file(): + data = dict() + with open(ETC_PASSWD_PATH, 'r') as f: + for line in f: + line = line.strip() + if line.startswith("#"): + continue + fields = line.split(":") + # name, passwd, uid, gid, comment, home_dir, shell + if len(fields) != 7: + continue + # uid, guid + fields[2], fields[3] = int(fields[2]), int(fields[3]) + data[fields[2]] = fields + return data + + +def read_passwd_file_safe(): + try: + if IS_INSIDE_DOCKER: + return read_passwd_file() + return 
dict((k[2], k) for k in pwd.getpwall()) + except (OSError, IOError): + return dict() + + +def get_username_by_pid_safe(pid, passwd_file): + path = os.path.join(PROC_PATH, pid) + try: + uid = os.stat(path).st_uid + except (OSError, IOError): + return '' + + try: + return passwd_file[uid][0] + except KeyError: + return str(uid) + + +class GPU: + def __init__(self, num, root, exclude_zero_memory_users=False): + self.num = num + self.root = root + self.exclude_zero_memory_users = exclude_zero_memory_users + + def id(self): + return self.root.get('id') + + def name(self): + return self.root.find('product_name').text + + def full_name(self): + return 'gpu{0} {1}'.format(self.num, self.name()) + + @handle_attr_error + def rx_util(self): + return self.root.find('pci').find('rx_util').text.split()[0] + + @handle_attr_error + def tx_util(self): + return self.root.find('pci').find('tx_util').text.split()[0] + + @handle_attr_error + def fan_speed(self): + return self.root.find('fan_speed').text.split()[0] + + @handle_attr_error + def gpu_util(self): + return self.root.find('utilization').find('gpu_util').text.split()[0] + + @handle_attr_error + def memory_util(self): + return self.root.find('utilization').find('memory_util').text.split()[0] + + @handle_attr_error + def encoder_util(self): + return self.root.find('utilization').find('encoder_util').text.split()[0] + + @handle_attr_error + def decoder_util(self): + return self.root.find('utilization').find('decoder_util').text.split()[0] + + @handle_attr_error + def fb_memory_used(self): + return self.root.find('fb_memory_usage').find('used').text.split()[0] + + @handle_attr_error + def fb_memory_free(self): + return self.root.find('fb_memory_usage').find('free').text.split()[0] + + @handle_attr_error + def temperature(self): + return self.root.find('temperature').find('gpu_temp').text.split()[0] + + @handle_attr_error + def graphics_clock(self): + return self.root.find('clocks').find('graphics_clock').text.split()[0] + + @handle_attr_error + def video_clock(self): + return self.root.find('clocks').find('video_clock').text.split()[0] + + @handle_attr_error + def sm_clock(self): + return self.root.find('clocks').find('sm_clock').text.split()[0] + + @handle_attr_error + def mem_clock(self): + return self.root.find('clocks').find('mem_clock').text.split()[0] + + @handle_value_error + @handle_attr_error + def power_draw(self): + return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100 + + @handle_attr_error + def processes(self): + processes_info = self.root.find('processes').findall('process_info') + if not processes_info: + return list() + + passwd_file = read_passwd_file_safe() + processes = list() + + for info in processes_info: + pid = info.find('pid').text + processes.append({ + 'pid': int(pid), + 'process_name': info.find('process_name').text, + 'used_memory': int(info.find('used_memory').text.split()[0]), + 'username': get_username_by_pid_safe(pid, passwd_file), + }) + return processes + + def data(self): + data = { + 'rx_util': self.rx_util(), + 'tx_util': self.tx_util(), + 'fan_speed': self.fan_speed(), + 'gpu_util': self.gpu_util(), + 'memory_util': self.memory_util(), + 'encoder_util': self.encoder_util(), + 'decoder_util': self.decoder_util(), + 'fb_memory_used': self.fb_memory_used(), + 'fb_memory_free': self.fb_memory_free(), + 'gpu_temp': self.temperature(), + 'graphics_clock': self.graphics_clock(), + 'video_clock': self.video_clock(), + 'sm_clock': self.sm_clock(), + 'mem_clock': self.mem_clock(), + 
'power_draw': self.power_draw(), + } + processes = self.processes() or [] + users = set() + for p in processes: + data['process_mem_{0}'.format(p['pid'])] = p['used_memory'] + if p['username']: + if self.exclude_zero_memory_users and p['used_memory'] == 0: + continue + users.add(p['username']) + key = 'user_mem_{0}'.format(p['username']) + if key in data: + data[key] += p['used_memory'] + else: + data[key] = p['used_memory'] + data['user_num'] = len(users) + + return dict(('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items()) + + +class Service(SimpleService): + def __init__(self, configuration=None, name=None): + super(Service, self).__init__(configuration=configuration, name=name) + self.order = list() + self.definitions = dict() + self.loop_mode = configuration.get('loop_mode', True) + poll = int(configuration.get('poll_seconds', 1)) + self.exclude_zero_memory_users = configuration.get('exclude_zero_memory_users', False) + self.poller = NvidiaSMIPoller(poll) + + def get_data_loop_mode(self): + if not self.poller.is_started(): + self.poller.start() + + if not self.poller.is_alive(): + self.debug('poller is off') + return None + + return self.poller.data() + + def get_data_normal_mode(self): + return self.poller.run_once() + + def get_data(self): + if self.loop_mode: + last_data = self.get_data_loop_mode() + else: + last_data = self.get_data_normal_mode() + + if not last_data: + return None + + parsed = self.parse_xml(last_data) + if parsed is None: + return None + + data = dict() + for idx, root in enumerate(parsed.findall('gpu')): + gpu = GPU(idx, root, self.exclude_zero_memory_users) + gpu_data = gpu.data() + # self.debug(gpu_data) + gpu_data = dict((k, v) for k, v in gpu_data.items() if is_gpu_data_value_valid(v)) + data.update(gpu_data) + self.update_processes_mem_chart(gpu) + self.update_processes_user_mem_chart(gpu) + + return data or None + + def update_processes_mem_chart(self, gpu): + ps = gpu.processes() + if not ps: + return + chart = self.charts['gpu{0}_{1}'.format(gpu.num, PROCESSES_MEM)] + active_dim_ids = [] + for p in ps: + dim_id = 'gpu{0}_process_mem_{1}'.format(gpu.num, p['pid']) + active_dim_ids.append(dim_id) + if dim_id not in chart: + chart.add_dimension([dim_id, '{0} {1}'.format(p['pid'], p['process_name'])]) + for dim in chart: + if dim.id not in active_dim_ids: + chart.del_dimension(dim.id, hide=False) + + def update_processes_user_mem_chart(self, gpu): + ps = gpu.processes() + if not ps: + return + chart = self.charts['gpu{0}_{1}'.format(gpu.num, USER_MEM)] + active_dim_ids = [] + for p in ps: + if not p.get('username'): + continue + dim_id = 'gpu{0}_user_mem_{1}'.format(gpu.num, p['username']) + active_dim_ids.append(dim_id) + if dim_id not in chart: + chart.add_dimension([dim_id, '{0}'.format(p['username'])]) + + for dim in chart: + if dim.id not in active_dim_ids: + chart.del_dimension(dim.id, hide=False) + + def check(self): + if not self.poller.has_smi(): + self.error("couldn't find '{0}' binary".format(NVIDIA_SMI)) + return False + + raw_data = self.poller.run_once() + if not raw_data: + self.error("failed to invoke '{0}' binary".format(NVIDIA_SMI)) + return False + + parsed = self.parse_xml(raw_data) + if parsed is None: + return False + + gpus = parsed.findall('gpu') + if not gpus: + return False + + self.create_charts(gpus) + + return True + + def parse_xml(self, data): + try: + return et.fromstring(data) + except et.ParseError as error: + self.error('xml parse failed: "{0}", error: {1}'.format(data, error)) + + return None + + def 
create_charts(self, gpus): + for idx, root in enumerate(gpus): + order, charts = gpu_charts(GPU(idx, root)) + self.order.extend(order) + self.definitions.update(charts) + + +def is_gpu_data_value_valid(value): + try: + int(value) + except (TypeError, ValueError): + return False + return True diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf new file mode 100644 index 0000000..3d2a30d --- /dev/null +++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf @@ -0,0 +1,68 @@ +# netdata python.d.plugin configuration for nvidia_smi +# +# This file is in YaML format. Generally the format is: +# +# name: value +# +# There are 2 sections: +# - global variables +# - one or more JOBS +# +# JOBS allow you to collect values from multiple sources. +# Each source will have its own set of charts. +# +# JOB parameters have to be indented (using spaces only, example below). + +# ---------------------------------------------------------------------- +# Global Variables +# These variables set the defaults for all JOBs, however each JOB +# may define its own, overriding the defaults. + +# update_every sets the default data collection frequency. +# If unset, the python.d.plugin default is used. +# update_every: 1 + +# priority controls the order of charts at the netdata dashboard. +# Lower numbers move the charts towards the top of the page. +# If unset, the default for python.d.plugin is used. +# priority: 60000 + +# penalty indicates whether to apply penalty to update_every in case of failures. +# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes. +# penalty: yes + +# autodetection_retry sets the job re-check interval in seconds. +# The job is not deleted if check fails. +# Attempts to start the job are made once every autodetection_retry. +# This feature is disabled by default. +# autodetection_retry: 0 + +# ---------------------------------------------------------------------- +# JOBS (data collection sources) +# +# The default JOBS share the same *name*. JOBS with the same name +# are mutually exclusive. Only one of them will be allowed running at +# any time. This allows autodetection to try several alternatives and +# pick the one that works. +# +# Any number of jobs is supported. +# +# All python.d.plugin JOBS (for all its modules) support a set of +# predefined parameters. These are: +# +# job_name: +# name: myname # the JOB's name as it will appear at the +# # dashboard (by default is the job_name) +# # JOBs sharing a name are mutually exclusive +# update_every: 1 # the JOB's data collection frequency +# priority: 60000 # the JOB's order on the dashboard +# penalty: yes # the JOB's penalty +# autodetection_retry: 0 # the JOB's re-check interval in seconds +# +# Additionally to the above, example also supports the following: +# +# loop_mode: yes/no # default is yes. If set to yes `nvidia-smi` is executed in a separate thread using `-l` option. +# poll_seconds: SECONDS # default is 1. Sets the frequency of seconds the nvidia-smi tool is polled in loop mode. +# exclude_zero_memory_users: yes/no # default is no. Whether to collect users metrics with 0Mb memory allocation. +# +# ---------------------------------------------------------------------- |
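
---

For anyone reviewing or deploying this patch, the following standalone sketch (not part of the diff) mimics what the collector does at its core: it invokes `nvidia-smi -x -q` once, the same invocation used by `NvidiaSMI.run_once()` above, and reads a few of the XML fields that `nvidia_smi.chart.py` parses. The element names come from the code in this diff; the script name and the output format are illustrative only. Running it as the `netdata` user (for example via `sudo -u netdata python3 check_nvidia_smi_xml.py`) is a quick way to confirm the requirements listed in the README.

```python
# check_nvidia_smi_xml.py -- illustrative sketch, not part of this patch.
# Runs `nvidia-smi -x -q` and prints a few of the XML fields the collector
# reads for each GPU.
import subprocess
import xml.etree.ElementTree as et

raw = subprocess.check_output(['nvidia-smi', '-x', '-q'])
root = et.fromstring(raw)

for idx, gpu in enumerate(root.findall('gpu')):
    name = gpu.findtext('product_name')
    util = gpu.find('utilization').findtext('gpu_util')        # e.g. "42 %"
    mem_used = gpu.find('fb_memory_usage').findtext('used')    # e.g. "123 MiB"
    power = gpu.find('power_readings').findtext('power_draw')  # e.g. "31.20 W"
    print('gpu{0} ({1}): gpu_util={2}, fb_memory_used={3}, power_draw={4}'.format(
        idx, name, util, mem_used, power))
```

Note that some fields are absent on certain models or driver versions, which is why the collector wraps every accessor with the `handle_attr_error` decorator; in this sketch a missing section would simply raise an `AttributeError`.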