summaryrefslogtreecommitdiffstats
path: root/collectors/python.d.plugin/nvidia_smi
diff options
context:
space:
mode:
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r--collectors/python.d.plugin/nvidia_smi/Makefile.inc12
-rw-r--r--collectors/python.d.plugin/nvidia_smi/README.md39
-rw-r--r--collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py361
-rw-r--r--collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf68
4 files changed, 480 insertions, 0 deletions
diff --git a/collectors/python.d.plugin/nvidia_smi/Makefile.inc b/collectors/python.d.plugin/nvidia_smi/Makefile.inc
new file mode 100644
index 000000000..c23bd2517
--- /dev/null
+++ b/collectors/python.d.plugin/nvidia_smi/Makefile.inc
@@ -0,0 +1,12 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# THIS IS NOT A COMPLETE Makefile
+# IT IS INCLUDED BY ITS PARENT'S Makefile.am
+# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
+
+# install these files: the collector module and its configuration file
+dist_python_DATA += nvidia_smi/nvidia_smi.chart.py
+dist_pythonconfig_DATA += nvidia_smi/nvidia_smi.conf
+
+# do not install these files, but include them in the distribution
+dist_noinst_DATA += nvidia_smi/README.md nvidia_smi/Makefile.inc
diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md
new file mode 100644
index 000000000..06acfc297
--- /dev/null
+++ b/collectors/python.d.plugin/nvidia_smi/README.md
@@ -0,0 +1,39 @@
+# nvidia_smi
+
+This module monitors the `nvidia-smi` cli tool.
+
+**Requirements and Notes:**
+
+ * You must have the `nvidia-smi` tool installed and your NVIDIA GPU(s) must support the tool. Support is mostly limited to the newer high-end models used for AI/ML and crypto, or the Pro range; read more about [nvidia_smi](https://developer.nvidia.com/nvidia-system-management-interface).
+
+ * You must enable this plugin, as it is disabled by default due to minor performance issues.
+
+ * On some systems, when the GPU is idle the `nvidia-smi` tool unloads, and there is added latency when it is next queried. If you are running GPUs under constant workload, this isn't likely to be an issue.
+
+ * Currently the `nvidia-smi` tool is queried via the CLI. Updating the plugin to use the NVIDIA C/C++ API directly should resolve this issue. See the discussion here: https://github.com/netdata/netdata/pull/4357
+
+ * Contributions are welcome.
+
+ * Make sure the `netdata` user can execute `/usr/bin/nvidia-smi`, or wherever your binary is located.
+
+ * `poll_seconds` is an integer that sets how often, in seconds, the tool is polled.
+
+It produces:
+
+1. Per GPU
+ * GPU utilization
+ * memory allocation
+ * memory utilization
+ * fan speed
+ * power usage
+ * temperature
+ * clock speed
+ * PCI bandwidth
+
+### configuration
+
+Sample:
+
+```yaml
+poll_seconds: 1
+``` \ No newline at end of file
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
new file mode 100644
index 000000000..c3fff6219
--- /dev/null
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -0,0 +1,361 @@
+# -*- coding: utf-8 -*-
+# Description: nvidia-smi netdata python.d module
+# Original Author: Steven Noonan (tycho)
+# Author: Ilya Mashchenko (l2isbad)
+
+import subprocess
+import threading
+import xml.etree.ElementTree as et
+
+from bases.collection import find_binary
+from bases.FrameworkServices.SimpleService import SimpleService
+
+# The README states this module is disabled by default due to minor performance
+# issues; presumably python.d.plugin reads this flag - TODO confirm.
+disabled_by_default = True
+
+
+# Name of the binary that NvidiaSMI looks up with find_binary().
+NVIDIA_SMI = 'nvidia-smi'
+
+# The poller thread stops reading once it has counted more than
+# EMPTY_ROW_LIMIT rows equal to EMPTY_ROW.
+EMPTY_ROW = ''
+EMPTY_ROW_LIMIT = 500
+# A row containing this closing tag ends one XML document; the poller then
+# publishes the rows collected so far as its latest data.
+POLLER_BREAK_ROW = '</nvidia_smi_log>'
+
+# Chart names; gpu_charts() prefixes each with 'gpu<num>_' to form the chart id.
+PCI_BANDWIDTH = 'pci_bandwidth'
+FAN_SPEED = 'fan_speed'
+GPU_UTIL = 'gpu_utilization'
+MEM_UTIL = 'mem_utilization'
+ENCODER_UTIL = 'encoder_utilization'
+MEM_ALLOCATED = 'mem_allocated'
+TEMPERATURE = 'temperature'
+CLOCKS = 'clocks'
+POWER = 'power'
+
+# Sequence in which gpu_charts() lists the charts of every GPU.
+ORDER = [
+ PCI_BANDWIDTH,
+ FAN_SPEED,
+ GPU_UTIL,
+ MEM_UTIL,
+ ENCODER_UTIL,
+ MEM_ALLOCATED,
+ TEMPERATURE,
+ CLOCKS,
+ POWER,
+]
+
+
+def gpu_charts(gpu):
+    """
+    Build the chart definitions for a single GPU.
+
+    :param gpu: GPU instance; only ``gpu.full_name()`` (chart family) and
+        ``gpu.num`` (id prefix) are used.
+    :return: tuple ``(order, charts)``. ``order`` lists the chart ids
+        ('gpu<num>_<chart>') in the sequence given by ORDER; ``charts`` maps
+        each of those ids to its definition. Dimension ids get the same
+        'gpu<num>_' prefix, so they match the keys built by ``GPU.data()``.
+    """
+    fam = gpu.full_name()
+
+    # Each entry of 'lines' is [id, name, algorithm, multiplier, divisor];
+    # trailing items may be omitted (layout per the python.d chart definition
+    # format - see the netdata module documentation).
+    charts = {
+        PCI_BANDWIDTH: {
+            'options': [None, 'PCI Express Bandwidth Utilization', 'KB/s', fam, 'nvidia_smi.pci_bandwidth', 'area'],
+            'lines': [
+                ['rx_util', 'rx', 'absolute', 1, 1],
+                ['tx_util', 'tx', 'absolute', 1, -1],
+            ]
+        },
+        FAN_SPEED: {
+            'options': [None, 'Fan Speed', '%', fam, 'nvidia_smi.fan_speed', 'line'],
+            'lines': [
+                ['fan_speed', 'speed'],
+            ]
+        },
+        GPU_UTIL: {
+            'options': [None, 'GPU Utilization', '%', fam, 'nvidia_smi.gpu_utilization', 'line'],
+            'lines': [
+                ['gpu_util', 'utilization'],
+            ]
+        },
+        MEM_UTIL: {
+            'options': [None, 'Memory Bandwidth Utilization', '%', fam, 'nvidia_smi.mem_utilization', 'line'],
+            'lines': [
+                ['memory_util', 'utilization'],
+            ]
+        },
+        ENCODER_UTIL: {
+            'options': [None, 'Encoder/Decoder Utilization', '%', fam, 'nvidia_smi.encoder_utilization', 'line'],
+            'lines': [
+                ['encoder_util', 'encoder'],
+                ['decoder_util', 'decoder'],
+            ]
+        },
+        MEM_ALLOCATED: {
+            'options': [None, 'Memory Allocated', 'MB', fam, 'nvidia_smi.memory_allocated', 'line'],
+            'lines': [
+                ['fb_memory_usage', 'used'],
+            ]
+        },
+        TEMPERATURE: {
+            'options': [None, 'Temperature', 'celsius', fam, 'nvidia_smi.temperature', 'line'],
+            'lines': [
+                ['gpu_temp', 'temp'],
+            ]
+        },
+        CLOCKS: {
+            'options': [None, 'Clock Frequencies', 'MHz', fam, 'nvidia_smi.clocks', 'line'],
+            'lines': [
+                ['graphics_clock', 'graphics'],
+                ['video_clock', 'video'],
+                ['sm_clock', 'sm'],
+                ['mem_clock', 'mem'],
+            ]
+        },
+        POWER: {
+            'options': [None, 'Power Utilization', 'Watts', fam, 'nvidia_smi.power', 'line'],
+            'lines': [
+                # GPU.power_draw() reports watts * 100, so the divisor of 100
+                # restores watts. The algorithm must be spelled out: without
+                # it, 1 lands in the algorithm slot and 100 in the multiplier.
+                ['power_draw', 'power', 'absolute', 1, 100],
+            ]
+        },
+    }
+
+    idx = gpu.num
+
+    order = ['gpu{0}_{1}'.format(idx, v) for v in ORDER]
+    charts = dict(('gpu{0}_{1}'.format(idx, k), v) for k, v in charts.items())
+
+    # The definitions above are built fresh on every call, so prefixing the
+    # dimension ids in place does not leak between GPUs.
+    for chart in charts.values():
+        for line in chart['lines']:
+            line[0] = 'gpu{0}_{1}'.format(idx, line[0])
+
+    return order, charts
+
+
+class NvidiaSMI:
+    """
+    Runs the ``nvidia-smi`` binary, either once or as a long-lived child
+    process in loop mode, always asking for XML output (``-x -q``).
+    """
+
+    def __init__(self):
+        # Result of the binary lookup; has_smi() in the poller treats a falsy
+        # value as "binary not found".
+        self.command = find_binary(NVIDIA_SMI)
+        # Popen object of the loop-mode child, or None when none is running.
+        self.active_proc = None
+
+    def run_once(self):
+        """
+        Run ``nvidia-smi -x -q`` and wait for it to finish.
+
+        :return: the captured stdout, or None when the binary could not be
+            executed (callers already treat an empty result as a failure).
+        """
+        try:
+            proc = subprocess.Popen([self.command, '-x', '-q'], stdout=subprocess.PIPE)
+        except OSError:
+            return None
+        stdout, _ = proc.communicate()
+        return stdout
+
+    def run_loop(self, interval):
+        """
+        Start ``nvidia-smi -x -q -l <interval>``, replacing any child that is
+        still registered.
+
+        :param interval: value passed to the ``-l`` option.
+        :return: the stdout pipe of the new child process.
+        """
+        if self.active_proc:
+            self.kill()
+        proc = subprocess.Popen([self.command, '-x', '-q', '-l', str(interval)], stdout=subprocess.PIPE)
+        self.active_proc = proc
+        return proc.stdout
+
+    def kill(self):
+        """Kill the loop-mode child, if any, and release its resources."""
+        proc, self.active_proc = self.active_proc, None
+        if proc is None:
+            return
+        try:
+            proc.kill()
+        except OSError:
+            # the child has already exited; it still needs to be reaped below
+            pass
+        if proc.stdout:
+            proc.stdout.close()
+        # Reap the child so it does not linger as a zombie.
+        proc.wait()
+
+
+class NvidiaSMIPoller(threading.Thread):
+    """
+    Daemon thread that reads the output of ``nvidia-smi`` running in loop mode
+    and keeps the most recent complete XML document available through data().
+
+    :param poll_interval: interval handed to ``NvidiaSMI.run_loop``.
+    """
+
+    def __init__(self, poll_interval):
+        threading.Thread.__init__(self)
+        self.daemon = True
+
+        self.smi = NvidiaSMI()
+        self.interval = poll_interval
+
+        # guards last_data, which is written by this thread and read via data()
+        self.lock = threading.RLock()
+        self.last_data = str()
+        self.exit = False
+        self.empty_rows = 0
+        self.rows = list()
+
+    def has_smi(self):
+        """Return True when the nvidia-smi binary was found."""
+        return bool(self.smi.command)
+
+    def run_once(self):
+        """Run nvidia-smi a single time and return what NvidiaSMI.run_once returns."""
+        return self.smi.run_once()
+
+    def run(self):
+        """
+        Thread body: feed every output row to process_row() until shutdown()
+        was requested, too many empty rows were seen, or the stream ends.
+        """
+        try:
+            out = self.smi.run_loop(self.interval)
+
+            for row in out:
+                if self.exit or self.empty_rows > EMPTY_ROW_LIMIT:
+                    break
+                self.process_row(row)
+        finally:
+            # also reached when reading or decoding raises, so the child
+            # process is never left running without a reader
+            self.smi.kill()
+
+    def process_row(self, row):
+        """
+        Collect one raw output row; when it closes an XML document, publish
+        the collected rows as the latest data and start a new document.
+
+        :param row: one line of nvidia-smi output, as bytes.
+        """
+        row = row.decode()
+        # Rows read from the pipe keep their line ending, so compare the
+        # stripped row; otherwise the counter could never advance.
+        self.empty_rows += (row.strip() == EMPTY_ROW)
+        self.rows.append(row)
+
+        if POLLER_BREAK_ROW in row:
+            with self.lock:
+                self.last_data = '\n'.join(self.rows)
+
+            self.rows = list()
+            self.empty_rows = 0
+
+    def is_started(self):
+        """Return True once start() has been called on this thread."""
+        return self.ident is not None
+
+    def shutdown(self):
+        """Ask run() to stop before it processes the next row."""
+        self.exit = True
+
+    def data(self):
+        """Return the latest complete XML document (empty string if none yet)."""
+        with self.lock:
+            return self.last_data
+
+
+def handle_attr_error(method):
+    """
+    Decorator that turns an AttributeError raised by ``method`` into a None
+    result.
+
+    The GPU accessors chain ``find(...)`` calls; ``find`` returns None for a
+    missing element, so the next attribute access raises AttributeError. With
+    this decorator such a reading is simply reported as missing.
+
+    :param method: callable to wrap.
+    :return: wrapper with the same name and docstring as ``method``.
+    """
+    # local import keeps this block self-contained
+    from functools import wraps
+
+    @wraps(method)
+    def on_call(*args, **kwargs):
+        try:
+            return method(*args, **kwargs)
+        except AttributeError:
+            return None
+    return on_call
+
+
+class GPU:
+    """
+    Accessor for the readings of one ``gpu`` element of the nvidia-smi XML.
+
+    :param num: index of the GPU; used as the 'gpu<num>' prefix of names and keys.
+    :param root: element the readings are looked up in (``find`` and ``get``
+        are called on it).
+
+    Every metric method returns the first whitespace-separated token of the
+    element text (which drops a trailing unit), or None when the element is
+    missing, blank or not a number.
+    """
+
+    def __init__(self, num, root):
+        self.num = num
+        self.root = root
+
+    def id(self):
+        """Return the 'id' attribute of the element."""
+        return self.root.get('id')
+
+    def name(self):
+        """Return the text of the 'product_name' child."""
+        return self.root.find('product_name').text
+
+    def full_name(self):
+        """Return 'gpu<num> <product name>'; gpu_charts uses it as chart family."""
+        return 'gpu{0} {1}'.format(self.num, self.name())
+
+    def _numeric_token(self, *path):
+        """
+        Follow ``path`` (a sequence of child tags) from the root and return the
+        first token of the text found there.
+
+        Returns None when the text is blank or the token is not a number
+        (nvidia-smi is expected to print placeholders such as 'N/A' for
+        readings a device does not provide - TODO confirm against its docs),
+        so that callers never pass a non-numeric value on as a metric.
+        Raises AttributeError when an element on the path is missing; the
+        public accessors rely on handle_attr_error to map that to None.
+        """
+        node = self.root
+        for tag in path:
+            node = node.find(tag)
+        tokens = node.text.split()
+        if not tokens:
+            return None
+        try:
+            float(tokens[0])
+        except ValueError:
+            return None
+        return tokens[0]
+
+    @handle_attr_error
+    def rx_util(self):
+        return self._numeric_token('pci', 'rx_util')
+
+    @handle_attr_error
+    def tx_util(self):
+        return self._numeric_token('pci', 'tx_util')
+
+    @handle_attr_error
+    def fan_speed(self):
+        return self._numeric_token('fan_speed')
+
+    @handle_attr_error
+    def gpu_util(self):
+        return self._numeric_token('utilization', 'gpu_util')
+
+    @handle_attr_error
+    def memory_util(self):
+        return self._numeric_token('utilization', 'memory_util')
+
+    @handle_attr_error
+    def encoder_util(self):
+        return self._numeric_token('utilization', 'encoder_util')
+
+    @handle_attr_error
+    def decoder_util(self):
+        return self._numeric_token('utilization', 'decoder_util')
+
+    @handle_attr_error
+    def fb_memory_usage(self):
+        return self._numeric_token('fb_memory_usage', 'used')
+
+    @handle_attr_error
+    def temperature(self):
+        return self._numeric_token('temperature', 'gpu_temp')
+
+    @handle_attr_error
+    def graphics_clock(self):
+        return self._numeric_token('clocks', 'graphics_clock')
+
+    @handle_attr_error
+    def video_clock(self):
+        return self._numeric_token('clocks', 'video_clock')
+
+    @handle_attr_error
+    def sm_clock(self):
+        return self._numeric_token('clocks', 'sm_clock')
+
+    @handle_attr_error
+    def mem_clock(self):
+        return self._numeric_token('clocks', 'mem_clock')
+
+    @handle_attr_error
+    def power_draw(self):
+        """Return the power draw multiplied by 100 (the chart divides it back)."""
+        value = self._numeric_token('power_readings', 'power_draw')
+        if value is None:
+            return None
+        return float(value) * 100
+
+    def data(self):
+        """
+        Collect all readings of this GPU.
+
+        :return: dict mapping 'gpu<num>_<metric>' to the reading; metrics
+            without a usable value are left out.
+        """
+        data = {
+            'rx_util': self.rx_util(),
+            'tx_util': self.tx_util(),
+            'fan_speed': self.fan_speed(),
+            'gpu_util': self.gpu_util(),
+            'memory_util': self.memory_util(),
+            'encoder_util': self.encoder_util(),
+            'decoder_util': self.decoder_util(),
+            'fb_memory_usage': self.fb_memory_usage(),
+            'gpu_temp': self.temperature(),
+            'graphics_clock': self.graphics_clock(),
+            'video_clock': self.video_clock(),
+            'sm_clock': self.sm_clock(),
+            'mem_clock': self.mem_clock(),
+            'power_draw': self.power_draw(),
+        }
+
+        return dict(('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None)
+
+
+class Service(SimpleService):
+    """
+    python.d service that charts the readings of every GPU reported by
+    nvidia-smi. A background NvidiaSMIPoller thread does the querying;
+    get_data() only parses the latest document it has published.
+    """
+
+    def __init__(self, configuration=None, name=None):
+        super(Service, self).__init__(configuration=configuration, name=name)
+        self.order = list()
+        self.definitions = dict()
+
+        self.poller = NvidiaSMIPoller(self._poll_interval(configuration))
+
+    @staticmethod
+    def _poll_interval(configuration):
+        """
+        Read 'poll_seconds' from the job configuration.
+
+        Falls back to 1 when the configuration is missing or the value is not
+        an integer, and never returns less than 1 (a zero or negative loop
+        interval is assumed not to be meaningful to nvidia-smi).
+        """
+        try:
+            poll = int((configuration or dict()).get('poll_seconds', 1))
+        except (TypeError, ValueError):
+            return 1
+        return max(poll, 1)
+
+    def get_data(self):
+        """
+        :return: dict of 'gpu<num>_<metric>' readings for all GPUs, or None
+            when the poller is not running, has not produced a complete
+            document yet, or the document yields no readings.
+        """
+        if not self.poller.is_alive():
+            self.debug('poller is off')
+            return None
+
+        last_data = self.poller.data()
+        if not last_data:
+            # nothing published yet; parsing an empty string would only log
+            # a parse error on every update
+            self.debug('no data from the poller yet')
+            return None
+
+        parsed = self.parse_xml(last_data)
+        if parsed is None:
+            return None
+
+        data = dict()
+        for idx, root in enumerate(parsed.findall('gpu')):
+            data.update(GPU(idx, root).data())
+
+        return data or None
+
+    def check(self):
+        """
+        Verify that nvidia-smi can be run and reports at least one GPU; on
+        success create the charts and start the poller thread.
+
+        :return: True when the service is ready to collect data.
+        """
+        if not self.poller.has_smi():
+            self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
+            return False
+
+        try:
+            raw_data = self.poller.run_once()
+        except OSError:
+            # the binary exists but could not be executed
+            raw_data = None
+        if not raw_data:
+            self.error("failed to invoke '{0}' binary".format(NVIDIA_SMI))
+            return False
+
+        parsed = self.parse_xml(raw_data)
+        if parsed is None:
+            return False
+
+        gpus = parsed.findall('gpu')
+        if not gpus:
+            return False
+
+        self.create_charts(gpus)
+        self.poller.start()
+
+        return True
+
+    def parse_xml(self, data):
+        """Parse an XML document; log the error and return None when it is malformed."""
+        try:
+            return et.fromstring(data)
+        except et.ParseError as error:
+            self.error(error)
+
+        return None
+
+    def create_charts(self, gpus):
+        """Append the chart order and definitions of every GPU element in ``gpus``."""
+        for idx, root in enumerate(gpus):
+            order, charts = gpu_charts(GPU(idx, root))
+            self.order.extend(order)
+            self.definitions.update(charts)
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
new file mode 100644
index 000000000..e1bcf3faf
--- /dev/null
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
@@ -0,0 +1,68 @@
+# netdata python.d.plugin configuration for nvidia_smi
+#
+# This file is in YAML format. Generally the format is:
+#
+# name: value
+#
+# There are 2 sections:
+# - global variables
+# - one or more JOBS
+#
+# JOBS allow you to collect values from multiple sources.
+# Each source will have its own set of charts.
+#
+# JOB parameters have to be indented (using spaces only, example below).
+
+# ----------------------------------------------------------------------
+# Global Variables
+# These variables set the defaults for all JOBs, however each JOB
+# may define its own, overriding the defaults.
+
+# update_every sets the default data collection frequency.
+# If unset, the python.d.plugin default is used.
+# update_every: 1
+
+# priority controls the order of charts at the netdata dashboard.
+# Lower numbers move the charts towards the top of the page.
+# If unset, the default for python.d.plugin is used.
+# priority: 60000
+
+# retries sets the number of retries to be made in case of failures.
+# If unset, the default for python.d.plugin is used.
+# Attempts to restore the service are made once every update_every
+# and only if the module has collected values in the past.
+# retries: 60
+
+# autodetection_retry sets the job re-check interval in seconds.
+# The job is not deleted if check fails.
+# Attempts to start the job are made once every autodetection_retry.
+# This feature is disabled by default.
+# autodetection_retry: 0
+
+# ----------------------------------------------------------------------
+# JOBS (data collection sources)
+#
+# The default JOBS share the same *name*. JOBS with the same name
+# are mutually exclusive. Only one of them will be allowed running at
+# any time. This allows autodetection to try several alternatives and
+# pick the one that works.
+#
+# Any number of jobs is supported.
+#
+# All python.d.plugin JOBS (for all its modules) support a set of
+# predefined parameters. These are:
+#
+# job_name:
+# name: myname # the JOB's name as it will appear at the
+# # dashboard (by default is the job_name)
+# # JOBs sharing a name are mutually exclusive
+# update_every: 1 # the JOB's data collection frequency
+# priority: 60000 # the JOB's order on the dashboard
+# retries: 60 # the JOB's number of restoration attempts
+# autodetection_retry: 0 # the JOB's re-check interval in seconds
+#
+# In addition to the above, nvidia_smi also supports the following:
+#
+# poll_seconds: SECONDS # default is 1. Sets how often, in seconds, the nvidia-smi tool is polled.
+#
+# ----------------------------------------------------------------------