diff options
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r-- | collectors/python.d.plugin/nvidia_smi/README.md | 2 | ||||
-rw-r--r-- | collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 27 |
2 files changed, 25 insertions, 4 deletions
diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md index fb613064c..bb4169441 100644 --- a/collectors/python.d.plugin/nvidia_smi/README.md +++ b/collectors/python.d.plugin/nvidia_smi/README.md @@ -8,6 +8,8 @@ sidebar_label: "Nvidia GPUs" Monitors performance metrics (memory usage, fan speed, pcie bandwidth utilization, temperature, etc.) using `nvidia-smi` cli tool. +> **Warning**: this collector does not work when the Netdata Agent is [running in a container](https://learn.netdata.cloud/docs/agent/packaging/docker). + ## Requirements and Notes diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py index 1913e94e4..23e90e658 100644 --- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py +++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py @@ -4,11 +4,10 @@ # Author: Ilya Mashchenko (ilyam8) # User Memory Stat Author: Guido Scatena (scatenag) -import subprocess -import threading import os import pwd - +import subprocess +import threading import xml.etree.ElementTree as et from bases.FrameworkServices.SimpleService import SimpleService @@ -32,6 +31,7 @@ BAR_USAGE = 'bar1_mem_usage' TEMPERATURE = 'temperature' CLOCKS = 'clocks' POWER = 'power' +POWER_STATE = 'power_state' PROCESSES_MEM = 'processes_mem' USER_MEM = 'user_mem' USER_NUM = 'user_num' @@ -47,11 +47,15 @@ ORDER = [ TEMPERATURE, CLOCKS, POWER, + POWER_STATE, PROCESSES_MEM, USER_MEM, USER_NUM, ] +# https://docs.nvidia.com/gameworks/content/gameworkslibrary/coresdk/nvapi/group__gpupstate.html +POWER_STATES = ['P' + str(i) for i in range(0, 16)] + def gpu_charts(gpu): fam = gpu.full_name() @@ -125,6 +129,10 @@ def gpu_charts(gpu): ['power_draw', 'power', 'absolute', 1, 100], ] }, + POWER_STATE: { + 'options': [None, 'Power State', 'state', fam, 'nvidia_smi.power_state', 'line'], + 'lines': [['power_state_' + v.lower(), v, 'absolute'] for v in POWER_STATES] + }, PROCESSES_MEM: { 'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'], 'lines': [] @@ -382,6 +390,10 @@ class GPU: def mem_clock(self): return self.root.find('clocks').find('mem_clock').text.split()[0] + @handle_attr_error + def power_state(self): + return str(self.root.find('power_readings').find('power_state').text.split()[0]) + @handle_value_error @handle_attr_error def power_draw(self): @@ -426,6 +438,13 @@ class GPU: 'mem_clock': self.mem_clock(), 'power_draw': self.power_draw(), } + + for v in POWER_STATES: + data['power_state_' + v.lower()] = 0 + p_state = self.power_state() + if p_state: + data['power_state_' + p_state.lower()] = 1 + processes = self.processes() or [] users = set() for p in processes: @@ -450,7 +469,7 @@ class Service(SimpleService): self.order = list() self.definitions = dict() self.loop_mode = configuration.get('loop_mode', True) - poll = int(configuration.get('poll_seconds', 1)) + poll = int(configuration.get('poll_seconds', self.get_update_every())) self.exclude_zero_memory_users = configuration.get('exclude_zero_memory_users', False) self.poller = NvidiaSMIPoller(poll) |