summary | refs | log | tree | commit | diff | stats
path: root/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
diff options
context:
space:
mode:
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py')
-rw-r--r-- collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 68
1 files changed, 57 insertions, 11 deletions
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 4dc67f133..f7b7020e0 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -26,10 +26,11 @@ FAN_SPEED = 'fan_speed'
GPU_UTIL = 'gpu_utilization'
MEM_UTIL = 'mem_utilization'
ENCODER_UTIL = 'encoder_utilization'
-MEM_ALLOCATED = 'mem_allocated'
+MEM_USAGE = 'mem_usage'
TEMPERATURE = 'temperature'
CLOCKS = 'clocks'
POWER = 'power'
+PROCESSES_MEM = 'processes_mem'
ORDER = [
PCI_BANDWIDTH,
@@ -37,10 +38,11 @@ ORDER = [
GPU_UTIL,
MEM_UTIL,
ENCODER_UTIL,
- MEM_ALLOCATED,
+ MEM_USAGE,
TEMPERATURE,
CLOCKS,
POWER,
+ PROCESSES_MEM,
]
@@ -80,10 +82,11 @@ def gpu_charts(gpu):
['decoder_util', 'decoder'],
]
},
- MEM_ALLOCATED: {
- 'options': [None, 'Memory Allocated', 'MiB', fam, 'nvidia_smi.memory_allocated', 'line'],
+ MEM_USAGE: {
+ 'options': [None, 'Memory Usage', 'MiB', fam, 'nvidia_smi.memory_allocated', 'stacked'],
'lines': [
- ['fb_memory_usage', 'used'],
+ ['fb_memory_free', 'free'],
+ ['fb_memory_used', 'used'],
]
},
TEMPERATURE: {
@@ -107,6 +110,10 @@ def gpu_charts(gpu):
['power_draw', 'power', 1, 100],
]
},
+ PROCESSES_MEM: {
+ 'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
+ 'lines': []
+ },
}
idx = gpu.num
@@ -260,10 +267,14 @@ class GPU:
return self.root.find('utilization').find('decoder_util').text.split()[0]
@handle_attr_error
- def fb_memory_usage(self):
+ def fb_memory_used(self):
return self.root.find('fb_memory_usage').find('used').text.split()[0]
@handle_attr_error
+ def fb_memory_free(self):
+ return self.root.find('fb_memory_usage').find('free').text.split()[0]
+
+ @handle_attr_error
def temperature(self):
return self.root.find('temperature').find('gpu_temp').text.split()[0]
@@ -288,6 +299,18 @@ class GPU:
def power_draw(self):
return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100
+ @handle_attr_error
+ def processes(self):
+ p_nodes = self.root.find('processes').findall('process_info')
+ ps = []
+ for p in p_nodes:
+ ps.append({
+ 'pid': p.find('pid').text,
+ 'process_name': p.find('process_name').text,
+ 'used_memory': int(p.find('used_memory').text.split()[0]),
+ })
+ return ps
+
def data(self):
data = {
'rx_util': self.rx_util(),
@@ -297,7 +320,8 @@ class GPU:
'memory_util': self.memory_util(),
'encoder_util': self.encoder_util(),
'decoder_util': self.decoder_util(),
- 'fb_memory_usage': self.fb_memory_usage(),
+ 'fb_memory_used': self.fb_memory_used(),
+ 'fb_memory_free': self.fb_memory_free(),
'gpu_temp': self.temperature(),
'graphics_clock': self.graphics_clock(),
'video_clock': self.video_clock(),
@@ -305,12 +329,13 @@ class GPU:
'mem_clock': self.mem_clock(),
'power_draw': self.power_draw(),
}
+ processes = self.processes() or []
+ data.update({'process_mem_{0}'.format(p['pid']): p['used_memory'] for p in processes})
return dict(
('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE
)
-
class Service(SimpleService):
def __init__(self, configuration=None, name=None):
super(Service, self).__init__(configuration=configuration, name=name)
@@ -320,11 +345,16 @@ class Service(SimpleService):
self.poller = NvidiaSMIPoller(poll)
def get_data(self):
+ if not self.poller.is_started():
+ self.poller.start()
+
if not self.poller.is_alive():
self.debug('poller is off')
return None
last_data = self.poller.data()
+ if not last_data:
+ return None
parsed = self.parse_xml(last_data)
if parsed is None:
@@ -332,10 +362,27 @@ class Service(SimpleService):
data = dict()
for idx, root in enumerate(parsed.findall('gpu')):
- data.update(GPU(idx, root).data())
+ gpu = GPU(idx, root)
+ data.update(gpu.data())
+ self.update_processes_mem_chart(gpu)
return data or None
+ def update_processes_mem_chart(self, gpu):
+ ps = gpu.processes()
+ if not ps:
+ return
+ chart = self.charts['gpu{0}_{1}'.format(gpu.num, PROCESSES_MEM)]
+ active_dim_ids = []
+ for p in ps:
+ dim_id = 'gpu{0}_process_mem_{1}'.format(gpu.num, p['pid'])
+ active_dim_ids.append(dim_id)
+ if dim_id not in chart:
+ chart.add_dimension([dim_id, '{0} {1}'.format(p['pid'], p['process_name'])])
+ for dim in chart:
+ if dim.id not in active_dim_ids:
+ chart.del_dimension(dim.id, hide=False)
+
def check(self):
if not self.poller.has_smi():
self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
@@ -355,7 +402,6 @@ class Service(SimpleService):
return False
self.create_charts(gpus)
- self.poller.start()
return True
@@ -363,7 +409,7 @@ class Service(SimpleService):
try:
return et.fromstring(data)
except et.ParseError as error:
- self.error(error)
+ self.error('xml parse failed: "{0}", error: {1}'.format(data, error))
return None