From caf1a5281f9e974ba73ceded3a782db3d0142c5f Mon Sep 17 00:00:00 2001
From: Daniel Baumann
Date: Fri, 26 Apr 2019 18:22:55 +0200
Subject: Merging upstream version 1.14.0.

Signed-off-by: Daniel Baumann
---
 .../python.d.plugin/nvidia_smi/nvidia_smi.chart.py | 68 ++++++++++++++++++----
 1 file changed, 57 insertions(+), 11 deletions(-)

(limited to 'collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py')

diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 4dc67f133..f7b7020e0 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -26,10 +26,11 @@ FAN_SPEED = 'fan_speed'
 GPU_UTIL = 'gpu_utilization'
 MEM_UTIL = 'mem_utilization'
 ENCODER_UTIL = 'encoder_utilization'
-MEM_ALLOCATED = 'mem_allocated'
+MEM_USAGE = 'mem_usage'
 TEMPERATURE = 'temperature'
 CLOCKS = 'clocks'
 POWER = 'power'
+PROCESSES_MEM = 'processes_mem'
 
 ORDER = [
     PCI_BANDWIDTH,
@@ -37,10 +38,11 @@ ORDER = [
     GPU_UTIL,
     MEM_UTIL,
     ENCODER_UTIL,
-    MEM_ALLOCATED,
+    MEM_USAGE,
     TEMPERATURE,
     CLOCKS,
     POWER,
+    PROCESSES_MEM,
 ]
 
 
@@ -80,10 +82,11 @@ def gpu_charts(gpu):
                 ['decoder_util', 'decoder'],
             ]
         },
-        MEM_ALLOCATED: {
-            'options': [None, 'Memory Allocated', 'MiB', fam, 'nvidia_smi.memory_allocated', 'line'],
+        MEM_USAGE: {
+            'options': [None, 'Memory Usage', 'MiB', fam, 'nvidia_smi.memory_allocated', 'stacked'],
             'lines': [
-                ['fb_memory_usage', 'used'],
+                ['fb_memory_free', 'free'],
+                ['fb_memory_used', 'used'],
             ]
         },
         TEMPERATURE: {
@@ -107,6 +110,10 @@ def gpu_charts(gpu):
                 ['power_draw', 'power', 1, 100],
             ]
         },
+        PROCESSES_MEM: {
+            'options': [None, 'Memory Used by Each Process', 'MiB', fam, 'nvidia_smi.processes_mem', 'stacked'],
+            'lines': []
+        },
     }
 
     idx = gpu.num
@@ -260,9 +267,13 @@ class GPU:
         return self.root.find('utilization').find('decoder_util').text.split()[0]
 
     @handle_attr_error
-    def fb_memory_usage(self):
+    def fb_memory_used(self):
         return self.root.find('fb_memory_usage').find('used').text.split()[0]
 
+    @handle_attr_error
+    def fb_memory_free(self):
+        return self.root.find('fb_memory_usage').find('free').text.split()[0]
+
     @handle_attr_error
     def temperature(self):
         return self.root.find('temperature').find('gpu_temp').text.split()[0]
@@ -288,6 +299,18 @@ class GPU:
     def power_draw(self):
         return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100
 
+    @handle_attr_error
+    def processes(self):
+        p_nodes = self.root.find('processes').findall('process_info')
+        ps = []
+        for p in p_nodes:
+            ps.append({
+                'pid': p.find('pid').text,
+                'process_name': p.find('process_name').text,
+                'used_memory': int(p.find('used_memory').text.split()[0]),
+            })
+        return ps
+
     def data(self):
         data = {
             'rx_util': self.rx_util(),
@@ -297,7 +320,8 @@ class GPU:
             'memory_util': self.memory_util(),
             'encoder_util': self.encoder_util(),
             'decoder_util': self.decoder_util(),
-            'fb_memory_usage': self.fb_memory_usage(),
+            'fb_memory_used': self.fb_memory_used(),
+            'fb_memory_free': self.fb_memory_free(),
             'gpu_temp': self.temperature(),
             'graphics_clock': self.graphics_clock(),
             'video_clock': self.video_clock(),
@@ -305,12 +329,13 @@ class GPU:
             'mem_clock': self.mem_clock(),
             'power_draw': self.power_draw(),
         }
+        processes = self.processes() or []
+        data.update({'process_mem_{0}'.format(p['pid']): p['used_memory'] for p in processes})
 
         return dict(
             ('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE
         )
 
-
 class Service(SimpleService):
     def __init__(self, configuration=None, name=None):
         super(Service, self).__init__(configuration=configuration, name=name)
@@ -320,11 +345,16 @@ class Service(SimpleService):
         self.poller = NvidiaSMIPoller(poll)
 
     def get_data(self):
+        if not self.poller.is_started():
+            self.poller.start()
+
         if not self.poller.is_alive():
             self.debug('poller is off')
             return None
 
         last_data = self.poller.data()
+        if not last_data:
+            return None
 
         parsed = self.parse_xml(last_data)
         if parsed is None:
@@ -332,10 +362,27 @@ class Service(SimpleService):
 
         data = dict()
         for idx, root in enumerate(parsed.findall('gpu')):
-            data.update(GPU(idx, root).data())
+            gpu = GPU(idx, root)
+            data.update(gpu.data())
+            self.update_processes_mem_chart(gpu)
 
         return data or None
 
+    def update_processes_mem_chart(self, gpu):
+        ps = gpu.processes()
+        if not ps:
+            return
+        chart = self.charts['gpu{0}_{1}'.format(gpu.num, PROCESSES_MEM)]
+        active_dim_ids = []
+        for p in ps:
+            dim_id = 'gpu{0}_process_mem_{1}'.format(gpu.num, p['pid'])
+            active_dim_ids.append(dim_id)
+            if dim_id not in chart:
+                chart.add_dimension([dim_id, '{0} {1}'.format(p['pid'], p['process_name'])])
+        for dim in chart:
+            if dim.id not in active_dim_ids:
+                chart.del_dimension(dim.id, hide=False)
+
     def check(self):
         if not self.poller.has_smi():
             self.error("couldn't find '{0}' binary".format(NVIDIA_SMI))
@@ -355,7 +402,6 @@ class Service(SimpleService):
             return False
 
         self.create_charts(gpus)
-        self.poller.start()
 
         return True
 
@@ -363,7 +409,7 @@ class Service(SimpleService):
         try:
             return et.fromstring(data)
         except et.ParseError as error:
-            self.error(error)
+            self.error('xml parse failed: "{0}", error: {1}'.format(data, error))
 
         return None
 
--
cgit v1.2.3