summaryrefslogtreecommitdiffstats
path: root/collectors/python.d.plugin/nvidia_smi
diff options
context:
space:
mode:
Diffstat (limited to 'collectors/python.d.plugin/nvidia_smi')
-rw-r--r--collectors/python.d.plugin/nvidia_smi/README.md3
-rw-r--r--collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py29
-rw-r--r--collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf10
3 files changed, 27 insertions, 15 deletions
diff --git a/collectors/python.d.plugin/nvidia_smi/README.md b/collectors/python.d.plugin/nvidia_smi/README.md
index 06acfc297..48b611951 100644
--- a/collectors/python.d.plugin/nvidia_smi/README.md
+++ b/collectors/python.d.plugin/nvidia_smi/README.md
@@ -36,4 +36,5 @@ Sample:
```yaml
poll_seconds: 1
-``` \ No newline at end of file
+```
+[![analytics](https://www.google-analytics.com/collect?v=1&aip=1&t=pageview&_s=1&ds=github&dr=https%3A%2F%2Fgithub.com%2Fnetdata%2Fnetdata&dl=https%3A%2F%2Fmy-netdata.io%2Fgithub%2Fcollectors%2Fpython.d.plugin%2Fnvidia_smi%2FREADME&_u=MAC~&cid=5792dfd7-8dc4-476b-af31-da2fdb9f93d2&tid=UA-64295674-3)]()
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index c3fff6219..7cb816c0d 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
@@ -15,6 +15,8 @@ disabled_by_default = True
NVIDIA_SMI = 'nvidia-smi'
+BAD_VALUE = 'N/A'
+
EMPTY_ROW = ''
EMPTY_ROW_LIMIT = 500
POLLER_BREAK_ROW = '</nvidia_smi_log>'
@@ -47,39 +49,39 @@ def gpu_charts(gpu):
charts = {
PCI_BANDWIDTH: {
- 'options': [None, 'PCI Express Bandwidth Utilization', 'KB/s', fam, 'nvidia_smi.pci_bandwidth', 'area'],
+ 'options': [None, 'PCI Express Bandwidth Utilization', 'KiB/s', fam, 'nvidia_smi.pci_bandwidth', 'area'],
'lines': [
['rx_util', 'rx', 'absolute', 1, 1],
['tx_util', 'tx', 'absolute', 1, -1],
]
},
FAN_SPEED: {
- 'options': [None, 'Fan Speed', '%', fam, 'nvidia_smi.fan_speed', 'line'],
+ 'options': [None, 'Fan Speed', 'percentage', fam, 'nvidia_smi.fan_speed', 'line'],
'lines': [
['fan_speed', 'speed'],
]
},
GPU_UTIL: {
- 'options': [None, 'GPU Utilization', '%', fam, 'nvidia_smi.gpu_utilization', 'line'],
+ 'options': [None, 'GPU Utilization', 'percentage', fam, 'nvidia_smi.gpu_utilization', 'line'],
'lines': [
['gpu_util', 'utilization'],
]
},
MEM_UTIL: {
- 'options': [None, 'Memory Bandwidth Utilization', '%', fam, 'nvidia_smi.mem_utilization', 'line'],
+ 'options': [None, 'Memory Bandwidth Utilization', 'percentage', fam, 'nvidia_smi.mem_utilization', 'line'],
'lines': [
['memory_util', 'utilization'],
]
},
ENCODER_UTIL: {
- 'options': [None, 'Encoder/Decoder Utilization', '%', fam, 'nvidia_smi.encoder_utilization', 'line'],
+ 'options': [None, 'Encoder/Decoder Utilization', 'percentage', fam, 'nvidia_smi.encoder_utilization', 'line'],
'lines': [
['encoder_util', 'encoder'],
['decoder_util', 'decoder'],
]
},
MEM_ALLOCATED: {
- 'options': [None, 'Memory Allocated', 'MB', fam, 'nvidia_smi.memory_allocated', 'line'],
+ 'options': [None, 'Memory Allocated', 'MiB', fam, 'nvidia_smi.memory_allocated', 'line'],
'lines': [
['fb_memory_usage', 'used'],
]
@@ -206,6 +208,15 @@ def handle_attr_error(method):
return on_call
+def handle_value_error(method):
+ def on_call(*args, **kwargs):
+ try:
+ return method(*args, **kwargs)
+ except ValueError:
+ return None
+ return on_call
+
+
class GPU:
def __init__(self, num, root):
self.num = num
@@ -272,6 +283,7 @@ class GPU:
def mem_clock(self):
return self.root.find('clocks').find('mem_clock').text.split()[0]
+ @handle_value_error
@handle_attr_error
def power_draw(self):
return float(self.root.find('power_readings').find('power_draw').text.split()[0]) * 100
@@ -294,7 +306,9 @@ class GPU:
'power_draw': self.power_draw(),
}
- return dict(('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None)
+ return dict(
+ ('gpu{0}_{1}'.format(self.num, k), v) for k, v in data.items() if v is not None and v != BAD_VALUE
+ )
class Service(SimpleService):
@@ -302,7 +316,6 @@ class Service(SimpleService):
super(Service, self).__init__(configuration=configuration, name=name)
self.order = list()
self.definitions = dict()
-
poll = int(configuration.get('poll_seconds', 1))
self.poller = NvidiaSMIPoller(poll)
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
index e1bcf3faf..53e544a5d 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
+++ b/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
@@ -27,11 +27,9 @@
# If unset, the default for python.d.plugin is used.
# priority: 60000
-# retries sets the number of retries to be made in case of failures.
-# If unset, the default for python.d.plugin is used.
-# Attempts to restore the service are made once every update_every
-# and only if the module has collected values in the past.
-# retries: 60
+# penalty indicates whether to apply penalty to update_every in case of failures.
+# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes.
+# penalty: yes
# autodetection_retry sets the job re-check interval in seconds.
# The job is not deleted if check fails.
@@ -58,7 +56,7 @@
# # JOBs sharing a name are mutually exclusive
# update_every: 1 # the JOB's data collection frequency
# priority: 60000 # the JOB's order on the dashboard
-# retries: 60 # the JOB's number of restoration attempts
+# penalty: yes # the JOB's penalty
# autodetection_retry: 0 # the JOB's re-check interval in seconds
#
# Additionally to the above, example also supports the following: