summary | refs | log | tree | commit | diff | stats
path: root/src/collectors/python.d.plugin/nvidia_smi
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-05-05 11:19:16 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-07-24 09:53:24 +0000
commit: b5f8ee61a7f7e9bd291dd26b0585d03eb686c941 (patch)
tree: d4d31289c39fc00da064a825df13a0b98ce95b10 /src/collectors/python.d.plugin/nvidia_smi
parent: Adding upstream version 1.44.3. (diff)
download: netdata-upstream.tar.xz
netdata-upstream.zip
Adding upstream version 1.46.3.upstream
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- src/collectors/python.d.plugin/nvidia_smi/README.md 81
-rw-r--r-- src/collectors/python.d.plugin/nvidia_smi/metadata.yaml 166
-rw-r--r-- src/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py (renamed from collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py) 0
-rw-r--r-- src/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf (renamed from collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf) 0
4 files changed, 247 insertions, 0 deletions
diff --git a/src/collectors/python.d.plugin/nvidia_smi/README.md b/src/collectors/python.d.plugin/nvidia_smi/README.md
new file mode 100644
index 000000000..240b65af3
--- /dev/null
+++ b/src/collectors/python.d.plugin/nvidia_smi/README.md
@@ -0,0 +1,81 @@
+<!--
+title: "Nvidia GPU monitoring with Netdata"
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/collectors/python.d.plugin/nvidia_smi/README.md"
+sidebar_label: "nvidia_smi-python.d.plugin"
+learn_status: "Published"
+learn_topic_type: "References"
+learn_rel_path: "Integrations/Monitor/Devices"
+-->
+
+# Nvidia GPU collector
+
+Monitors performance metrics (memory usage, fan speed, PCIe bandwidth utilization, temperature, etc.) using the `nvidia-smi` CLI tool.
+
+## Requirements
+
+- The `nvidia-smi` tool must be installed, and your NVIDIA GPU(s) must support it. Support is mostly limited to the newer high-end models used for AI/ML and crypto, or the Pro range; read more about [nvidia_smi](https://developer.nvidia.com/nvidia-system-management-interface).
+- Enable this plugin, as it's disabled by default due to minor performance issues:
+ ```bash
+ cd /etc/netdata # Replace this path with your Netdata config directory, if different
+ sudo ./edit-config python.d.conf
+ ```
+ Remove the '#' before nvidia_smi so it reads: `nvidia_smi: yes`.
+- On some systems, when the GPU is idle the `nvidia-smi` tool unloads, which adds latency the next time it is queried. If you are running GPUs under constant workload, this isn't likely to be an issue.
+
+If using Docker, see [Netdata Docker container with NVIDIA GPUs monitoring](https://github.com/netdata/netdata/tree/master/packaging/docker#with-nvidia-gpus-monitoring).
+
+## Charts
+
+It produces the following charts:
+
+- PCI Express Bandwidth Utilization in `KiB/s`
+- Fan Speed in `percentage`
+- GPU Utilization in `percentage`
+- Memory Bandwidth Utilization in `percentage`
+- Encoder/Decoder Utilization in `percentage`
+- Memory Usage in `MiB`
+- Temperature in `celsius`
+- Clock Frequencies in `MHz`
+- Power Utilization in `Watts`
+- Memory Used by Each Process in `MiB`
+- Memory Used by Each User in `MiB`
+- Number of Users on GPU in `num`
+
+## Configuration
+
+Edit the `python.d/nvidia_smi.conf` configuration file using `edit-config` from the Netdata [config
+directory](/docs/netdata-agent/configuration/README.md), which is typically at `/etc/netdata`.
+
+```bash
+cd /etc/netdata # Replace this path with your Netdata config directory, if different
+sudo ./edit-config python.d/nvidia_smi.conf
+```
+
+Sample:
+
+```yaml
+loop_mode : yes
+poll_seconds : 1
+exclude_zero_memory_users : yes
+```
+
+
+### Troubleshooting
+
+To troubleshoot issues with the `nvidia_smi` module, run the `python.d.plugin` with the debug option enabled. The
+output will show the results of the data collection job or error messages explaining why the collector isn't working.
+
+First, navigate to your plugins directory, which is usually located at `/usr/libexec/netdata/plugins.d/`. If that's
+not the case on your system, open `netdata.conf` and look for the setting `plugins directory`. Once you're in the
+plugins directory, switch to the `netdata` user.
+
+```bash
+cd /usr/libexec/netdata/plugins.d/
+sudo su -s /bin/bash netdata
+```
+
+Now you can manually run the `nvidia_smi` module in debug mode:
+
+```bash
+./python.d.plugin nvidia_smi debug trace
+```
diff --git a/src/collectors/python.d.plugin/nvidia_smi/metadata.yaml b/src/collectors/python.d.plugin/nvidia_smi/metadata.yaml
new file mode 100644
index 000000000..0b049d31b
--- /dev/null
+++ b/src/collectors/python.d.plugin/nvidia_smi/metadata.yaml
@@ -0,0 +1,166 @@
+# This collector will not appear in documentation, as the go version is preferred,
+# /src/go/collectors/go.d.plugin/modules/nvidia_smi/README.md
+#
+# meta:
+# plugin_name: python.d.plugin
+# module_name: nvidia_smi
+# monitored_instance:
+# name: python.d nvidia_smi
+# link: ''
+# categories: []
+# icon_filename: ''
+# related_resources:
+# integrations:
+# list: []
+# info_provided_to_referring_integrations:
+# description: ''
+# keywords: []
+# most_popular: false
+# overview:
+# data_collection:
+# metrics_description: ''
+# method_description: ''
+# supported_platforms:
+# include: []
+# exclude: []
+# multi_instance: true
+# additional_permissions:
+# description: ''
+# default_behavior:
+# auto_detection:
+# description: ''
+# limits:
+# description: ''
+# performance_impact:
+# description: ''
+# setup:
+# prerequisites:
+# list: []
+# configuration:
+# file:
+# name: ''
+# description: ''
+# options:
+# description: ''
+# folding:
+# title: ''
+# enabled: true
+# list: []
+# examples:
+# folding:
+# enabled: true
+# title: ''
+# list: []
+# troubleshooting:
+# problems:
+# list: []
+# alerts: []
+# metrics:
+# folding:
+# title: Metrics
+# enabled: false
+# description: ""
+# availability: []
+# scopes:
+# - name: GPU
+# description: ""
+# labels: []
+# metrics:
+# - name: nvidia_smi.pci_bandwidth
+# description: PCI Express Bandwidth Utilization
+# unit: "KiB/s"
+# chart_type: area
+# dimensions:
+# - name: rx
+# - name: tx
+# - name: nvidia_smi.pci_bandwidth_percent
+# description: PCI Express Bandwidth Percent
+# unit: "percentage"
+# chart_type: area
+# dimensions:
+# - name: rx_percent
+# - name: tx_percent
+# - name: nvidia_smi.fan_speed
+# description: Fan Speed
+# unit: "percentage"
+# chart_type: line
+# dimensions:
+# - name: speed
+# - name: nvidia_smi.gpu_utilization
+# description: GPU Utilization
+# unit: "percentage"
+# chart_type: line
+# dimensions:
+# - name: utilization
+# - name: nvidia_smi.mem_utilization
+# description: Memory Bandwidth Utilization
+# unit: "percentage"
+# chart_type: line
+# dimensions:
+# - name: utilization
+# - name: nvidia_smi.encoder_utilization
+# description: Encoder/Decoder Utilization
+# unit: "percentage"
+# chart_type: line
+# dimensions:
+# - name: encoder
+# - name: decoder
+# - name: nvidia_smi.memory_allocated
+# description: Memory Usage
+# unit: "MiB"
+# chart_type: stacked
+# dimensions:
+# - name: free
+# - name: used
+# - name: nvidia_smi.bar1_memory_usage
+# description: Bar1 Memory Usage
+# unit: "MiB"
+# chart_type: stacked
+# dimensions:
+# - name: free
+# - name: used
+# - name: nvidia_smi.temperature
+# description: Temperature
+# unit: "celsius"
+# chart_type: line
+# dimensions:
+# - name: temp
+# - name: nvidia_smi.clocks
+# description: Clock Frequencies
+# unit: "MHz"
+# chart_type: line
+# dimensions:
+# - name: graphics
+# - name: video
+# - name: sm
+# - name: mem
+# - name: nvidia_smi.power
+# description: Power Utilization
+# unit: "Watts"
+# chart_type: line
+# dimensions:
+# - name: power
+# - name: nvidia_smi.power_state
+# description: Power State
+# unit: "state"
+# chart_type: line
+# dimensions:
+# - name: a dimension per {power_state}
+# - name: nvidia_smi.processes_mem
+# description: Memory Used by Each Process
+# unit: "MiB"
+# chart_type: stacked
+# dimensions:
+# - name: a dimension per process
+# - name: nvidia_smi.user_mem
+# description: Memory Used by Each User
+# unit: "MiB"
+# chart_type: stacked
+# dimensions:
+# - name: a dimension per user
+# - name: nvidia_smi.user_num
+# description: Number of User on GPU
+# unit: "num"
+# chart_type: line
+# dimensions:
+# - name: users
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py b/src/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
index 556a61435..556a61435 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
+++ b/src/collectors/python.d.plugin/nvidia_smi/nvidia_smi.chart.py
diff --git a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf b/src/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
index 3d2a30d41..3d2a30d41 100644
--- a/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf
+++ b/src/collectors/python.d.plugin/nvidia_smi/nvidia_smi.conf