summaryrefslogtreecommitdiffstats
path: root/src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml296
1 files changed, 296 insertions, 0 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml b/src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml
new file mode 100644
index 000000000..e18370baf
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml
@@ -0,0 +1,296 @@
+plugin_name: go.d.plugin
+modules:
+ - meta:
+ id: collector-go.d.plugin-nvidia_smi
+ plugin_name: go.d.plugin
+ module_name: nvidia_smi
+ monitored_instance:
+ name: Nvidia GPU
+ link: https://www.nvidia.com/en-us/
+ icon_filename: nvidia.svg
+ categories:
+ - data-collection.hardware-devices-and-sensors
+ keywords:
+ - nvidia
+ - gpu
+ - hardware
+ related_resources:
+ integrations:
+ list: []
+ info_provided_to_referring_integrations:
+ description: ""
+ most_popular: false
+ overview:
+ data_collection:
+ metrics_description: |
+ This collector monitors GPUs performance metrics using
+ the [nvidia-smi](https://developer.nvidia.com/nvidia-system-management-interface) CLI tool.
+
+ > **Warning**: under development, [loop mode](https://github.com/netdata/netdata/issues/14522) not implemented yet.
+ method_description: ""
+ supported_platforms:
+ include: []
+ exclude: []
+ multi_instance: true
+ additional_permissions:
+ description: ""
+ default_behavior:
+ auto_detection:
+ description: ""
+ limits:
+ description: ""
+ performance_impact:
+ description: ""
+ setup:
+ prerequisites:
+ list:
+ - title: Enable in go.d.conf.
+ description: |
+ This collector is disabled by default. You need to explicitly enable it in the `go.d.conf` file.
+ configuration:
+ file:
+ name: go.d/nvidia_smi.conf
+ options:
+ description: |
+ The following options can be defined globally: update_every, autodetection_retry.
+ folding:
+ title: Config options
+ enabled: true
+ list:
+ - name: update_every
+ description: Data collection frequency.
+ default_value: 10
+ required: false
+ - name: autodetection_retry
+ description: Recheck interval in seconds. Zero means no recheck will be scheduled.
+ default_value: 0
+ required: false
+ - name: binary_path
+ description: Path to nvidia_smi binary. The default is "nvidia_smi" and the executable is looked for in the directories specified in the PATH environment variable.
+ default_value: nvidia_smi
+ required: false
+ - name: timeout
+ description: nvidia_smi binary execution timeout.
+ default_value: 2
+ required: false
+ - name: use_csv_format
+ description: Used format when requesting GPU information. XML is used if set to 'no'.
+ default_value: true
+ required: false
+ details: |
+ This module supports data collection in CSV and XML formats. The default is CSV.
+
+ - XML provides more metrics, but requesting GPU information consumes more CPU, especially if there are multiple GPUs in the system.
+ - CSV provides fewer metrics, but is much lighter than XML in terms of CPU usage.
+ examples:
+ folding:
+ title: Config
+ enabled: true
+ list:
+ - name: XML format
+ description: Use XML format when requesting GPU information.
+ config: |
+ jobs:
+ - name: nvidia_smi
+ use_csv_format: no
+ - name: Custom binary path
+ description: The executable is not in the directories specified in the PATH environment variable.
+ config: |
+ jobs:
+ - name: nvidia_smi
+ binary_path: /usr/local/sbin/nvidia_smi
+ troubleshooting:
+ problems:
+ list: []
+ alerts: []
+ metrics:
+ folding:
+ title: Metrics
+ enabled: false
+ description: ""
+ availability:
+ - XML
+ - CSV
+ scopes:
+ - name: gpu
+ description: These metrics refer to the GPU.
+ labels:
+ - name: uuid
+ description: GPU id (e.g. 00000000:00:04.0)
+ - name: product_name
+ description: GPU product name (e.g. NVIDIA A100-SXM4-40GB)
+ metrics:
+ - name: nvidia_smi.gpu_pcie_bandwidth_usage
+ availability:
+ - XML
+ description: PCI Express Bandwidth Usage
+ unit: B/s
+ chart_type: line
+ dimensions:
+ - name: rx
+ - name: tx
+ - name: nvidia_smi.gpu_pcie_bandwidth_utilization
+ availability:
+ - XML
+ description: PCI Express Bandwidth Utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: rx
+ - name: tx
+ - name: nvidia_smi.gpu_fan_speed_perc
+ availability:
+ - XML
+ - CSV
+ description: Fan speed
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: fan_speed
+ - name: nvidia_smi.gpu_utilization
+ availability:
+ - XML
+ - CSV
+ description: GPU utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: gpu
+ - name: nvidia_smi.gpu_memory_utilization
+ availability:
+ - XML
+ - CSV
+ description: Memory utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: memory
+ - name: nvidia_smi.gpu_decoder_utilization
+ availability:
+ - XML
+ description: Decoder utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: decoder
+ - name: nvidia_smi.gpu_encoder_utilization
+ availability:
+ - XML
+ description: Encoder utilization
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: encoder
+ - name: nvidia_smi.gpu_frame_buffer_memory_usage
+ availability:
+ - XML
+ - CSV
+ description: Frame buffer memory usage
+ unit: B
+ chart_type: stacked
+ dimensions:
+ - name: free
+ - name: used
+ - name: reserved
+ - name: nvidia_smi.gpu_bar1_memory_usage
+ availability:
+ - XML
+ description: BAR1 memory usage
+ unit: B
+ chart_type: stacked
+ dimensions:
+ - name: free
+ - name: used
+ - name: nvidia_smi.gpu_temperature
+ availability:
+ - XML
+ - CSV
+ description: Temperature
+ unit: Celsius
+ chart_type: line
+ dimensions:
+ - name: temperature
+ - name: nvidia_smi.gpu_voltage
+ availability:
+ - XML
+ description: Voltage
+ unit: V
+ chart_type: line
+ dimensions:
+ - name: voltage
+ - name: nvidia_smi.gpu_clock_freq
+ availability:
+ - XML
+ - CSV
+ description: Clock current frequency
+ unit: MHz
+ chart_type: line
+ dimensions:
+ - name: graphics
+ - name: video
+ - name: sm
+ - name: mem
+ - name: nvidia_smi.gpu_power_draw
+ availability:
+ - XML
+ - CSV
+ description: Power draw
+ unit: Watts
+ chart_type: line
+ dimensions:
+ - name: power_draw
+ - name: nvidia_smi.gpu_performance_state
+ availability:
+ - XML
+ - CSV
+ description: Performance state
+ unit: state
+ chart_type: line
+ dimensions:
+ - name: P0-P15
+ - name: nvidia_smi.gpu_mig_mode_current_status
+ availability:
+ - XML
+ description: MIG current mode
+ unit: status
+ chart_type: line
+ dimensions:
+ - name: enabled
+ - name: disabled
+ - name: nvidia_smi.gpu_mig_devices_count
+ availability:
+ - XML
+ description: MIG devices
+ unit: devices
+ chart_type: line
+ dimensions:
+ - name: mig
+ - name: mig
+ description: These metrics refer to the Multi-Instance GPU (MIG).
+ labels:
+ - name: uuid
+ description: GPU id (e.g. 00000000:00:04.0)
+ - name: product_name
+ description: GPU product name (e.g. NVIDIA A100-SXM4-40GB)
+ - name: gpu_instance_id
+ description: GPU instance id (e.g. 1)
+ metrics:
+ - name: nvidia_smi.gpu_mig_frame_buffer_memory_usage
+ availability:
+ - XML
+ description: Frame buffer memory usage
+ unit: B
+ chart_type: stacked
+ dimensions:
+ - name: free
+ - name: used
+ - name: reserved
+ - name: nvidia_smi.gpu_mig_bar1_memory_usage
+ availability:
+ - XML
+ description: BAR1 memory usage
+ unit: B
+ chart_type: stacked
+ dimensions:
+ - name: free
+ - name: used