diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-05 12:08:03 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-05-05 12:08:18 +0000 |
commit | 5da14042f70711ea5cf66e034699730335462f66 (patch) | |
tree | 0f6354ccac934ed87a2d555f45be4c831cf92f4a /src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml | |
parent | Releasing debian version 1.44.3-2. (diff) | |
download | netdata-5da14042f70711ea5cf66e034699730335462f66.tar.xz netdata-5da14042f70711ea5cf66e034699730335462f66.zip |
Merging upstream version 1.45.3+dfsg.
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to '')
-rw-r--r-- | src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml | 296 |
1 files changed, 296 insertions, 0 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml b/src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml new file mode 100644 index 000000000..e18370baf --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvidia_smi/metadata.yaml @@ -0,0 +1,296 @@ +plugin_name: go.d.plugin +modules: + - meta: + id: collector-go.d.plugin-nvidia_smi + plugin_name: go.d.plugin + module_name: nvidia_smi + monitored_instance: + name: Nvidia GPU + link: https://www.nvidia.com/en-us/ + icon_filename: nvidia.svg + categories: + - data-collection.hardware-devices-and-sensors + keywords: + - nvidia + - gpu + - hardware + related_resources: + integrations: + list: [] + info_provided_to_referring_integrations: + description: "" + most_popular: false + overview: + data_collection: + metrics_description: | + This collector monitors GPUs performance metrics using + the [nvidia-smi](https://developer.nvidia.com/nvidia-system-management-interface) CLI tool. + + > **Warning**: under development, [loop mode](https://github.com/netdata/netdata/issues/14522) not implemented yet. + method_description: "" + supported_platforms: + include: [] + exclude: [] + multi_instance: true + additional_permissions: + description: "" + default_behavior: + auto_detection: + description: "" + limits: + description: "" + performance_impact: + description: "" + setup: + prerequisites: + list: + - title: Enable in go.d.conf. + description: | + This collector is disabled by default. You need to explicitly enable it in the `go.d.conf` file. + configuration: + file: + name: go.d/nvidia_smi.conf + options: + description: | + The following options can be defined globally: update_every, autodetection_retry. + folding: + title: Config options + enabled: true + list: + - name: update_every + description: Data collection frequency. + default_value: 10 + required: false + - name: autodetection_retry + description: Recheck interval in seconds. Zero means no recheck will be scheduled. + default_value: 0 + required: false + - name: binary_path + description: Path to nvidia_smi binary. The default is "nvidia_smi" and the executable is looked for in the directories specified in the PATH environment variable. + default_value: nvidia_smi + required: false + - name: timeout + description: nvidia_smi binary execution timeout. + default_value: 2 + required: false + - name: use_csv_format + description: Used format when requesting GPU information. XML is used if set to 'no'. + default_value: true + required: false + details: | + This module supports data collection in CSV and XML formats. The default is CSV. + + - XML provides more metrics, but requesting GPU information consumes more CPU, especially if there are multiple GPUs in the system. + - CSV provides fewer metrics, but is much lighter than XML in terms of CPU usage. + examples: + folding: + title: Config + enabled: true + list: + - name: XML format + description: Use XML format when requesting GPU information. + config: | + jobs: + - name: nvidia_smi + use_csv_format: no + - name: Custom binary path + description: The executable is not in the directories specified in the PATH environment variable. + config: | + jobs: + - name: nvidia_smi + binary_path: /usr/local/sbin/nvidia_smi + troubleshooting: + problems: + list: [] + alerts: [] + metrics: + folding: + title: Metrics + enabled: false + description: "" + availability: + - XML + - CSV + scopes: + - name: gpu + description: These metrics refer to the GPU. + labels: + - name: uuid + description: GPU id (e.g. 00000000:00:04.0) + - name: product_name + description: GPU product name (e.g. NVIDIA A100-SXM4-40GB) + metrics: + - name: nvidia_smi.gpu_pcie_bandwidth_usage + availability: + - XML + description: PCI Express Bandwidth Usage + unit: B/s + chart_type: line + dimensions: + - name: rx + - name: tx + - name: nvidia_smi.gpu_pcie_bandwidth_utilization + availability: + - XML + description: PCI Express Bandwidth Utilization + unit: '%' + chart_type: line + dimensions: + - name: rx + - name: tx + - name: nvidia_smi.gpu_fan_speed_perc + availability: + - XML + - CSV + description: Fan speed + unit: '%' + chart_type: line + dimensions: + - name: fan_speed + - name: nvidia_smi.gpu_utilization + availability: + - XML + - CSV + description: GPU utilization + unit: '%' + chart_type: line + dimensions: + - name: gpu + - name: nvidia_smi.gpu_memory_utilization + availability: + - XML + - CSV + description: Memory utilization + unit: '%' + chart_type: line + dimensions: + - name: memory + - name: nvidia_smi.gpu_decoder_utilization + availability: + - XML + description: Decoder utilization + unit: '%' + chart_type: line + dimensions: + - name: decoder + - name: nvidia_smi.gpu_encoder_utilization + availability: + - XML + description: Encoder utilization + unit: '%' + chart_type: line + dimensions: + - name: encoder + - name: nvidia_smi.gpu_frame_buffer_memory_usage + availability: + - XML + - CSV + description: Frame buffer memory usage + unit: B + chart_type: stacked + dimensions: + - name: free + - name: used + - name: reserved + - name: nvidia_smi.gpu_bar1_memory_usage + availability: + - XML + description: BAR1 memory usage + unit: B + chart_type: stacked + dimensions: + - name: free + - name: used + - name: nvidia_smi.gpu_temperature + availability: + - XML + - CSV + description: Temperature + unit: Celsius + chart_type: line + dimensions: + - name: temperature + - name: nvidia_smi.gpu_voltage + availability: + - XML + description: Voltage + unit: V + chart_type: line + dimensions: + - name: voltage + - name: nvidia_smi.gpu_clock_freq + availability: + - XML + - CSV + description: Clock current frequency + unit: MHz + chart_type: line + dimensions: + - name: graphics + - name: video + - name: sm + - name: mem + - name: nvidia_smi.gpu_power_draw + availability: + - XML + - CSV + description: Power draw + unit: Watts + chart_type: line + dimensions: + - name: power_draw + - name: nvidia_smi.gpu_performance_state + availability: + - XML + - CSV + description: Performance state + unit: state + chart_type: line + dimensions: + - name: P0-P15 + - name: nvidia_smi.gpu_mig_mode_current_status + availability: + - XML + description: MIG current mode + unit: status + chart_type: line + dimensions: + - name: enabled + - name: disabled + - name: nvidia_smi.gpu_mig_devices_count + availability: + - XML + description: MIG devices + unit: devices + chart_type: line + dimensions: + - name: mig + - name: mig + description: These metrics refer to the Multi-Instance GPU (MIG). + labels: + - name: uuid + description: GPU id (e.g. 00000000:00:04.0) + - name: product_name + description: GPU product name (e.g. NVIDIA A100-SXM4-40GB) + - name: gpu_instance_id + description: GPU instance id (e.g. 1) + metrics: + - name: nvidia_smi.gpu_mig_frame_buffer_memory_usage + availability: + - XML + description: Frame buffer memory usage + unit: B + chart_type: stacked + dimensions: + - name: free + - name: used + - name: reserved + - name: nvidia_smi.gpu_mig_bar1_memory_usage + availability: + - XML + description: BAR1 memory usage + unit: B + chart_type: stacked + dimensions: + - name: free + - name: used |