blob: 71a5be2e7cb2660ed205cfe6bdc022de083d8305 (
plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
|
# Integration metadata for the go.d.plugin "nvme" collector: identification,
# overview, setup instructions, alerts and the per-device metrics it exposes.
plugin_name: go.d.plugin
modules:
  # Identification and catalogue placement of the module.
  - meta:
      id: collector-go.d.plugin-nvme
      plugin_name: go.d.plugin
      module_name: nvme
      monitored_instance:
        name: NVMe devices
        link: ""
        icon_filename: nvme.svg
        categories:
          - data-collection.storage-mount-points-and-filesystems
      keywords:
        - nvme
      related_resources:
        integrations:
          list: []
      info_provided_to_referring_integrations:
        description: ""
      most_popular: false
    # What is collected and how.
    overview:
      data_collection:
        metrics_description: >
          This collector monitors the health of NVMe devices using the command line
          tool [nvme](https://github.com/linux-nvme/nvme-cli#nvme-cli), which can only be run by the root user. It uses `sudo` and
          assumes it is set up so that the netdata user can execute `nvme` as root without a password.
        method_description: ""
      supported_platforms:
        include: []
        exclude: []
      multi_instance: true
      additional_permissions:
        description: ""
      default_behavior:
        auto_detection:
          description: ""
        limits:
          description: ""
        performance_impact:
          description: ""
    # Prerequisites and configuration reference.
    setup:
      prerequisites:
        list:
          - title: Install nvme-cli
            description: |
              See [Distro Support](https://github.com/linux-nvme/nvme-cli#distro-support). Install `nvme-cli` using your distribution's package manager.
          - title: Allow netdata to execute nvme
            description: |
              Add the netdata user to `/etc/sudoers` (use `which nvme` to find the full path to the binary):
              ```bash
              netdata ALL=(root) NOPASSWD: /usr/sbin/nvme
              ```
      configuration:
        file:
          name: go.d/nvme.conf
        options:
          description: |
            The following options can be defined globally: update_every, autodetection_retry.
          folding:
            title: Config options
            enabled: true
          list:
            - name: update_every
              description: Data collection frequency.
              default_value: 10
              required: false
            - name: autodetection_retry
              description: Recheck interval in seconds. Zero means no recheck will be scheduled.
              default_value: 0
              required: false
            - name: binary_path
              description: Path to nvme binary. The default is "nvme" and the executable is looked for in the directories specified in the PATH environment variable.
              default_value: nvme
              required: false
            - name: timeout
              description: nvme binary execution timeout.
              default_value: 2
              required: false
        examples:
          folding:
            title: Config
            enabled: true
          list:
            - name: Custom binary path
              description: The executable is not in the directories specified in the PATH environment variable.
              config: |
                jobs:
                  - name: nvme
                    binary_path: /usr/local/sbin/nvme
    troubleshooting:
      problems:
        list: []
    # Health alerts shipped for this collector.
    alerts:
      - name: nvme_device_critical_warnings_state
        metric: nvme.device_critical_warnings_state
        info: "NVMe device ${label:device} has critical warnings"
        link: https://github.com/netdata/netdata/blob/master/src/health/health.d/nvme.conf
    # Charts, grouped by scope; each entry lists its unit and dimensions.
    metrics:
      folding:
        title: Metrics
        enabled: false
      description: ""
      availability: []
      scopes:
        - name: device
          description: These metrics refer to the NVME device.
          labels:
            - name: device
              description: NVMe device name
          metrics:
            - name: nvme.device_estimated_endurance_perc
              description: Estimated endurance
              unit: '%'
              chart_type: line
              dimensions:
                - name: used
            - name: nvme.device_available_spare_perc
              description: Remaining spare capacity
              unit: '%'
              chart_type: line
              dimensions:
                - name: spare
            - name: nvme.device_composite_temperature
              description: Composite temperature
              unit: celsius
              chart_type: line
              dimensions:
                - name: temperature
            - name: nvme.device_io_transferred_count
              description: Amount of data transferred to and from device
              unit: bytes
              chart_type: area
              dimensions:
                - name: read
                - name: written
            - name: nvme.device_power_cycles_count
              description: Power cycles
              unit: cycles
              chart_type: line
              dimensions:
                - name: power
            - name: nvme.device_power_on_time
              description: Power-on time
              unit: seconds
              chart_type: line
              dimensions:
                - name: power-on
            - name: nvme.device_critical_warnings_state
              description: Critical warnings state
              unit: state
              chart_type: line
              dimensions:
                - name: available_spare
                - name: temp_threshold
                - name: nvm_subsystem_reliability
                - name: read_only
                - name: volatile_mem_backup_failed
                - name: persistent_memory_read_only
            - name: nvme.device_unsafe_shutdowns_count
              description: Unsafe shutdowns
              unit: shutdowns
              chart_type: line
              dimensions:
                - name: unsafe
            - name: nvme.device_media_errors_rate
              description: Media and data integrity errors
              unit: errors/s
              chart_type: line
              dimensions:
                - name: media
            - name: nvme.device_error_log_entries_rate
              description: Error log entries
              unit: entries/s
              chart_type: line
              dimensions:
                - name: error_log
            - name: nvme.device_warning_composite_temperature_time
              description: Warning composite temperature time
              unit: seconds
              chart_type: line
              dimensions:
                - name: wctemp
            - name: nvme.device_critical_composite_temperature_time
              description: Critical composite temperature time
              unit: seconds
              chart_type: line
              dimensions:
                - name: cctemp
            - name: nvme.device_thermal_mgmt_temp1_transitions_rate
              description: Thermal management temp1 transitions
              unit: transitions/s
              chart_type: line
              dimensions:
                - name: temp1
            - name: nvme.device_thermal_mgmt_temp2_transitions_rate
              description: Thermal management temp2 transitions
              unit: transitions/s
              chart_type: line
              dimensions:
                - name: temp2
            - name: nvme.device_thermal_mgmt_temp1_time
              description: Thermal management temp1 time
              unit: seconds
              chart_type: line
              dimensions:
                - name: temp1
            - name: nvme.device_thermal_mgmt_temp2_time
              description: Thermal management temp2 time
              unit: seconds
              chart_type: line
              dimensions:
                - name: temp2
|