Diffstat (limited to 'src/go/collectors/go.d.plugin/modules/nvme')
 l---------  src/go/collectors/go.d.plugin/modules/nvme/README.md                          |   1
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/charts.go                          | 267
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/collect.go                         | 120
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/config_schema.json                 |  36
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/exec.go                            |  94
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/init.go                            |  26
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/integrations/nvme_devices.md       | 207
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml                      | 225
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/nvme.go                            | 109
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/nvme_test.go                       | 430
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/testdata/config.json               |   4
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/testdata/config.yaml               |   2
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list-empty.json      |   4
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list.json            |  30
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-float.json |  24
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-string.json|  24
 -rw-r--r--  src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log.json       |  24
17 files changed, 1627 insertions(+), 0 deletions(-)
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/README.md b/src/go/collectors/go.d.plugin/modules/nvme/README.md
new file mode 120000
index 000000000..ca657b905
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/README.md
@@ -0,0 +1 @@
+integrations/nvme_devices.md
\ No newline at end of file
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/charts.go b/src/go/collectors/go.d.plugin/modules/nvme/charts.go
new file mode 100644
index 000000000..8404d2dcc
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/charts.go
@@ -0,0 +1,267 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package nvme
+
+import (
+ "fmt"
+ "strings"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/module"
+)
+
+const (
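+	// chart priorities determine the order in which charts appear on the dashboard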
+ _ = 2050 + iota // right after Disks section
+ prioDeviceEstimatedEndurancePerc
+ prioDeviceAvailableSparePerc
+ prioDeviceCompositeTemperature
+ prioDeviceIOTransferredCount
+ prioDevicePowerCyclesCount
+ prioDevicePowerOnTime
+ prioDeviceUnsafeShutdownsCount
+ prioDeviceCriticalWarningsState
+ prioDeviceMediaErrorsRate
+ prioDeviceErrorLogEntriesRate
+ prioDeviceWarningCompositeTemperatureTime
+ prioDeviceCriticalCompositeTemperatureTime
+	prioDeviceThmTemp1TransitionsRate
+ prioDeviceThmTemp2TransitionsRate
+ prioDeviceThmTemp1Time
+ prioDeviceThmTemp2Time
+)
+
+var deviceChartsTmpl = module.Charts{
+ deviceEstimatedEndurancePercChartTmpl.Copy(),
+ deviceAvailableSparePercChartTmpl.Copy(),
+ deviceCompositeTemperatureChartTmpl.Copy(),
+ deviceIOTransferredCountChartTmpl.Copy(),
+ devicePowerCyclesCountChartTmpl.Copy(),
+ devicePowerOnTimeChartTmpl.Copy(),
+ deviceUnsafeShutdownsCountChartTmpl.Copy(),
+ deviceCriticalWarningsStateChartTmpl.Copy(),
+ deviceMediaErrorsRateChartTmpl.Copy(),
+ deviceErrorLogEntriesRateChartTmpl.Copy(),
+ deviceWarnCompositeTemperatureTimeChartTmpl.Copy(),
+ deviceCritCompositeTemperatureTimeChartTmpl.Copy(),
+ deviceThmTemp1TransitionsRateChartTmpl.Copy(),
+ deviceThmTemp2TransitionsRateChartTmpl.Copy(),
+ deviceThmTemp1TimeChartTmpl.Copy(),
+ deviceThmTemp2TimeChartTmpl.Copy(),
+}
+
+var deviceEstimatedEndurancePercChartTmpl = module.Chart{
+ ID: "device_%s_estimated_endurance_perc",
+ Title: "Estimated endurance",
+ Units: "percentage",
+ Fam: "endurance",
+ Ctx: "nvme.device_estimated_endurance_perc",
+ Priority: prioDeviceEstimatedEndurancePerc,
+ Dims: module.Dims{
+ {ID: "device_%s_percentage_used", Name: "used"},
+ },
+}
+var deviceAvailableSparePercChartTmpl = module.Chart{
+ ID: "device_%s_available_spare_perc",
+ Title: "Remaining spare capacity",
+ Units: "percentage",
+ Fam: "spare",
+ Ctx: "nvme.device_available_spare_perc",
+ Priority: prioDeviceAvailableSparePerc,
+ Dims: module.Dims{
+ {ID: "device_%s_available_spare", Name: "spare"},
+ },
+}
+var deviceCompositeTemperatureChartTmpl = module.Chart{
+ ID: "device_%s_temperature",
+ Title: "Composite temperature",
+ Units: "celsius",
+ Fam: "temperature",
+ Ctx: "nvme.device_composite_temperature",
+ Priority: prioDeviceCompositeTemperature,
+ Dims: module.Dims{
+ {ID: "device_%s_temperature", Name: "temperature"},
+ },
+}
+var deviceIOTransferredCountChartTmpl = module.Chart{
+ ID: "device_%s_io_transferred_count",
+ Title: "Amount of data transferred to and from device",
+ Units: "bytes",
+ Fam: "transferred data",
+ Ctx: "nvme.device_io_transferred_count",
+ Priority: prioDeviceIOTransferredCount,
+ Type: module.Area,
+ Dims: module.Dims{
+ {ID: "device_%s_data_units_read", Name: "read"},
+ {ID: "device_%s_data_units_written", Name: "written", Mul: -1},
+ },
+}
+
+var devicePowerCyclesCountChartTmpl = module.Chart{
+ ID: "device_%s_power_cycles_count",
+ Title: "Power cycles",
+ Units: "cycles",
+ Fam: "power cycles",
+ Ctx: "nvme.device_power_cycles_count",
+ Priority: prioDevicePowerCyclesCount,
+ Dims: module.Dims{
+ {ID: "device_%s_power_cycles", Name: "power"},
+ },
+}
+var devicePowerOnTimeChartTmpl = module.Chart{
+ ID: "device_%s_power_on_time",
+ Title: "Power-on time",
+ Units: "seconds",
+ Fam: "power-on time",
+ Ctx: "nvme.device_power_on_time",
+ Priority: prioDevicePowerOnTime,
+ Dims: module.Dims{
+ {ID: "device_%s_power_on_time", Name: "power-on"},
+ },
+}
+var deviceCriticalWarningsStateChartTmpl = module.Chart{
+ ID: "device_%s_critical_warnings_state",
+ Title: "Critical warnings state",
+ Units: "state",
+ Fam: "critical warnings",
+ Ctx: "nvme.device_critical_warnings_state",
+ Priority: prioDeviceCriticalWarningsState,
+ Dims: module.Dims{
+ {ID: "device_%s_critical_warning_available_spare", Name: "available_spare"},
+ {ID: "device_%s_critical_warning_temp_threshold", Name: "temp_threshold"},
+ {ID: "device_%s_critical_warning_nvm_subsystem_reliability", Name: "nvm_subsystem_reliability"},
+ {ID: "device_%s_critical_warning_read_only", Name: "read_only"},
+ {ID: "device_%s_critical_warning_volatile_mem_backup_failed", Name: "volatile_mem_backup_failed"},
+ {ID: "device_%s_critical_warning_persistent_memory_read_only", Name: "persistent_memory_read_only"},
+ },
+}
+var deviceUnsafeShutdownsCountChartTmpl = module.Chart{
+ ID: "device_%s_unsafe_shutdowns_count",
+ Title: "Unsafe shutdowns",
+ Units: "shutdowns",
+ Fam: "shutdowns",
+ Ctx: "nvme.device_unsafe_shutdowns_count",
+ Priority: prioDeviceUnsafeShutdownsCount,
+ Dims: module.Dims{
+ {ID: "device_%s_unsafe_shutdowns", Name: "unsafe"},
+ },
+}
+var deviceMediaErrorsRateChartTmpl = module.Chart{
+ ID: "device_%s_media_errors_rate",
+ Title: "Media and data integrity errors",
+ Units: "errors/s",
+ Fam: "media errors",
+ Ctx: "nvme.device_media_errors_rate",
+ Priority: prioDeviceMediaErrorsRate,
+ Dims: module.Dims{
+ {ID: "device_%s_media_errors", Name: "media", Algo: module.Incremental},
+ },
+}
+var deviceErrorLogEntriesRateChartTmpl = module.Chart{
+ ID: "device_%s_error_log_entries_rate",
+ Title: "Error log entries",
+ Units: "entries/s",
+ Fam: "error log",
+ Ctx: "nvme.device_error_log_entries_rate",
+ Priority: prioDeviceErrorLogEntriesRate,
+ Dims: module.Dims{
+ {ID: "device_%s_num_err_log_entries", Name: "error_log", Algo: module.Incremental},
+ },
+}
+var deviceWarnCompositeTemperatureTimeChartTmpl = module.Chart{
+ ID: "device_%s_warning_composite_temperature_time",
+ Title: "Warning composite temperature time",
+ Units: "seconds",
+ Fam: "warn temp time",
+ Ctx: "nvme.device_warning_composite_temperature_time",
+ Priority: prioDeviceWarningCompositeTemperatureTime,
+ Dims: module.Dims{
+ {ID: "device_%s_warning_temp_time", Name: "wctemp"},
+ },
+}
+var deviceCritCompositeTemperatureTimeChartTmpl = module.Chart{
+ ID: "device_%s_critical_composite_temperature_time",
+ Title: "Critical composite temperature time",
+ Units: "seconds",
+ Fam: "crit temp time",
+ Ctx: "nvme.device_critical_composite_temperature_time",
+ Priority: prioDeviceCriticalCompositeTemperatureTime,
+ Dims: module.Dims{
+ {ID: "device_%s_critical_comp_time", Name: "cctemp"},
+ },
+}
+var (
+ deviceThmTemp1TransitionsRateChartTmpl = module.Chart{
+ ID: "device_%s_thm_temp1_transitions_rate",
+ Title: "Thermal management temp1 transitions",
+ Units: "transitions/s",
+ Fam: "thermal mgmt transitions",
+ Ctx: "nvme.device_thermal_mgmt_temp1_transitions_rate",
+		Priority: prioDeviceThmTemp1TransitionsRate,
+ Dims: module.Dims{
+ {ID: "device_%s_thm_temp1_trans_count", Name: "temp1", Algo: module.Incremental},
+ },
+ }
+ deviceThmTemp2TransitionsRateChartTmpl = module.Chart{
+ ID: "device_%s_thm_temp2_transitions_rate",
+ Title: "Thermal management temp2 transitions",
+ Units: "transitions/s",
+ Fam: "thermal mgmt transitions",
+ Ctx: "nvme.device_thermal_mgmt_temp2_transitions_rate",
+ Priority: prioDeviceThmTemp2TransitionsRate,
+ Dims: module.Dims{
+ {ID: "device_%s_thm_temp2_trans_count", Name: "temp2", Algo: module.Incremental},
+ },
+ }
+)
+var (
+ deviceThmTemp1TimeChartTmpl = module.Chart{
+ ID: "device_%s_thm_temp1_time",
+ Title: "Thermal management temp1 time",
+ Units: "seconds",
+ Fam: "thermal mgmt time",
+ Ctx: "nvme.device_thermal_mgmt_temp1_time",
+ Priority: prioDeviceThmTemp1Time,
+ Dims: module.Dims{
+ {ID: "device_%s_thm_temp1_total_time", Name: "temp1"},
+ },
+ }
+ deviceThmTemp2TimeChartTmpl = module.Chart{
+ ID: "device_%s_thm_temp2_time",
+		Title:    "Thermal management temp2 time",
+ Units: "seconds",
+ Fam: "thermal mgmt time",
+ Ctx: "nvme.device_thermal_mgmt_temp2_time",
+ Priority: prioDeviceThmTemp2Time,
+ Dims: module.Dims{
+ {ID: "device_%s_thm_temp2_total_time", Name: "temp2"},
+ },
+ }
+)
+
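+// addDeviceCharts instantiates the per-device chart templates, substituting
+// the device name into chart and dimension IDs, and registers the charts.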
+func (n *NVMe) addDeviceCharts(device string) {
+ charts := deviceChartsTmpl.Copy()
+
+ for _, chart := range *charts {
+ chart.ID = fmt.Sprintf(chart.ID, device)
+ chart.Labels = []module.Label{
+ {Key: "device", Value: device},
+ }
+ for _, dim := range chart.Dims {
+ dim.ID = fmt.Sprintf(dim.ID, device)
+ }
+ }
+
+ if err := n.Charts().Add(*charts...); err != nil {
+ n.Warning(err)
+ }
+}
+
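+// removeDeviceCharts marks all charts that belong to the given device for
+// removal from the dashboard.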
+func (n *NVMe) removeDeviceCharts(device string) {
+ px := fmt.Sprintf("device_%s", device)
+
+ for _, chart := range *n.Charts() {
+ if strings.HasPrefix(chart.ID, px) {
+ chart.MarkRemove()
+ chart.MarkNotCreated()
+ }
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/collect.go b/src/go/collectors/go.d.plugin/modules/nvme/collect.go
new file mode 100644
index 000000000..1cc942395
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/collect.go
@@ -0,0 +1,120 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package nvme
+
+import (
+ "errors"
+ "fmt"
+ "path/filepath"
+ "strconv"
+ "time"
+)
+
+func (n *NVMe) collect() (map[string]int64, error) {
+ if n.exec == nil {
+ return nil, errors.New("nvme-cli is not initialized (nil)")
+ }
+
+ now := time.Now()
+ if n.forceListDevices || now.Sub(n.listDevicesTime) > n.listDevicesEvery {
+ n.forceListDevices = false
+ n.listDevicesTime = now
+ if err := n.listNVMeDevices(); err != nil {
+ return nil, err
+ }
+ }
+
+ mx := make(map[string]int64)
+
+ for path := range n.devicePaths {
+ if err := n.collectNVMeDevice(mx, path); err != nil {
+ n.Error(err)
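+			// force a device re-list on the next collection attempt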
+ n.forceListDevices = true
+ continue
+ }
+ }
+
+ return mx, nil
+}
+
+func (n *NVMe) collectNVMeDevice(mx map[string]int64, devicePath string) error {
+ stats, err := n.exec.smartLog(devicePath)
+ if err != nil {
+ return fmt.Errorf("exec nvme smart-log for '%s': %v", devicePath, err)
+ }
+
+ device := extractDeviceFromPath(devicePath)
+
+ mx["device_"+device+"_temperature"] = int64(float64(parseValue(stats.Temperature)) - 273.15) // Kelvin => Celsius
+ mx["device_"+device+"_percentage_used"] = parseValue(stats.PercentUsed)
+ mx["device_"+device+"_available_spare"] = parseValue(stats.AvailSpare)
+	mx["device_"+device+"_data_units_read"] = parseValue(stats.DataUnitsRead) * 1000 * 512     // 1 data unit = 1000 512-byte blocks => bytes
+	mx["device_"+device+"_data_units_written"] = parseValue(stats.DataUnitsWritten) * 1000 * 512 // 1 data unit = 1000 512-byte blocks => bytes
+ mx["device_"+device+"_host_read_commands"] = parseValue(stats.HostReadCommands)
+ mx["device_"+device+"_host_write_commands"] = parseValue(stats.HostWriteCommands)
+ mx["device_"+device+"_power_cycles"] = parseValue(stats.PowerCycles)
+ mx["device_"+device+"_power_on_time"] = parseValue(stats.PowerOnHours) * 3600 // hours => seconds
+ mx["device_"+device+"_unsafe_shutdowns"] = parseValue(stats.UnsafeShutdowns)
+ mx["device_"+device+"_media_errors"] = parseValue(stats.MediaErrors)
+ mx["device_"+device+"_num_err_log_entries"] = parseValue(stats.NumErrLogEntries)
+ mx["device_"+device+"_controller_busy_time"] = parseValue(stats.ControllerBusyTime) * 60 // minutes => seconds
+ mx["device_"+device+"_warning_temp_time"] = parseValue(stats.WarningTempTime) * 60 // minutes => seconds
+ mx["device_"+device+"_critical_comp_time"] = parseValue(stats.CriticalCompTime) * 60 // minutes => seconds
+ mx["device_"+device+"_thm_temp1_trans_count"] = parseValue(stats.ThmTemp1TransCount)
+ mx["device_"+device+"_thm_temp2_trans_count"] = parseValue(stats.ThmTemp2TransCount)
+ mx["device_"+device+"_thm_temp1_total_time"] = parseValue(stats.ThmTemp1TotalTime) // seconds
+ mx["device_"+device+"_thm_temp2_total_time"] = parseValue(stats.ThmTemp2TotalTime) // seconds
+
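+	// critical_warning is a bit field; bits 0-5 correspond to the Critical
+	// Warning flags defined in the SMART / Health Information Log page of
+	// the NVMe specification.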
+ mx["device_"+device+"_critical_warning_available_spare"] = boolToInt(parseValue(stats.CriticalWarning)&1 != 0)
+ mx["device_"+device+"_critical_warning_temp_threshold"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<1) != 0)
+ mx["device_"+device+"_critical_warning_nvm_subsystem_reliability"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<2) != 0)
+ mx["device_"+device+"_critical_warning_read_only"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<3) != 0)
+ mx["device_"+device+"_critical_warning_volatile_mem_backup_failed"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<4) != 0)
+ mx["device_"+device+"_critical_warning_persistent_memory_read_only"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<5) != 0)
+
+ return nil
+}
+
+func (n *NVMe) listNVMeDevices() error {
+ devices, err := n.exec.list()
+ if err != nil {
+ return fmt.Errorf("exec nvme list: %v", err)
+ }
+
+ seen := make(map[string]bool)
+ for _, v := range devices.Devices {
+ device := extractDeviceFromPath(v.DevicePath)
+ seen[device] = true
+
+ if !n.devicePaths[v.DevicePath] {
+ n.devicePaths[v.DevicePath] = true
+ n.addDeviceCharts(device)
+ }
+ }
+ for path := range n.devicePaths {
+ device := extractDeviceFromPath(path)
+ if !seen[device] {
+			delete(n.devicePaths, path)
+ n.removeDeviceCharts(device)
+ }
+ }
+
+ return nil
+}
+
+func extractDeviceFromPath(devicePath string) string {
+ _, name := filepath.Split(devicePath)
+ return name
+}
+
+func boolToInt(v bool) int64 {
+ if v {
+ return 1
+ }
+ return 0
+}
+
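+// parseValue converts an nvmeNumber (which may hold an integer, float, or
+// quoted string representation) to int64, truncating any fractional part.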
+func parseValue(s nvmeNumber) int64 {
+ v, _ := strconv.ParseFloat(string(s), 64)
+ return int64(v)
+}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/config_schema.json b/src/go/collectors/go.d.plugin/modules/nvme/config_schema.json
new file mode 100644
index 000000000..179a24ab1
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/config_schema.json
@@ -0,0 +1,36 @@
+{
+ "jsonSchema": {
+ "$schema": "http://json-schema.org/draft-07/schema#",
+ "title": "NVMe collector configuration",
+ "type": "object",
+ "properties": {
+ "update_every": {
+ "title": "Update every",
+ "description": "Data collection interval, measured in seconds.",
+ "type": "integer",
+ "minimum": 1,
+ "default": 10
+ },
+ "timeout": {
+ "title": "Timeout",
+        "description": "Timeout for executing the `nvme` binary, specified in seconds.",
+ "type": "number",
+ "minimum": 0.5,
+ "default": 2
+ }
+ },
+ "required": [],
+ "additionalProperties": false,
+ "patternProperties": {
+ "^name$": {}
+ }
+ },
+ "uiSchema": {
+ "uiOptions": {
+ "fullPage": true
+ },
+ "timeout": {
+ "ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)."
+ }
+ }
+}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/exec.go b/src/go/collectors/go.d.plugin/modules/nvme/exec.go
new file mode 100644
index 000000000..8c1281a2f
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/exec.go
@@ -0,0 +1,94 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package nvme
+
+import (
+ "bytes"
+ "context"
+ "encoding/json"
+ "os/exec"
+ "time"
+)
+
+type nvmeDeviceList struct {
+ Devices []struct {
+ DevicePath string `json:"DevicePath"`
+ UsedBytes nvmeNumber `json:"UsedBytes"`
+ PhysicalSize nvmeNumber `json:"PhysicalSize"`
+ SectorSize nvmeNumber `json:"SectorSize"`
+ }
+}
+
+// See "Health Information Log Page" in the Current Specification Version
+// https://nvmexpress.org/developers/nvme-specification/
+type nvmeDeviceSmartLog struct {
+ CriticalWarning nvmeNumber `json:"critical_warning"`
+ Temperature nvmeNumber `json:"temperature"`
+ AvailSpare nvmeNumber `json:"avail_spare"`
+ SpareThresh nvmeNumber `json:"spare_thresh"`
+ PercentUsed nvmeNumber `json:"percent_used"`
+ DataUnitsRead nvmeNumber `json:"data_units_read"`
+ DataUnitsWritten nvmeNumber `json:"data_units_written"`
+ HostReadCommands nvmeNumber `json:"host_read_commands"`
+ HostWriteCommands nvmeNumber `json:"host_write_commands"`
+ ControllerBusyTime nvmeNumber `json:"controller_busy_time"`
+ PowerCycles nvmeNumber `json:"power_cycles"`
+ PowerOnHours nvmeNumber `json:"power_on_hours"`
+ UnsafeShutdowns nvmeNumber `json:"unsafe_shutdowns"`
+ MediaErrors nvmeNumber `json:"media_errors"`
+ NumErrLogEntries nvmeNumber `json:"num_err_log_entries"`
+ WarningTempTime nvmeNumber `json:"warning_temp_time"`
+ CriticalCompTime nvmeNumber `json:"critical_comp_time"`
+ ThmTemp1TransCount nvmeNumber `json:"thm_temp1_trans_count"`
+ ThmTemp2TransCount nvmeNumber `json:"thm_temp2_trans_count"`
+ ThmTemp1TotalTime nvmeNumber `json:"thm_temp1_total_time"`
+ ThmTemp2TotalTime nvmeNumber `json:"thm_temp2_total_time"`
+}
+
+// nvme-cli 2.1.1 exposes some values as strings
+type nvmeNumber string
+
+func (n *nvmeNumber) UnmarshalJSON(b []byte) error {
+ *n = nvmeNumber(bytes.Trim(b, "\""))
+ return nil
+}
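+
+// For example, each of the following JSON fragments decodes to the same
+// nvmeNumber value (parseValue in collect.go later converts it to int64):
+//
+//	{"temperature": 310}   -> nvmeNumber("310")
+//	{"temperature": "310"} -> nvmeNumber("310")
+//	{"temperature": 310.0} -> nvmeNumber("310.0")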
+
+type nvmeCLIExec struct {
+ ndsudoPath string
+ timeout time.Duration
+}
+
+func (n *nvmeCLIExec) list() (*nvmeDeviceList, error) {
+ bs, err := n.execute("nvme-list")
+ if err != nil {
+ return nil, err
+ }
+
+ var v nvmeDeviceList
+ if err := json.Unmarshal(bs, &v); err != nil {
+ return nil, err
+ }
+
+ return &v, nil
+}
+
+func (n *nvmeCLIExec) smartLog(devicePath string) (*nvmeDeviceSmartLog, error) {
+ bs, err := n.execute("nvme-smart-log", "--device", devicePath)
+ if err != nil {
+ return nil, err
+ }
+
+ var v nvmeDeviceSmartLog
+ if err := json.Unmarshal(bs, &v); err != nil {
+ return nil, err
+ }
+
+ return &v, nil
+}
+
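+// execute runs ndsudo with the given arguments under the configured timeout
+// and returns the command's standard output.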
+func (n *nvmeCLIExec) execute(arg ...string) ([]byte, error) {
+ ctx, cancel := context.WithTimeout(context.Background(), n.timeout)
+ defer cancel()
+
+ return exec.CommandContext(ctx, n.ndsudoPath, arg...).Output()
+}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/init.go b/src/go/collectors/go.d.plugin/modules/nvme/init.go
new file mode 100644
index 000000000..51f1400a0
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/init.go
@@ -0,0 +1,26 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package nvme
+
+import (
+ "fmt"
+ "os"
+ "path/filepath"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/executable"
+)
+
+func (n *NVMe) initNVMeCLIExec() (nvmeCLI, error) {
+ ndsudoPath := filepath.Join(executable.Directory, "ndsudo")
+
+ if _, err := os.Stat(ndsudoPath); err != nil {
+ return nil, fmt.Errorf("ndsudo executable not found: %v", err)
+ }
+
+ nvmeExec := &nvmeCLIExec{
+ ndsudoPath: ndsudoPath,
+ timeout: n.Timeout.Duration(),
+ }
+
+ return nvmeExec, nil
+}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/integrations/nvme_devices.md b/src/go/collectors/go.d.plugin/modules/nvme/integrations/nvme_devices.md
new file mode 100644
index 000000000..fd18c1fd2
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/integrations/nvme_devices.md
@@ -0,0 +1,207 @@
+<!--startmeta
+custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/go/collectors/go.d.plugin/modules/nvme/README.md"
+meta_yaml: "https://github.com/netdata/netdata/edit/master/src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml"
+sidebar_label: "NVMe devices"
+learn_status: "Published"
+learn_rel_path: "Collecting Metrics/Storage, Mount Points and Filesystems"
+most_popular: False
+message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE"
+endmeta-->
+
+# NVMe devices
+
+
+<img src="https://netdata.cloud/img/nvme.svg" width="150"/>
+
+
+Plugin: go.d.plugin
+Module: nvme
+
+<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" />
+
+## Overview
+
+This collector monitors the health of NVMe devices. It relies on the [`nvme`](https://github.com/linux-nvme/nvme-cli#nvme-cli) CLI tool but avoids directly executing the binary. Instead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment. This approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management.
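+
+Under the hood, each data collection cycle is roughly equivalent to running the following commands through `ndsudo` (`/dev/nvme0n1` is an example device path):
+
+```bash
+ndsudo nvme-list                             # discover NVMe devices (JSON output)
+ndsudo nvme-smart-log --device /dev/nvme0n1  # read a device's SMART / health log
+```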
+
+
+
+
+This collector is supported on all platforms.
+
+This collector supports collecting metrics from multiple instances of this integration, including remote instances.
+
+
+### Default Behavior
+
+#### Auto-Detection
+
+This integration doesn't support auto-detection.
+
+#### Limits
+
+The default configuration for this integration does not impose any limits on data collection.
+
+#### Performance Impact
+
+The default configuration for this integration is not expected to impose a significant performance impact on the system.
+
+
+## Metrics
+
+Metrics grouped by *scope*.
+
+The scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels.
+
+
+
+### Per device
+
+These metrics refer to the NVMe device.
+
+Labels:
+
+| Label | Description |
+|:-----------|:----------------|
+| device | NVMe device name |
+
+Metrics:
+
+| Metric | Dimensions | Unit |
+|:------|:----------|:----|
+| nvme.device_estimated_endurance_perc | used | % |
+| nvme.device_available_spare_perc | spare | % |
+| nvme.device_composite_temperature | temperature | celsius |
+| nvme.device_io_transferred_count | read, written | bytes |
+| nvme.device_power_cycles_count | power | cycles |
+| nvme.device_power_on_time | power-on | seconds |
+| nvme.device_critical_warnings_state | available_spare, temp_threshold, nvm_subsystem_reliability, read_only, volatile_mem_backup_failed, persistent_memory_read_only | state |
+| nvme.device_unsafe_shutdowns_count | unsafe | shutdowns |
+| nvme.device_media_errors_rate | media | errors/s |
+| nvme.device_error_log_entries_rate | error_log | entries/s |
+| nvme.device_warning_composite_temperature_time | wctemp | seconds |
+| nvme.device_critical_composite_temperature_time | cctemp | seconds |
+| nvme.device_thermal_mgmt_temp1_transitions_rate | temp1 | transitions/s |
+| nvme.device_thermal_mgmt_temp2_transitions_rate | temp2 | transitions/s |
+| nvme.device_thermal_mgmt_temp1_time | temp1 | seconds |
+| nvme.device_thermal_mgmt_temp2_time | temp2 | seconds |
+
+
+
+## Alerts
+
+
+The following alerts are available:
+
+| Alert name | On metric | Description |
+|:------------|:----------|:------------|
+| [ nvme_device_critical_warnings_state ](https://github.com/netdata/netdata/blob/master/src/health/health.d/nvme.conf) | nvme.device_critical_warnings_state | NVMe device ${label:device} has critical warnings |
+
+
+## Setup
+
+### Prerequisites
+
+#### Install nvme-cli
+
+See [Distro Support](https://github.com/linux-nvme/nvme-cli#distro-support). Install `nvme-cli` using your distribution's package manager.
+
+
+#### For Netdata running in a Docker container: grant NVMe device access
+
+Your NVMe devices need to be accessible within the Docker container for Netdata to monitor them.
+
+Include the following option in your `docker run` command or add the device mapping in your `docker-compose.yml` file:
+
+- `docker run`
+
+ ```bash
+ --device '/dev/nvme0n1:/dev/nvme0n1'
+ ```
+
+- `docker-compose.yml`
+
+ ```yaml
+ services:
+ netdata:
+ devices:
+ - "/dev/nvme0n1:/dev/nvme0n1"
+ ```
+
+**Note**: Replace `/dev/nvme0n1` with your actual NVMe device name.
+
+
+
+### Configuration
+
+#### File
+
+The configuration file name for this integration is `go.d/nvme.conf`.
+
+
+You can edit the configuration file using the `edit-config` script from the
+Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory).
+
+```bash
+cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata
+sudo ./edit-config go.d/nvme.conf
+```
+
+#### Options
+
+The following options can be defined globally: update_every, autodetection_retry.
+
+
+<details open><summary>Config options</summary>
+
+| Name | Description | Default | Required |
+|:----|:-----------|:-------|:--------:|
+| update_every | Data collection frequency. | 10 | no |
+| autodetection_retry | Recheck interval in seconds. Zero means no recheck will be scheduled. | 0 | no |
+| timeout | `nvme` binary execution timeout, in seconds. | 2 | no |
+
+</details>
+
+#### Examples
+
+##### Custom update_every
+
+Allows you to override the default data collection interval.
+
+<details open><summary>Config</summary>
+
+```yaml
+jobs:
+ - name: nvme
+ update_every: 5 # Collect NVMe metrics every 5 seconds
+
+```
+</details>
+
+
+
+## Troubleshooting
+
+### Debug Mode
+
+To troubleshoot issues with the `nvme` collector, run the `go.d.plugin` with the debug option enabled. The output
+should give you clues as to why the collector isn't working.
+
+- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on
+ your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`.
+
+ ```bash
+ cd /usr/libexec/netdata/plugins.d/
+ ```
+
+- Switch to the `netdata` user.
+
+ ```bash
+ sudo -u netdata -s
+ ```
+
+- Run the `go.d.plugin` to debug the collector:
+
+ ```bash
+ ./go.d.plugin -d -m nvme
+ ```
+
+
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml b/src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml
new file mode 100644
index 000000000..98f35af65
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml
@@ -0,0 +1,225 @@
+plugin_name: go.d.plugin
+modules:
+ - meta:
+ id: collector-go.d.plugin-nvme
+ plugin_name: go.d.plugin
+ module_name: nvme
+ monitored_instance:
+ name: NVMe devices
+ link: ""
+ icon_filename: nvme.svg
+ categories:
+ - data-collection.storage-mount-points-and-filesystems
+ keywords:
+ - nvme
+ related_resources:
+ integrations:
+ list: []
+ info_provided_to_referring_integrations:
+ description: ""
+ most_popular: false
+ overview:
+ data_collection:
+ metrics_description: >
+ This collector monitors the health of NVMe devices.
+ It relies on the [`nvme`](https://github.com/linux-nvme/nvme-cli#nvme-cli) CLI tool but avoids directly executing the binary.
+ Instead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment.
+ This approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management.
+ method_description: ""
+ supported_platforms:
+ include: []
+ exclude: []
+ multi_instance: true
+ additional_permissions:
+ description: ""
+ default_behavior:
+ auto_detection:
+ description: ""
+ limits:
+ description: ""
+ performance_impact:
+ description: ""
+ setup:
+ prerequisites:
+ list:
+ - title: Install nvme-cli
+ description: |
+ See [Distro Support](https://github.com/linux-nvme/nvme-cli#distro-support). Install `nvme-cli` using your distribution's package manager.
+ - title: "For Netdata running in a Docker container: grant NVMe device access"
+ description: |
+ Your NVMe devices need to be accessible within the Docker container for Netdata to monitor them.
+
+ Include the following option in your `docker run` command or add the device mapping in your `docker-compose.yml` file:
+
+ - `docker run`
+
+ ```bash
+ --device '/dev/nvme0n1:/dev/nvme0n1'
+ ```
+
+ - `docker-compose.yml`
+
+ ```yaml
+ services:
+ netdata:
+ devices:
+ - "/dev/nvme0n1:/dev/nvme0n1"
+ ```
+
+ **Note**: Replace `/dev/nvme0n1` with your actual NVMe device name.
+ configuration:
+ file:
+ name: go.d/nvme.conf
+ options:
+ description: |
+ The following options can be defined globally: update_every, autodetection_retry.
+ folding:
+ title: Config options
+ enabled: true
+ list:
+ - name: update_every
+ description: Data collection frequency.
+ default_value: 10
+ required: false
+ - name: autodetection_retry
+ description: Recheck interval in seconds. Zero means no recheck will be scheduled.
+ default_value: 0
+ required: false
+ - name: timeout
+          description: "`nvme` binary execution timeout, in seconds."
+ default_value: 2
+ required: false
+ examples:
+ folding:
+ title: Config
+ enabled: true
+ list:
+ - name: Custom update_every
+ description: Allows you to override the default data collection interval.
+ config: |
+ jobs:
+ - name: nvme
+ update_every: 5 # Collect NVMe metrics every 5 seconds
+ troubleshooting:
+ problems:
+ list: []
+ alerts:
+ - name: nvme_device_critical_warnings_state
+ metric: nvme.device_critical_warnings_state
+ info: "NVMe device ${label:device} has critical warnings"
+ link: https://github.com/netdata/netdata/blob/master/src/health/health.d/nvme.conf
+ metrics:
+ folding:
+ title: Metrics
+ enabled: false
+ description: ""
+ availability: []
+ scopes:
+ - name: device
+          description: These metrics refer to the NVMe device.
+ labels:
+ - name: device
+ description: NVMe device name
+ metrics:
+ - name: nvme.device_estimated_endurance_perc
+ description: Estimated endurance
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: used
+ - name: nvme.device_available_spare_perc
+ description: Remaining spare capacity
+ unit: '%'
+ chart_type: line
+ dimensions:
+ - name: spare
+ - name: nvme.device_composite_temperature
+ description: Composite temperature
+ unit: celsius
+ chart_type: line
+ dimensions:
+ - name: temperature
+ - name: nvme.device_io_transferred_count
+ description: Amount of data transferred to and from device
+ unit: bytes
+ chart_type: area
+ dimensions:
+ - name: read
+ - name: written
+ - name: nvme.device_power_cycles_count
+ description: Power cycles
+ unit: cycles
+ chart_type: line
+ dimensions:
+ - name: power
+ - name: nvme.device_power_on_time
+ description: Power-on time
+ unit: seconds
+ chart_type: line
+ dimensions:
+ - name: power-on
+ - name: nvme.device_critical_warnings_state
+ description: Critical warnings state
+ unit: state
+ chart_type: line
+ dimensions:
+ - name: available_spare
+ - name: temp_threshold
+ - name: nvm_subsystem_reliability
+ - name: read_only
+ - name: volatile_mem_backup_failed
+ - name: persistent_memory_read_only
+ - name: nvme.device_unsafe_shutdowns_count
+ description: Unsafe shutdowns
+ unit: shutdowns
+ chart_type: line
+ dimensions:
+ - name: unsafe
+ - name: nvme.device_media_errors_rate
+ description: Media and data integrity errors
+ unit: errors/s
+ chart_type: line
+ dimensions:
+ - name: media
+ - name: nvme.device_error_log_entries_rate
+ description: Error log entries
+ unit: entries/s
+ chart_type: line
+ dimensions:
+ - name: error_log
+ - name: nvme.device_warning_composite_temperature_time
+ description: Warning composite temperature time
+ unit: seconds
+ chart_type: line
+ dimensions:
+ - name: wctemp
+ - name: nvme.device_critical_composite_temperature_time
+ description: Critical composite temperature time
+ unit: seconds
+ chart_type: line
+ dimensions:
+ - name: cctemp
+ - name: nvme.device_thermal_mgmt_temp1_transitions_rate
+ description: Thermal management temp1 transitions
+ unit: transitions/s
+ chart_type: line
+ dimensions:
+ - name: temp1
+ - name: nvme.device_thermal_mgmt_temp2_transitions_rate
+ description: Thermal management temp2 transitions
+ unit: transitions/s
+ chart_type: line
+ dimensions:
+ - name: temp2
+ - name: nvme.device_thermal_mgmt_temp1_time
+ description: Thermal management temp1 time
+ unit: seconds
+ chart_type: line
+ dimensions:
+ - name: temp1
+ - name: nvme.device_thermal_mgmt_temp2_time
+ description: Thermal management temp2 time
+ unit: seconds
+ chart_type: line
+ dimensions:
+ - name: temp2
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/nvme.go b/src/go/collectors/go.d.plugin/modules/nvme/nvme.go
new file mode 100644
index 000000000..76b6445b3
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/nvme.go
@@ -0,0 +1,109 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package nvme
+
+import (
+ _ "embed"
+ "errors"
+ "time"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/module"
+ "github.com/netdata/netdata/go/go.d.plugin/pkg/web"
+)
+
+//go:embed "config_schema.json"
+var configSchema string
+
+func init() {
+ module.Register("nvme", module.Creator{
+ JobConfigSchema: configSchema,
+ Defaults: module.Defaults{
+ UpdateEvery: 10,
+ },
+ Create: func() module.Module { return New() },
+ Config: func() any { return &Config{} },
+ })
+}
+
+func New() *NVMe {
+ return &NVMe{
+ Config: Config{
+ Timeout: web.Duration(time.Second * 2),
+ },
+
+ charts: &module.Charts{},
+ devicePaths: make(map[string]bool),
+ listDevicesEvery: time.Minute * 10,
+ }
+}
+
+type Config struct {
+ UpdateEvery int `yaml:"update_every,omitempty" json:"update_every"`
+ Timeout web.Duration `yaml:"timeout,omitempty" json:"timeout"`
+}
+
+type (
+ NVMe struct {
+ module.Base
+ Config `yaml:",inline" json:""`
+
+ charts *module.Charts
+
+ exec nvmeCLI
+
+ devicePaths map[string]bool
+ listDevicesTime time.Time
+ listDevicesEvery time.Duration
+ forceListDevices bool
+ }
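+
+	// nvmeCLI abstracts the nvme-cli invocations (via ndsudo) that the
+	// collector depends on; it is implemented by nvmeCLIExec in exec.go
+	// and mocked in the unit tests.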
+ nvmeCLI interface {
+ list() (*nvmeDeviceList, error)
+ smartLog(devicePath string) (*nvmeDeviceSmartLog, error)
+ }
+)
+
+func (n *NVMe) Configuration() any {
+ return n.Config
+}
+
+func (n *NVMe) Init() error {
+ nvmeExec, err := n.initNVMeCLIExec()
+ if err != nil {
+ n.Errorf("init nvme-cli exec: %v", err)
+ return err
+ }
+ n.exec = nvmeExec
+
+ return nil
+}
+
+func (n *NVMe) Check() error {
+ mx, err := n.collect()
+ if err != nil {
+ n.Error(err)
+ return err
+ }
+ if len(mx) == 0 {
+ return errors.New("no metrics collected")
+ }
+ return nil
+}
+
+func (n *NVMe) Charts() *module.Charts {
+ return n.charts
+}
+
+func (n *NVMe) Collect() map[string]int64 {
+ mx, err := n.collect()
+ if err != nil {
+ n.Error(err)
+ }
+
+ if len(mx) == 0 {
+ return nil
+ }
+ return mx
+}
+
+func (n *NVMe) Cleanup() {}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/nvme_test.go b/src/go/collectors/go.d.plugin/modules/nvme/nvme_test.go
new file mode 100644
index 000000000..ab814442d
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/nvme_test.go
@@ -0,0 +1,430 @@
+// SPDX-License-Identifier: GPL-3.0-or-later
+
+package nvme
+
+import (
+ "encoding/json"
+ "errors"
+ "fmt"
+ "os"
+ "testing"
+
+ "github.com/netdata/netdata/go/go.d.plugin/agent/module"
+
+ "github.com/stretchr/testify/assert"
+ "github.com/stretchr/testify/require"
+)
+
+var (
+ dataConfigJSON, _ = os.ReadFile("testdata/config.json")
+ dataConfigYAML, _ = os.ReadFile("testdata/config.yaml")
+
+ dataNVMeListJSON, _ = os.ReadFile("testdata/nvme-list.json")
+ dataNVMeListEmptyJSON, _ = os.ReadFile("testdata/nvme-list-empty.json")
+ dataNVMeSmartLogJSON, _ = os.ReadFile("testdata/nvme-smart-log.json")
+ dataNVMeSmartLogStringJSON, _ = os.ReadFile("testdata/nvme-smart-log-string.json")
+ dataNVMeSmartLogFloatJSON, _ = os.ReadFile("testdata/nvme-smart-log-float.json")
+)
+
+func Test_testDataIsValid(t *testing.T) {
+ for name, data := range map[string][]byte{
+ "dataConfigJSON": dataConfigJSON,
+ "dataConfigYAML": dataConfigYAML,
+ "dataNVMeListJSON": dataNVMeListJSON,
+ "dataNVMeListEmptyJSON": dataNVMeListEmptyJSON,
+		"dataNVMeSmartLogJSON":       dataNVMeSmartLogJSON,
+		"dataNVMeSmartLogStringJSON": dataNVMeSmartLogStringJSON,
+ "dataNVMeSmartLogFloatJSON": dataNVMeSmartLogFloatJSON,
+ } {
+ require.NotNil(t, data, name)
+ }
+}
+
+func TestNVMe_ConfigurationSerialize(t *testing.T) {
+ module.TestConfigurationSerialize(t, &NVMe{}, dataConfigJSON, dataConfigYAML)
+}
+
+func TestNVMe_Init(t *testing.T) {
+ tests := map[string]struct {
+ config Config
+ wantFail bool
+ }{
+ "fails if 'ndsudo' not found": {
+ wantFail: true,
+ config: New().Config,
+ },
+ }
+
+ for name, test := range tests {
+ t.Run(name, func(t *testing.T) {
+ nv := New()
+
+ if test.wantFail {
+ assert.Error(t, nv.Init())
+ } else {
+ assert.NoError(t, nv.Init())
+ }
+ })
+ }
+}
+
+func TestNVMe_Charts(t *testing.T) {
+ assert.NotNil(t, New().Charts())
+}
+
+func TestNVMe_Cleanup(t *testing.T) {
+ assert.NotPanics(t, New().Cleanup)
+}
+
+func TestNVMe_Check(t *testing.T) {
+ tests := map[string]struct {
+ wantFail bool
+ prepare func(n *NVMe)
+ }{
+ "success if all calls successful": {
+ wantFail: false,
+ prepare: prepareCaseOK,
+ },
+ "fails if 'nvme list' returns an empty list": {
+ wantFail: true,
+ prepare: prepareCaseEmptyList,
+ },
+ "fails if 'nvme list' returns an error": {
+ wantFail: true,
+ prepare: prepareCaseErrOnList,
+ },
+ "fails if 'nvme smart-log' returns an error": {
+ wantFail: true,
+ prepare: prepareCaseErrOnSmartLog,
+ },
+ }
+
+ for name, test := range tests {
+ t.Run(name, func(t *testing.T) {
+ n := New()
+
+ test.prepare(n)
+
+ if test.wantFail {
+ assert.Error(t, n.Check())
+ } else {
+ assert.NoError(t, n.Check())
+ }
+ })
+ }
+}
+
+func TestNVMe_Collect(t *testing.T) {
+ type testCaseStep struct {
+ prepare func(n *NVMe)
+ check func(t *testing.T, n *NVMe)
+ }
+
+ tests := map[string][]testCaseStep{
+ "success if all calls successful": {
+ {
+ prepare: prepareCaseOK,
+ check: func(t *testing.T, n *NVMe) {
+ mx := n.Collect()
+
+ expected := map[string]int64{
+ "device_nvme0n1_available_spare": 100,
+ "device_nvme0n1_controller_busy_time": 497040,
+ "device_nvme0n1_critical_comp_time": 0,
+ "device_nvme0n1_critical_warning_available_spare": 0,
+ "device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0,
+ "device_nvme0n1_critical_warning_persistent_memory_read_only": 0,
+ "device_nvme0n1_critical_warning_read_only": 0,
+ "device_nvme0n1_critical_warning_temp_threshold": 0,
+ "device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0,
+ "device_nvme0n1_data_units_read": 5068041216000,
+ "device_nvme0n1_data_units_written": 69712734208000,
+ "device_nvme0n1_host_read_commands": 313528805,
+ "device_nvme0n1_host_write_commands": 1928062610,
+ "device_nvme0n1_media_errors": 0,
+ "device_nvme0n1_num_err_log_entries": 110,
+ "device_nvme0n1_percentage_used": 2,
+ "device_nvme0n1_power_cycles": 64,
+ "device_nvme0n1_power_on_time": 17906400,
+ "device_nvme0n1_temperature": 36,
+ "device_nvme0n1_thm_temp1_total_time": 0,
+ "device_nvme0n1_thm_temp1_trans_count": 0,
+ "device_nvme0n1_thm_temp2_total_time": 0,
+ "device_nvme0n1_thm_temp2_trans_count": 0,
+ "device_nvme0n1_unsafe_shutdowns": 39,
+ "device_nvme0n1_warning_temp_time": 0,
+ "device_nvme1n1_available_spare": 100,
+ "device_nvme1n1_controller_busy_time": 497040,
+ "device_nvme1n1_critical_comp_time": 0,
+ "device_nvme1n1_critical_warning_available_spare": 0,
+ "device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0,
+ "device_nvme1n1_critical_warning_persistent_memory_read_only": 0,
+ "device_nvme1n1_critical_warning_read_only": 0,
+ "device_nvme1n1_critical_warning_temp_threshold": 0,
+ "device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0,
+ "device_nvme1n1_data_units_read": 5068041216000,
+ "device_nvme1n1_data_units_written": 69712734208000,
+ "device_nvme1n1_host_read_commands": 313528805,
+ "device_nvme1n1_host_write_commands": 1928062610,
+ "device_nvme1n1_media_errors": 0,
+ "device_nvme1n1_num_err_log_entries": 110,
+ "device_nvme1n1_percentage_used": 2,
+ "device_nvme1n1_power_cycles": 64,
+ "device_nvme1n1_power_on_time": 17906400,
+ "device_nvme1n1_temperature": 36,
+ "device_nvme1n1_thm_temp1_total_time": 0,
+ "device_nvme1n1_thm_temp1_trans_count": 0,
+ "device_nvme1n1_thm_temp2_total_time": 0,
+ "device_nvme1n1_thm_temp2_trans_count": 0,
+ "device_nvme1n1_unsafe_shutdowns": 39,
+ "device_nvme1n1_warning_temp_time": 0,
+ }
+
+ assert.Equal(t, expected, mx)
+ },
+ },
+ },
+ "success if all calls successful with string values": {
+ {
+ prepare: prepareCaseStringValuesOK,
+ check: func(t *testing.T, n *NVMe) {
+ mx := n.Collect()
+
+ expected := map[string]int64{
+ "device_nvme0n1_available_spare": 100,
+ "device_nvme0n1_controller_busy_time": 497040,
+ "device_nvme0n1_critical_comp_time": 0,
+ "device_nvme0n1_critical_warning_available_spare": 0,
+ "device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0,
+ "device_nvme0n1_critical_warning_persistent_memory_read_only": 0,
+ "device_nvme0n1_critical_warning_read_only": 0,
+ "device_nvme0n1_critical_warning_temp_threshold": 0,
+ "device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0,
+ "device_nvme0n1_data_units_read": 5068041216000,
+ "device_nvme0n1_data_units_written": 69712734208000,
+ "device_nvme0n1_host_read_commands": 313528805,
+ "device_nvme0n1_host_write_commands": 1928062610,
+ "device_nvme0n1_media_errors": 0,
+ "device_nvme0n1_num_err_log_entries": 110,
+ "device_nvme0n1_percentage_used": 2,
+ "device_nvme0n1_power_cycles": 64,
+ "device_nvme0n1_power_on_time": 17906400,
+ "device_nvme0n1_temperature": 36,
+ "device_nvme0n1_thm_temp1_total_time": 0,
+ "device_nvme0n1_thm_temp1_trans_count": 0,
+ "device_nvme0n1_thm_temp2_total_time": 0,
+ "device_nvme0n1_thm_temp2_trans_count": 0,
+ "device_nvme0n1_unsafe_shutdowns": 39,
+ "device_nvme0n1_warning_temp_time": 0,
+ "device_nvme1n1_available_spare": 100,
+ "device_nvme1n1_controller_busy_time": 497040,
+ "device_nvme1n1_critical_comp_time": 0,
+ "device_nvme1n1_critical_warning_available_spare": 0,
+ "device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0,
+ "device_nvme1n1_critical_warning_persistent_memory_read_only": 0,
+ "device_nvme1n1_critical_warning_read_only": 0,
+ "device_nvme1n1_critical_warning_temp_threshold": 0,
+ "device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0,
+ "device_nvme1n1_data_units_read": 5068041216000,
+ "device_nvme1n1_data_units_written": 69712734208000,
+ "device_nvme1n1_host_read_commands": 313528805,
+ "device_nvme1n1_host_write_commands": 1928062610,
+ "device_nvme1n1_media_errors": 0,
+ "device_nvme1n1_num_err_log_entries": 110,
+ "device_nvme1n1_percentage_used": 2,
+ "device_nvme1n1_power_cycles": 64,
+ "device_nvme1n1_power_on_time": 17906400,
+ "device_nvme1n1_temperature": 36,
+ "device_nvme1n1_thm_temp1_total_time": 0,
+ "device_nvme1n1_thm_temp1_trans_count": 0,
+ "device_nvme1n1_thm_temp2_total_time": 0,
+ "device_nvme1n1_thm_temp2_trans_count": 0,
+ "device_nvme1n1_unsafe_shutdowns": 39,
+ "device_nvme1n1_warning_temp_time": 0,
+ }
+
+ assert.Equal(t, expected, mx)
+ },
+ },
+ },
+ "success if all calls successful with float values": {
+ {
+ prepare: prepareCaseFloatValuesOK,
+ check: func(t *testing.T, n *NVMe) {
+ mx := n.Collect()
+
+ expected := map[string]int64{
+ "device_nvme0n1_available_spare": 100,
+ "device_nvme0n1_controller_busy_time": 497040,
+ "device_nvme0n1_critical_comp_time": 0,
+ "device_nvme0n1_critical_warning_available_spare": 0,
+ "device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0,
+ "device_nvme0n1_critical_warning_persistent_memory_read_only": 0,
+ "device_nvme0n1_critical_warning_read_only": 0,
+ "device_nvme0n1_critical_warning_temp_threshold": 0,
+ "device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0,
+ "device_nvme0n1_data_units_read": 5068041216000,
+ "device_nvme0n1_data_units_written": 69712734208000,
+ "device_nvme0n1_host_read_commands": 313528805,
+ "device_nvme0n1_host_write_commands": 1928062610,
+ "device_nvme0n1_media_errors": 0,
+ "device_nvme0n1_num_err_log_entries": 110,
+ "device_nvme0n1_percentage_used": 2,
+ "device_nvme0n1_power_cycles": 64,
+ "device_nvme0n1_power_on_time": 17906400,
+ "device_nvme0n1_temperature": 36,
+ "device_nvme0n1_thm_temp1_total_time": 0,
+ "device_nvme0n1_thm_temp1_trans_count": 0,
+ "device_nvme0n1_thm_temp2_total_time": 0,
+ "device_nvme0n1_thm_temp2_trans_count": 0,
+ "device_nvme0n1_unsafe_shutdowns": 39,
+ "device_nvme0n1_warning_temp_time": 0,
+ "device_nvme1n1_available_spare": 100,
+ "device_nvme1n1_controller_busy_time": 497040,
+ "device_nvme1n1_critical_comp_time": 0,
+ "device_nvme1n1_critical_warning_available_spare": 0,
+ "device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0,
+ "device_nvme1n1_critical_warning_persistent_memory_read_only": 0,
+ "device_nvme1n1_critical_warning_read_only": 0,
+ "device_nvme1n1_critical_warning_temp_threshold": 0,
+ "device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0,
+ "device_nvme1n1_data_units_read": 5068041216000,
+ "device_nvme1n1_data_units_written": 69712734208000,
+ "device_nvme1n1_host_read_commands": 313528805,
+ "device_nvme1n1_host_write_commands": 1928062610,
+ "device_nvme1n1_media_errors": 0,
+ "device_nvme1n1_num_err_log_entries": 110,
+ "device_nvme1n1_percentage_used": 2,
+ "device_nvme1n1_power_cycles": 64,
+ "device_nvme1n1_power_on_time": 17906400,
+ "device_nvme1n1_temperature": 36,
+ "device_nvme1n1_thm_temp1_total_time": 0,
+ "device_nvme1n1_thm_temp1_trans_count": 0,
+ "device_nvme1n1_thm_temp2_total_time": 0,
+ "device_nvme1n1_thm_temp2_trans_count": 0,
+ "device_nvme1n1_unsafe_shutdowns": 39,
+ "device_nvme1n1_warning_temp_time": 0,
+ }
+
+ assert.Equal(t, expected, mx)
+ },
+ },
+ },
+ "fail if 'nvme list' returns an empty list": {
+ {
+ prepare: prepareCaseEmptyList,
+ check: func(t *testing.T, n *NVMe) {
+ mx := n.Collect()
+
+ assert.Equal(t, (map[string]int64)(nil), mx)
+ },
+ },
+ },
+ "fail if 'nvme list' returns an error": {
+ {
+ prepare: prepareCaseErrOnList,
+ check: func(t *testing.T, n *NVMe) {
+ mx := n.Collect()
+
+ assert.Equal(t, (map[string]int64)(nil), mx)
+ },
+ },
+ },
+ "fail if 'nvme smart-log' returns an error": {
+ {
+ prepare: prepareCaseErrOnSmartLog,
+ check: func(t *testing.T, n *NVMe) {
+ mx := n.Collect()
+
+ assert.Equal(t, (map[string]int64)(nil), mx)
+ },
+ },
+ },
+ }
+
+ for name, test := range tests {
+ t.Run(name, func(t *testing.T) {
+ n := New()
+
+ for i, step := range test {
+ t.Run(fmt.Sprintf("step[%d]", i), func(t *testing.T) {
+ step.prepare(n)
+ step.check(t, n)
+ })
+ }
+ })
+ }
+}
+
+func prepareCaseOK(n *NVMe) {
+ n.exec = &mockNVMeCLIExec{}
+}
+
+func prepareCaseStringValuesOK(n *NVMe) {
+ n.exec = &mockNVMeCLIExec{smartLogString: true}
+}
+
+func prepareCaseFloatValuesOK(n *NVMe) {
+ n.exec = &mockNVMeCLIExec{smartLogFloat: true}
+}
+
+func prepareCaseEmptyList(n *NVMe) {
+ n.exec = &mockNVMeCLIExec{emptyList: true}
+}
+
+func prepareCaseErrOnList(n *NVMe) {
+ n.exec = &mockNVMeCLIExec{errOnList: true}
+}
+
+func prepareCaseErrOnSmartLog(n *NVMe) {
+ n.exec = &mockNVMeCLIExec{errOnSmartLog: true}
+}
+
+type mockNVMeCLIExec struct {
+ errOnList bool
+ errOnSmartLog bool
+ emptyList bool
+ smartLogString bool
+ smartLogFloat bool
+}
+
+func (m *mockNVMeCLIExec) list() (*nvmeDeviceList, error) {
+ if m.errOnList {
+ return nil, errors.New("mock.list() error")
+ }
+
+ data := dataNVMeListJSON
+ if m.emptyList {
+ data = dataNVMeListEmptyJSON
+ }
+
+ var v nvmeDeviceList
+ if err := json.Unmarshal(data, &v); err != nil {
+ return nil, err
+ }
+
+ return &v, nil
+}
+
+func (m *mockNVMeCLIExec) smartLog(_ string) (*nvmeDeviceSmartLog, error) {
+ if m.errOnSmartLog {
+ return nil, errors.New("mock.smartLog() error")
+ }
+ if m.emptyList {
+ return nil, errors.New("mock.smartLog() no devices error")
+ }
+
+ data := dataNVMeSmartLogJSON
+ if m.smartLogString {
+ data = dataNVMeSmartLogStringJSON
+ }
+ if m.smartLogFloat {
+ data = dataNVMeSmartLogFloatJSON
+ }
+
+ var v nvmeDeviceSmartLog
+ if err := json.Unmarshal(data, &v); err != nil {
+ return nil, err
+ }
+
+ return &v, nil
+}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.json
new file mode 100644
index 000000000..291ecee3d
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.json
@@ -0,0 +1,4 @@
+{
+ "update_every": 123,
+ "timeout": 123.123
+}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.yaml b/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.yaml
new file mode 100644
index 000000000..25b0b4c78
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.yaml
@@ -0,0 +1,2 @@
+update_every: 123
+timeout: 123.123
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list-empty.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list-empty.json
new file mode 100644
index 000000000..e8da2407f
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list-empty.json
@@ -0,0 +1,4 @@
+{
+ "Devices": [
+ ]
+}
\ No newline at end of file
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list.json
new file mode 100644
index 000000000..6bf159c4f
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list.json
@@ -0,0 +1,30 @@
+{
+ "Devices": [
+ {
+ "NameSpace": 1,
+ "DevicePath": "/dev/nvme0n1",
+ "Firmware": "SU6SM001",
+ "Index": 0,
+ "ModelNumber": "Seagate FireCuda 530 ZP4000GM30023",
+ "ProductName": "Non-Volatile memory controller: Seagate Technology PLC Device 0x5018",
+ "SerialNumber": "7VS00KNX",
+ "UsedBytes": 4000787030016,
+ "MaximumLBA": 7814037168,
+ "PhysicalSize": 4000787030016,
+ "SectorSize": 512
+ },
+ {
+ "NameSpace": 1,
+ "DevicePath": "/dev/nvme1n1",
+ "Firmware": "SU6SM001",
+ "Index": 1,
+ "ModelNumber": "Seagate FireCuda 530 ZP4000GM30023",
+ "ProductName": "Non-Volatile memory controller: Seagate Technology PLC Device 0x5018",
+ "SerialNumber": "7VS00J76",
+ "UsedBytes": 4000787030016,
+ "MaximumLBA": 7814037168,
+ "PhysicalSize": 4000787030016,
+ "SectorSize": 512
+ }
+ ]
+}
\ No newline at end of file
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-float.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-float.json
new file mode 100644
index 000000000..f63dd9772
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-float.json
@@ -0,0 +1,24 @@
+{
+ "critical_warning": 0,
+ "temperature": 310.0,
+ "avail_spare": 100.0,
+ "spare_thresh": 5.0,
+ "percent_used": 2.0,
+ "endurance_grp_critical_warning_summary": 0,
+ "data_units_read": 9898518.0,
+ "data_units_written": 136157684.0,
+ "host_read_commands": 313528805.0,
+ "host_write_commands": 1928062610.0,
+ "controller_busy_time": 8284.0,
+ "power_cycles": 64.0,
+ "power_on_hours": 4974.0,
+ "unsafe_shutdowns": 39.0,
+ "media_errors": 0,
+ "num_err_log_entries": 110.0,
+ "warning_temp_time": 0,
+ "critical_comp_time": 0,
+ "thm_temp1_trans_count": 0,
+ "thm_temp2_trans_count": 0,
+ "thm_temp1_total_time": 0,
+ "thm_temp2_total_time": 0
+}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-string.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-string.json
new file mode 100644
index 000000000..f582e7485
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-string.json
@@ -0,0 +1,24 @@
+{
+ "critical_warning": "0",
+ "temperature": "310",
+ "avail_spare": "100",
+ "spare_thresh": "5",
+ "percent_used": "2",
+ "endurance_grp_critical_warning_summary": "0",
+ "data_units_read": "9898518",
+ "data_units_written": "136157684",
+ "host_read_commands": "313528805",
+ "host_write_commands": "1928062610",
+ "controller_busy_time": "8284",
+ "power_cycles": "64",
+ "power_on_hours": "4974",
+ "unsafe_shutdowns": "39",
+ "media_errors": "0",
+ "num_err_log_entries": "110",
+ "warning_temp_time": "0",
+ "critical_comp_time": "0",
+ "thm_temp1_trans_count": "0",
+ "thm_temp2_trans_count": "0",
+ "thm_temp1_total_time": "0",
+ "thm_temp2_total_time": "0"
+}
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log.json
new file mode 100644
index 000000000..cbd0e4c7d
--- /dev/null
+++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log.json
@@ -0,0 +1,24 @@
+{
+ "critical_warning": 0,
+ "temperature": 310,
+ "avail_spare": 100,
+ "spare_thresh": 5,
+ "percent_used": 2,
+ "endurance_grp_critical_warning_summary": 0,
+ "data_units_read": 9898518,
+ "data_units_written": 136157684,
+ "host_read_commands": 313528805,
+ "host_write_commands": 1928062610,
+ "controller_busy_time": 8284,
+ "power_cycles": 64,
+ "power_on_hours": 4974,
+ "unsafe_shutdowns": 39,
+ "media_errors": 0,
+ "num_err_log_entries": 110,
+ "warning_temp_time": 0,
+ "critical_comp_time": 0,
+ "thm_temp1_trans_count": 0,
+ "thm_temp2_trans_count": 0,
+ "thm_temp1_total_time": 0,
+ "thm_temp2_total_time": 0
+}