diff options
Diffstat (limited to '')
17 files changed, 1627 insertions, 0 deletions
diff --git a/src/go/collectors/go.d.plugin/modules/nvme/README.md b/src/go/collectors/go.d.plugin/modules/nvme/README.md new file mode 120000 index 000000000..ca657b905 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/README.md @@ -0,0 +1 @@ +integrations/nvme_devices.md
\ No newline at end of file diff --git a/src/go/collectors/go.d.plugin/modules/nvme/charts.go b/src/go/collectors/go.d.plugin/modules/nvme/charts.go new file mode 100644 index 000000000..8404d2dcc --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/charts.go @@ -0,0 +1,267 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package nvme + +import ( + "fmt" + "strings" + + "github.com/netdata/netdata/go/go.d.plugin/agent/module" +) + +const ( + _ = 2050 + iota // right after Disks section + prioDeviceEstimatedEndurancePerc + prioDeviceAvailableSparePerc + prioDeviceCompositeTemperature + prioDeviceIOTransferredCount + prioDevicePowerCyclesCount + prioDevicePowerOnTime + prioDeviceUnsafeShutdownsCount + prioDeviceCriticalWarningsState + prioDeviceMediaErrorsRate + prioDeviceErrorLogEntriesRate + prioDeviceWarningCompositeTemperatureTime + prioDeviceCriticalCompositeTemperatureTime + prioDeviceThmTemp1TransitionsCount + prioDeviceThmTemp2TransitionsRate + prioDeviceThmTemp1Time + prioDeviceThmTemp2Time +) + +var deviceChartsTmpl = module.Charts{ + deviceEstimatedEndurancePercChartTmpl.Copy(), + deviceAvailableSparePercChartTmpl.Copy(), + deviceCompositeTemperatureChartTmpl.Copy(), + deviceIOTransferredCountChartTmpl.Copy(), + devicePowerCyclesCountChartTmpl.Copy(), + devicePowerOnTimeChartTmpl.Copy(), + deviceUnsafeShutdownsCountChartTmpl.Copy(), + deviceCriticalWarningsStateChartTmpl.Copy(), + deviceMediaErrorsRateChartTmpl.Copy(), + deviceErrorLogEntriesRateChartTmpl.Copy(), + deviceWarnCompositeTemperatureTimeChartTmpl.Copy(), + deviceCritCompositeTemperatureTimeChartTmpl.Copy(), + deviceThmTemp1TransitionsRateChartTmpl.Copy(), + deviceThmTemp2TransitionsRateChartTmpl.Copy(), + deviceThmTemp1TimeChartTmpl.Copy(), + deviceThmTemp2TimeChartTmpl.Copy(), +} + +var deviceEstimatedEndurancePercChartTmpl = module.Chart{ + ID: "device_%s_estimated_endurance_perc", + Title: "Estimated endurance", + Units: "percentage", + Fam: "endurance", + Ctx: 
"nvme.device_estimated_endurance_perc", + Priority: prioDeviceEstimatedEndurancePerc, + Dims: module.Dims{ + {ID: "device_%s_percentage_used", Name: "used"}, + }, +} +var deviceAvailableSparePercChartTmpl = module.Chart{ + ID: "device_%s_available_spare_perc", + Title: "Remaining spare capacity", + Units: "percentage", + Fam: "spare", + Ctx: "nvme.device_available_spare_perc", + Priority: prioDeviceAvailableSparePerc, + Dims: module.Dims{ + {ID: "device_%s_available_spare", Name: "spare"}, + }, +} +var deviceCompositeTemperatureChartTmpl = module.Chart{ + ID: "device_%s_temperature", + Title: "Composite temperature", + Units: "celsius", + Fam: "temperature", + Ctx: "nvme.device_composite_temperature", + Priority: prioDeviceCompositeTemperature, + Dims: module.Dims{ + {ID: "device_%s_temperature", Name: "temperature"}, + }, +} +var deviceIOTransferredCountChartTmpl = module.Chart{ + ID: "device_%s_io_transferred_count", + Title: "Amount of data transferred to and from device", + Units: "bytes", + Fam: "transferred data", + Ctx: "nvme.device_io_transferred_count", + Priority: prioDeviceIOTransferredCount, + Type: module.Area, + Dims: module.Dims{ + {ID: "device_%s_data_units_read", Name: "read"}, + {ID: "device_%s_data_units_written", Name: "written", Mul: -1}, + }, +} + +var devicePowerCyclesCountChartTmpl = module.Chart{ + ID: "device_%s_power_cycles_count", + Title: "Power cycles", + Units: "cycles", + Fam: "power cycles", + Ctx: "nvme.device_power_cycles_count", + Priority: prioDevicePowerCyclesCount, + Dims: module.Dims{ + {ID: "device_%s_power_cycles", Name: "power"}, + }, +} +var devicePowerOnTimeChartTmpl = module.Chart{ + ID: "device_%s_power_on_time", + Title: "Power-on time", + Units: "seconds", + Fam: "power-on time", + Ctx: "nvme.device_power_on_time", + Priority: prioDevicePowerOnTime, + Dims: module.Dims{ + {ID: "device_%s_power_on_time", Name: "power-on"}, + }, +} +var deviceCriticalWarningsStateChartTmpl = module.Chart{ + ID: 
"device_%s_critical_warnings_state", + Title: "Critical warnings state", + Units: "state", + Fam: "critical warnings", + Ctx: "nvme.device_critical_warnings_state", + Priority: prioDeviceCriticalWarningsState, + Dims: module.Dims{ + {ID: "device_%s_critical_warning_available_spare", Name: "available_spare"}, + {ID: "device_%s_critical_warning_temp_threshold", Name: "temp_threshold"}, + {ID: "device_%s_critical_warning_nvm_subsystem_reliability", Name: "nvm_subsystem_reliability"}, + {ID: "device_%s_critical_warning_read_only", Name: "read_only"}, + {ID: "device_%s_critical_warning_volatile_mem_backup_failed", Name: "volatile_mem_backup_failed"}, + {ID: "device_%s_critical_warning_persistent_memory_read_only", Name: "persistent_memory_read_only"}, + }, +} +var deviceUnsafeShutdownsCountChartTmpl = module.Chart{ + ID: "device_%s_unsafe_shutdowns_count", + Title: "Unsafe shutdowns", + Units: "shutdowns", + Fam: "shutdowns", + Ctx: "nvme.device_unsafe_shutdowns_count", + Priority: prioDeviceUnsafeShutdownsCount, + Dims: module.Dims{ + {ID: "device_%s_unsafe_shutdowns", Name: "unsafe"}, + }, +} +var deviceMediaErrorsRateChartTmpl = module.Chart{ + ID: "device_%s_media_errors_rate", + Title: "Media and data integrity errors", + Units: "errors/s", + Fam: "media errors", + Ctx: "nvme.device_media_errors_rate", + Priority: prioDeviceMediaErrorsRate, + Dims: module.Dims{ + {ID: "device_%s_media_errors", Name: "media", Algo: module.Incremental}, + }, +} +var deviceErrorLogEntriesRateChartTmpl = module.Chart{ + ID: "device_%s_error_log_entries_rate", + Title: "Error log entries", + Units: "entries/s", + Fam: "error log", + Ctx: "nvme.device_error_log_entries_rate", + Priority: prioDeviceErrorLogEntriesRate, + Dims: module.Dims{ + {ID: "device_%s_num_err_log_entries", Name: "error_log", Algo: module.Incremental}, + }, +} +var deviceWarnCompositeTemperatureTimeChartTmpl = module.Chart{ + ID: "device_%s_warning_composite_temperature_time", + Title: "Warning composite temperature 
time", + Units: "seconds", + Fam: "warn temp time", + Ctx: "nvme.device_warning_composite_temperature_time", + Priority: prioDeviceWarningCompositeTemperatureTime, + Dims: module.Dims{ + {ID: "device_%s_warning_temp_time", Name: "wctemp"}, + }, +} +var deviceCritCompositeTemperatureTimeChartTmpl = module.Chart{ + ID: "device_%s_critical_composite_temperature_time", + Title: "Critical composite temperature time", + Units: "seconds", + Fam: "crit temp time", + Ctx: "nvme.device_critical_composite_temperature_time", + Priority: prioDeviceCriticalCompositeTemperatureTime, + Dims: module.Dims{ + {ID: "device_%s_critical_comp_time", Name: "cctemp"}, + }, +} +var ( + deviceThmTemp1TransitionsRateChartTmpl = module.Chart{ + ID: "device_%s_thm_temp1_transitions_rate", + Title: "Thermal management temp1 transitions", + Units: "transitions/s", + Fam: "thermal mgmt transitions", + Ctx: "nvme.device_thermal_mgmt_temp1_transitions_rate", + Priority: prioDeviceThmTemp1TransitionsCount, + Dims: module.Dims{ + {ID: "device_%s_thm_temp1_trans_count", Name: "temp1", Algo: module.Incremental}, + }, + } + deviceThmTemp2TransitionsRateChartTmpl = module.Chart{ + ID: "device_%s_thm_temp2_transitions_rate", + Title: "Thermal management temp2 transitions", + Units: "transitions/s", + Fam: "thermal mgmt transitions", + Ctx: "nvme.device_thermal_mgmt_temp2_transitions_rate", + Priority: prioDeviceThmTemp2TransitionsRate, + Dims: module.Dims{ + {ID: "device_%s_thm_temp2_trans_count", Name: "temp2", Algo: module.Incremental}, + }, + } +) +var ( + deviceThmTemp1TimeChartTmpl = module.Chart{ + ID: "device_%s_thm_temp1_time", + Title: "Thermal management temp1 time", + Units: "seconds", + Fam: "thermal mgmt time", + Ctx: "nvme.device_thermal_mgmt_temp1_time", + Priority: prioDeviceThmTemp1Time, + Dims: module.Dims{ + {ID: "device_%s_thm_temp1_total_time", Name: "temp1"}, + }, + } + deviceThmTemp2TimeChartTmpl = module.Chart{ + ID: "device_%s_thm_temp2_time", + Title: "Thermal management temp2 
time", + Units: "seconds", + Fam: "thermal mgmt time", + Ctx: "nvme.device_thermal_mgmt_temp2_time", + Priority: prioDeviceThmTemp2Time, + Dims: module.Dims{ + {ID: "device_%s_thm_temp2_total_time", Name: "temp2"}, + }, + } +) + +func (n *NVMe) addDeviceCharts(device string) { + charts := deviceChartsTmpl.Copy() + + for _, chart := range *charts { + chart.ID = fmt.Sprintf(chart.ID, device) + chart.Labels = []module.Label{ + {Key: "device", Value: device}, + } + for _, dim := range chart.Dims { + dim.ID = fmt.Sprintf(dim.ID, device) + } + } + + if err := n.Charts().Add(*charts...); err != nil { + n.Warning(err) + } +} + +func (n *NVMe) removeDeviceCharts(device string) { + // The trailing underscore keeps e.g. "nvme1" from also matching the charts of "nvme10". + px := fmt.Sprintf("device_%s_", device) + + for _, chart := range *n.Charts() { + if strings.HasPrefix(chart.ID, px) { + chart.MarkRemove() + chart.MarkNotCreated() + } + } +} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/collect.go b/src/go/collectors/go.d.plugin/modules/nvme/collect.go new file mode 100644 index 000000000..1cc942395 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/collect.go @@ -0,0 +1,120 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package nvme + +import ( + "errors" + "fmt" + "path/filepath" + "strconv" + "time" +) + +func (n *NVMe) collect() (map[string]int64, error) { + if n.exec == nil { + return nil, errors.New("nvme-cli is not initialized (nil)") + } + + now := time.Now() + if n.forceListDevices || now.Sub(n.listDevicesTime) > n.listDevicesEvery { + n.forceListDevices = false + n.listDevicesTime = now + if err := n.listNVMeDevices(); err != nil { + return nil, err + } + } + + mx := make(map[string]int64) + + for path := range n.devicePaths { + if err := n.collectNVMeDevice(mx, path); err != nil { + n.Error(err) + n.forceListDevices = true + continue + } + } + + return mx, nil +} + +func (n *NVMe) collectNVMeDevice(mx map[string]int64, devicePath string) error { + stats, err := n.exec.smartLog(devicePath) + if err != nil { + return 
fmt.Errorf("exec nvme smart-log for '%s': %v", devicePath, err) + } + + device := extractDeviceFromPath(devicePath) + + mx["device_"+device+"_temperature"] = int64(float64(parseValue(stats.Temperature)) - 273.15) // Kelvin => Celsius + mx["device_"+device+"_percentage_used"] = parseValue(stats.PercentUsed) + mx["device_"+device+"_available_spare"] = parseValue(stats.AvailSpare) + mx["device_"+device+"_data_units_read"] = parseValue(stats.DataUnitsRead) * 1000 * 512 // units => bytes + mx["device_"+device+"_data_units_written"] = parseValue(stats.DataUnitsWritten) * 1000 * 512 // units => bytes + mx["device_"+device+"_host_read_commands"] = parseValue(stats.HostReadCommands) + mx["device_"+device+"_host_write_commands"] = parseValue(stats.HostWriteCommands) + mx["device_"+device+"_power_cycles"] = parseValue(stats.PowerCycles) + mx["device_"+device+"_power_on_time"] = parseValue(stats.PowerOnHours) * 3600 // hours => seconds + mx["device_"+device+"_unsafe_shutdowns"] = parseValue(stats.UnsafeShutdowns) + mx["device_"+device+"_media_errors"] = parseValue(stats.MediaErrors) + mx["device_"+device+"_num_err_log_entries"] = parseValue(stats.NumErrLogEntries) + mx["device_"+device+"_controller_busy_time"] = parseValue(stats.ControllerBusyTime) * 60 // minutes => seconds + mx["device_"+device+"_warning_temp_time"] = parseValue(stats.WarningTempTime) * 60 // minutes => seconds + mx["device_"+device+"_critical_comp_time"] = parseValue(stats.CriticalCompTime) * 60 // minutes => seconds + mx["device_"+device+"_thm_temp1_trans_count"] = parseValue(stats.ThmTemp1TransCount) + mx["device_"+device+"_thm_temp2_trans_count"] = parseValue(stats.ThmTemp2TransCount) + mx["device_"+device+"_thm_temp1_total_time"] = parseValue(stats.ThmTemp1TotalTime) // seconds + mx["device_"+device+"_thm_temp2_total_time"] = parseValue(stats.ThmTemp2TotalTime) // seconds + + mx["device_"+device+"_critical_warning_available_spare"] = boolToInt(parseValue(stats.CriticalWarning)&1 != 0) + 
mx["device_"+device+"_critical_warning_temp_threshold"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<1) != 0) + mx["device_"+device+"_critical_warning_nvm_subsystem_reliability"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<2) != 0) + mx["device_"+device+"_critical_warning_read_only"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<3) != 0) + mx["device_"+device+"_critical_warning_volatile_mem_backup_failed"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<4) != 0) + mx["device_"+device+"_critical_warning_persistent_memory_read_only"] = boolToInt(parseValue(stats.CriticalWarning)&(1<<5) != 0) + + return nil +} + +func (n *NVMe) listNVMeDevices() error { + devices, err := n.exec.list() + if err != nil { + return fmt.Errorf("exec nvme list: %v", err) + } + + seen := make(map[string]bool) + for _, v := range devices.Devices { + device := extractDeviceFromPath(v.DevicePath) + seen[device] = true + + if !n.devicePaths[v.DevicePath] { + n.devicePaths[v.DevicePath] = true + n.addDeviceCharts(device) + } + } + for path := range n.devicePaths { + device := extractDeviceFromPath(path) + if !seen[device] { + // devicePaths is keyed by the full device path, not by the short device name. + delete(n.devicePaths, path) + n.removeDeviceCharts(device) + } + } + + return nil +} + +func extractDeviceFromPath(devicePath string) string { + _, name := filepath.Split(devicePath) + return name +} + +func boolToInt(v bool) int64 { + if v { + return 1 + } + return 0 +} + +func parseValue(s nvmeNumber) int64 { + v, _ := strconv.ParseFloat(string(s), 64) + return int64(v) +} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/config_schema.json b/src/go/collectors/go.d.plugin/modules/nvme/config_schema.json new file mode 100644 index 000000000..179a24ab1 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/config_schema.json @@ -0,0 +1,36 @@ +{ + "jsonSchema": { + "$schema": "http://json-schema.org/draft-07/schema#", + "title": "NVMe collector configuration", + "type": "object", + "properties": { + "update_every": { + "title": "Update every", 
+ "description": "Data collection interval, measured in seconds.", + "type": "integer", + "minimum": 1, + "default": 10 + }, + "timeout": { + "title": "Timeout", + "description": "Timeout for executing the `nvme`, specified in seconds.", + "type": "number", + "minimum": 0.5, + "default": 2 + } + }, + "required": [], + "additionalProperties": false, + "patternProperties": { + "^name$": {} + } + }, + "uiSchema": { + "uiOptions": { + "fullPage": true + }, + "timeout": { + "ui:help": "Accepts decimals for precise control (e.g., type 1.5 for 1.5 seconds)." + } + } +} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/exec.go b/src/go/collectors/go.d.plugin/modules/nvme/exec.go new file mode 100644 index 000000000..8c1281a2f --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/exec.go @@ -0,0 +1,94 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package nvme + +import ( + "bytes" + "context" + "encoding/json" + "os/exec" + "time" +) + +type nvmeDeviceList struct { + Devices []struct { + DevicePath string `json:"DevicePath"` + UsedBytes nvmeNumber `json:"UsedBytes"` + PhysicalSize nvmeNumber `json:"PhysicalSize"` + SectorSize nvmeNumber `json:"SectorSize"` + } +} + +// See "Health Information Log Page" in the Current Specification Version +// https://nvmexpress.org/developers/nvme-specification/ +type nvmeDeviceSmartLog struct { + CriticalWarning nvmeNumber `json:"critical_warning"` + Temperature nvmeNumber `json:"temperature"` + AvailSpare nvmeNumber `json:"avail_spare"` + SpareThresh nvmeNumber `json:"spare_thresh"` + PercentUsed nvmeNumber `json:"percent_used"` + DataUnitsRead nvmeNumber `json:"data_units_read"` + DataUnitsWritten nvmeNumber `json:"data_units_written"` + HostReadCommands nvmeNumber `json:"host_read_commands"` + HostWriteCommands nvmeNumber `json:"host_write_commands"` + ControllerBusyTime nvmeNumber `json:"controller_busy_time"` + PowerCycles nvmeNumber `json:"power_cycles"` + PowerOnHours nvmeNumber `json:"power_on_hours"` + 
UnsafeShutdowns nvmeNumber `json:"unsafe_shutdowns"` + MediaErrors nvmeNumber `json:"media_errors"` + NumErrLogEntries nvmeNumber `json:"num_err_log_entries"` + WarningTempTime nvmeNumber `json:"warning_temp_time"` + CriticalCompTime nvmeNumber `json:"critical_comp_time"` + ThmTemp1TransCount nvmeNumber `json:"thm_temp1_trans_count"` + ThmTemp2TransCount nvmeNumber `json:"thm_temp2_trans_count"` + ThmTemp1TotalTime nvmeNumber `json:"thm_temp1_total_time"` + ThmTemp2TotalTime nvmeNumber `json:"thm_temp2_total_time"` +} + +// nvme-cli 2.1.1 exposes some values as strings +type nvmeNumber string + +func (n *nvmeNumber) UnmarshalJSON(b []byte) error { + *n = nvmeNumber(bytes.Trim(b, "\"")) + return nil +} + +type nvmeCLIExec struct { + ndsudoPath string + timeout time.Duration +} + +func (n *nvmeCLIExec) list() (*nvmeDeviceList, error) { + bs, err := n.execute("nvme-list") + if err != nil { + return nil, err + } + + var v nvmeDeviceList + if err := json.Unmarshal(bs, &v); err != nil { + return nil, err + } + + return &v, nil +} + +func (n *nvmeCLIExec) smartLog(devicePath string) (*nvmeDeviceSmartLog, error) { + bs, err := n.execute("nvme-smart-log", "--device", devicePath) + if err != nil { + return nil, err + } + + var v nvmeDeviceSmartLog + if err := json.Unmarshal(bs, &v); err != nil { + return nil, err + } + + return &v, nil +} + +func (n *nvmeCLIExec) execute(arg ...string) ([]byte, error) { + ctx, cancel := context.WithTimeout(context.Background(), n.timeout) + defer cancel() + + return exec.CommandContext(ctx, n.ndsudoPath, arg...).Output() +} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/init.go b/src/go/collectors/go.d.plugin/modules/nvme/init.go new file mode 100644 index 000000000..51f1400a0 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/init.go @@ -0,0 +1,26 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package nvme + +import ( + "fmt" + "os" + "path/filepath" + + "github.com/netdata/netdata/go/go.d.plugin/agent/executable" 
+) + +func (n *NVMe) initNVMeCLIExec() (nvmeCLI, error) { + ndsudoPath := filepath.Join(executable.Directory, "ndsudo") + + if _, err := os.Stat(ndsudoPath); err != nil { + return nil, fmt.Errorf("ndsudo executable not found: %v", err) + } + + nvmeExec := &nvmeCLIExec{ + ndsudoPath: ndsudoPath, + timeout: n.Timeout.Duration(), + } + + return nvmeExec, nil +} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/integrations/nvme_devices.md b/src/go/collectors/go.d.plugin/modules/nvme/integrations/nvme_devices.md new file mode 100644 index 000000000..fd18c1fd2 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/integrations/nvme_devices.md @@ -0,0 +1,207 @@ +<!--startmeta +custom_edit_url: "https://github.com/netdata/netdata/edit/master/src/go/collectors/go.d.plugin/modules/nvme/README.md" +meta_yaml: "https://github.com/netdata/netdata/edit/master/src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml" +sidebar_label: "NVMe devices" +learn_status: "Published" +learn_rel_path: "Collecting Metrics/Storage, Mount Points and Filesystems" +most_popular: False +message: "DO NOT EDIT THIS FILE DIRECTLY, IT IS GENERATED BY THE COLLECTOR'S metadata.yaml FILE" +endmeta--> + +# NVMe devices + + +<img src="https://netdata.cloud/img/nvme.svg" width="150"/> + + +Plugin: go.d.plugin +Module: nvme + +<img src="https://img.shields.io/badge/maintained%20by-Netdata-%2300ab44" /> + +## Overview + +This collector monitors the health of NVMe devices. It relies on the [`nvme`](https://github.com/linux-nvme/nvme-cli#nvme-cli) CLI tool but avoids directly executing the binary. Instead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment. This approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management. + + + + +This collector is supported on all platforms. 
+ +This collector supports collecting metrics from multiple instances of this integration, including remote instances. + + +### Default Behavior + +#### Auto-Detection + +This integration doesn't support auto-detection. + +#### Limits + +The default configuration for this integration does not impose any limits on data collection. + +#### Performance Impact + +The default configuration for this integration is not expected to impose a significant performance impact on the system. + + +## Metrics + +Metrics grouped by *scope*. + +The scope defines the instance that the metric belongs to. An instance is uniquely identified by a set of labels. + + + +### Per device + +These metrics refer to the NVME device. + +Labels: + +| Label | Description | +|:-----------|:----------------| +| device | NVMe device name | + +Metrics: + +| Metric | Dimensions | Unit | +|:------|:----------|:----| +| nvme.device_estimated_endurance_perc | used | % | +| nvme.device_available_spare_perc | spare | % | +| nvme.device_composite_temperature | temperature | celsius | +| nvme.device_io_transferred_count | read, written | bytes | +| nvme.device_power_cycles_count | power | cycles | +| nvme.device_power_on_time | power-on | seconds | +| nvme.device_critical_warnings_state | available_spare, temp_threshold, nvm_subsystem_reliability, read_only, volatile_mem_backup_failed, persistent_memory_read_only | state | +| nvme.device_unsafe_shutdowns_count | unsafe | shutdowns | +| nvme.device_media_errors_rate | media | errors/s | +| nvme.device_error_log_entries_rate | error_log | entries/s | +| nvme.device_warning_composite_temperature_time | wctemp | seconds | +| nvme.device_critical_composite_temperature_time | cctemp | seconds | +| nvme.device_thermal_mgmt_temp1_transitions_rate | temp1 | transitions/s | +| nvme.device_thermal_mgmt_temp2_transitions_rate | temp2 | transitions/s | +| nvme.device_thermal_mgmt_temp1_time | temp1 | seconds | +| nvme.device_thermal_mgmt_temp2_time | temp2 | seconds | + + 
+ +## Alerts + + +The following alerts are available: + +| Alert name | On metric | Description | +|:------------|:----------|:------------| +| [ nvme_device_critical_warnings_state ](https://github.com/netdata/netdata/blob/master/src/health/health.d/nvme.conf) | nvme.device_critical_warnings_state | NVMe device ${label:device} has critical warnings | + + +## Setup + +### Prerequisites + +#### Install nvme-cli + +See [Distro Support](https://github.com/linux-nvme/nvme-cli#distro-support). Install `nvme-cli` using your distribution's package manager. + + +#### For Netdata running in a Docker container: grant NVMe device access + +Your NVMe devices need to be accessible within the Docker container for Netdata to monitor them. + +Include the following option in your `docker run` command or add the device mapping in your `docker-compose.yml` file: + +- `docker run` + + ```bash + --device '/dev/nvme0n1:/dev/nvme0n1' + ``` + +- `docker-compose.yml` + + ```yaml + services: + netdata: + devices: + - "/dev/nvme0n1:/dev/nvme0n1" + ``` + +**Note**: Replace `/dev/nvme0n1` with your actual NVMe device name. + + + +### Configuration + +#### File + +The configuration file name for this integration is `go.d/nvme.conf`. + + +You can edit the configuration file using the `edit-config` script from the +Netdata [config directory](/docs/netdata-agent/configuration/README.md#the-netdata-config-directory). + +```bash +cd /etc/netdata 2>/dev/null || cd /opt/netdata/etc/netdata +sudo ./edit-config go.d/nvme.conf +``` +#### Options + +The following options can be defined globally: update_every, autodetection_retry. + + +<details open><summary>Config options</summary> + +| Name | Description | Default | Required | +|:----|:-----------|:-------|:--------:| +| update_every | Data collection frequency. | 10 | no | +| autodetection_retry | Recheck interval in seconds. Zero means no recheck will be scheduled. | 0 | no | +| timeout | nvme binary execution timeout. 
| 2 | no | + +</details> + +#### Examples + +##### Custom update_every + +Allows you to override the default data collection interval. + +<details open><summary>Config</summary> + +```yaml +jobs: + - name: nvme + update_every: 5 # Collect NVMe metrics every 5 seconds + +``` +</details> + + + +## Troubleshooting + +### Debug Mode + +To troubleshoot issues with the `nvme` collector, run the `go.d.plugin` with the debug option enabled. The output +should give you clues as to why the collector isn't working. + +- Navigate to the `plugins.d` directory, usually at `/usr/libexec/netdata/plugins.d/`. If that's not the case on + your system, open `netdata.conf` and look for the `plugins` setting under `[directories]`. + + ```bash + cd /usr/libexec/netdata/plugins.d/ + ``` + +- Switch to the `netdata` user. + + ```bash + sudo -u netdata -s + ``` + +- Run the `go.d.plugin` to debug the collector: + + ```bash + ./go.d.plugin -d -m nvme + ``` + + diff --git a/src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml b/src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml new file mode 100644 index 000000000..98f35af65 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/metadata.yaml @@ -0,0 +1,225 @@ +plugin_name: go.d.plugin +modules: + - meta: + id: collector-go.d.plugin-nvme + plugin_name: go.d.plugin + module_name: nvme + monitored_instance: + name: NVMe devices + link: "" + icon_filename: nvme.svg + categories: + - data-collection.storage-mount-points-and-filesystems + keywords: + - nvme + related_resources: + integrations: + list: [] + info_provided_to_referring_integrations: + description: "" + most_popular: false + overview: + data_collection: + metrics_description: > + This collector monitors the health of NVMe devices. + It relies on the [`nvme`](https://github.com/linux-nvme/nvme-cli#nvme-cli) CLI tool but avoids directly executing the binary. 
+ Instead, it utilizes `ndsudo`, a Netdata helper specifically designed to run privileged commands securely within the Netdata environment. + This approach eliminates the need to use `sudo`, improving security and potentially simplifying permission management. + method_description: "" + supported_platforms: + include: [] + exclude: [] + multi_instance: true + additional_permissions: + description: "" + default_behavior: + auto_detection: + description: "" + limits: + description: "" + performance_impact: + description: "" + setup: + prerequisites: + list: + - title: Install nvme-cli + description: | + See [Distro Support](https://github.com/linux-nvme/nvme-cli#distro-support). Install `nvme-cli` using your distribution's package manager. + - title: "For Netdata running in a Docker container: grant NVMe device access" + description: | + Your NVMe devices need to be accessible within the Docker container for Netdata to monitor them. + + Include the following option in your `docker run` command or add the device mapping in your `docker-compose.yml` file: + + - `docker run` + + ```bash + --device '/dev/nvme0n1:/dev/nvme0n1' + ``` + + - `docker-compose.yml` + + ```yaml + services: + netdata: + devices: + - "/dev/nvme0n1:/dev/nvme0n1" + ``` + + **Note**: Replace `/dev/nvme0n1` with your actual NVMe device name. + configuration: + file: + name: go.d/nvme.conf + options: + description: | + The following options can be defined globally: update_every, autodetection_retry. + folding: + title: Config options + enabled: true + list: + - name: update_every + description: Data collection frequency. + default_value: 10 + required: false + - name: autodetection_retry + description: Recheck interval in seconds. Zero means no recheck will be scheduled. + default_value: 0 + required: false + - name: timeout + description: nvme binary execution timeout. 
+ default_value: 2 + required: false + examples: + folding: + title: Config + enabled: true + list: + - name: Custom update_every + description: Allows you to override the default data collection interval. + config: | + jobs: + - name: nvme + update_every: 5 # Collect NVMe metrics every 5 seconds + troubleshooting: + problems: + list: [] + alerts: + - name: nvme_device_critical_warnings_state + metric: nvme.device_critical_warnings_state + info: "NVMe device ${label:device} has critical warnings" + link: https://github.com/netdata/netdata/blob/master/src/health/health.d/nvme.conf + metrics: + folding: + title: Metrics + enabled: false + description: "" + availability: [] + scopes: + - name: device + description: These metrics refer to the NVME device. + labels: + - name: device + description: NVMe device name + metrics: + - name: nvme.device_estimated_endurance_perc + description: Estimated endurance + unit: '%' + chart_type: line + dimensions: + - name: used + - name: nvme.device_available_spare_perc + description: Remaining spare capacity + unit: '%' + chart_type: line + dimensions: + - name: spare + - name: nvme.device_composite_temperature + description: Composite temperature + unit: celsius + chart_type: line + dimensions: + - name: temperature + - name: nvme.device_io_transferred_count + description: Amount of data transferred to and from device + unit: bytes + chart_type: area + dimensions: + - name: read + - name: written + - name: nvme.device_power_cycles_count + description: Power cycles + unit: cycles + chart_type: line + dimensions: + - name: power + - name: nvme.device_power_on_time + description: Power-on time + unit: seconds + chart_type: line + dimensions: + - name: power-on + - name: nvme.device_critical_warnings_state + description: Critical warnings state + unit: state + chart_type: line + dimensions: + - name: available_spare + - name: temp_threshold + - name: nvm_subsystem_reliability + - name: read_only + - name: volatile_mem_backup_failed + - 
name: persistent_memory_read_only + - name: nvme.device_unsafe_shutdowns_count + description: Unsafe shutdowns + unit: shutdowns + chart_type: line + dimensions: + - name: unsafe + - name: nvme.device_media_errors_rate + description: Media and data integrity errors + unit: errors/s + chart_type: line + dimensions: + - name: media + - name: nvme.device_error_log_entries_rate + description: Error log entries + unit: entries/s + chart_type: line + dimensions: + - name: error_log + - name: nvme.device_warning_composite_temperature_time + description: Warning composite temperature time + unit: seconds + chart_type: line + dimensions: + - name: wctemp + - name: nvme.device_critical_composite_temperature_time + description: Critical composite temperature time + unit: seconds + chart_type: line + dimensions: + - name: cctemp + - name: nvme.device_thermal_mgmt_temp1_transitions_rate + description: Thermal management temp1 transitions + unit: transitions/s + chart_type: line + dimensions: + - name: temp1 + - name: nvme.device_thermal_mgmt_temp2_transitions_rate + description: Thermal management temp2 transitions + unit: transitions/s + chart_type: line + dimensions: + - name: temp2 + - name: nvme.device_thermal_mgmt_temp1_time + description: Thermal management temp1 time + unit: seconds + chart_type: line + dimensions: + - name: temp1 + - name: nvme.device_thermal_mgmt_temp2_time + description: Thermal management temp2 time + unit: seconds + chart_type: line + dimensions: + - name: temp2 diff --git a/src/go/collectors/go.d.plugin/modules/nvme/nvme.go b/src/go/collectors/go.d.plugin/modules/nvme/nvme.go new file mode 100644 index 000000000..76b6445b3 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/nvme.go @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-3.0-or-later + +package nvme + +import ( + _ "embed" + "errors" + "time" + + "github.com/netdata/netdata/go/go.d.plugin/agent/module" + "github.com/netdata/netdata/go/go.d.plugin/pkg/web" +) + +//go:embed 
"config_schema.json" +var configSchema string + +func init() { + module.Register("nvme", module.Creator{ + JobConfigSchema: configSchema, + Defaults: module.Defaults{ + UpdateEvery: 10, + }, + Create: func() module.Module { return New() }, + Config: func() any { return &Config{} }, + }) +} + +func New() *NVMe { + return &NVMe{ + Config: Config{ + Timeout: web.Duration(time.Second * 2), + }, + + charts: &module.Charts{}, + devicePaths: make(map[string]bool), + listDevicesEvery: time.Minute * 10, + } + +} + +type Config struct { + UpdateEvery int `yaml:"update_every,omitempty" json:"update_every"` + Timeout web.Duration `yaml:"timeout,omitempty" json:"timeout"` +} + +type ( + NVMe struct { + module.Base + Config `yaml:",inline" json:""` + + charts *module.Charts + + exec nvmeCLI + + devicePaths map[string]bool + listDevicesTime time.Time + listDevicesEvery time.Duration + forceListDevices bool + } + nvmeCLI interface { + list() (*nvmeDeviceList, error) + smartLog(devicePath string) (*nvmeDeviceSmartLog, error) + } +) + +func (n *NVMe) Configuration() any { + return n.Config +} + +func (n *NVMe) Init() error { + nvmeExec, err := n.initNVMeCLIExec() + if err != nil { + n.Errorf("init nvme-cli exec: %v", err) + return err + } + n.exec = nvmeExec + + return nil +} + +func (n *NVMe) Check() error { + mx, err := n.collect() + if err != nil { + n.Error(err) + return err + } + if len(mx) == 0 { + return errors.New("no metrics collected") + } + return nil +} + +func (n *NVMe) Charts() *module.Charts { + return n.charts +} + +func (n *NVMe) Collect() map[string]int64 { + mx, err := n.collect() + if err != nil { + n.Error(err) + } + + if len(mx) == 0 { + return nil + } + return mx +} + +func (n *NVMe) Cleanup() {} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/nvme_test.go b/src/go/collectors/go.d.plugin/modules/nvme/nvme_test.go new file mode 100644 index 000000000..ab814442d --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/nvme_test.go @@ -0,0 +1,430 @@ 
+// SPDX-License-Identifier: GPL-3.0-or-later + +package nvme + +import ( + "encoding/json" + "errors" + "fmt" + "os" + "testing" + + "github.com/netdata/netdata/go/go.d.plugin/agent/module" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +var ( + dataConfigJSON, _ = os.ReadFile("testdata/config.json") + dataConfigYAML, _ = os.ReadFile("testdata/config.yaml") + + dataNVMeListJSON, _ = os.ReadFile("testdata/nvme-list.json") + dataNVMeListEmptyJSON, _ = os.ReadFile("testdata/nvme-list-empty.json") + dataNVMeSmartLogJSON, _ = os.ReadFile("testdata/nvme-smart-log.json") + dataNVMeSmartLogStringJSON, _ = os.ReadFile("testdata/nvme-smart-log-string.json") + dataNVMeSmartLogFloatJSON, _ = os.ReadFile("testdata/nvme-smart-log-float.json") +) + +func Test_testDataIsValid(t *testing.T) { + for name, data := range map[string][]byte{ + "dataConfigJSON": dataConfigJSON, + "dataConfigYAML": dataConfigYAML, + "dataNVMeListJSON": dataNVMeListJSON, + "dataNVMeListEmptyJSON": dataNVMeListEmptyJSON, + "dataNVMeSmartLogStringJSON": dataNVMeSmartLogStringJSON, + "dataNVMeSmartLogFloatJSON": dataNVMeSmartLogFloatJSON, + } { + require.NotNil(t, data, name) + } +} + +func TestNVMe_ConfigurationSerialize(t *testing.T) { + module.TestConfigurationSerialize(t, &NVMe{}, dataConfigJSON, dataConfigYAML) +} + +func TestNVMe_Init(t *testing.T) { + tests := map[string]struct { + config Config + wantFail bool + }{ + "fails if 'ndsudo' not found": { + wantFail: true, + config: New().Config, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + nv := New() + + if test.wantFail { + assert.Error(t, nv.Init()) + } else { + assert.NoError(t, nv.Init()) + } + }) + } +} + +func TestNVMe_Charts(t *testing.T) { + assert.NotNil(t, New().Charts()) +} + +func TestNVMe_Cleanup(t *testing.T) { + assert.NotPanics(t, New().Cleanup) +} + +func TestNVMe_Check(t *testing.T) { + tests := map[string]struct { + wantFail bool + prepare func(n *NVMe) + }{ + 
"success if all calls successful": { + wantFail: false, + prepare: prepareCaseOK, + }, + "fails if 'nvme list' returns an empty list": { + wantFail: true, + prepare: prepareCaseEmptyList, + }, + "fails if 'nvme list' returns an error": { + wantFail: true, + prepare: prepareCaseErrOnList, + }, + "fails if 'nvme smart-log' returns an error": { + wantFail: true, + prepare: prepareCaseErrOnSmartLog, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + n := New() + + test.prepare(n) + + if test.wantFail { + assert.Error(t, n.Check()) + } else { + assert.NoError(t, n.Check()) + } + }) + } +} + +func TestNVMe_Collect(t *testing.T) { + type testCaseStep struct { + prepare func(n *NVMe) + check func(t *testing.T, n *NVMe) + } + + tests := map[string][]testCaseStep{ + "success if all calls successful": { + { + prepare: prepareCaseOK, + check: func(t *testing.T, n *NVMe) { + mx := n.Collect() + + expected := map[string]int64{ + "device_nvme0n1_available_spare": 100, + "device_nvme0n1_controller_busy_time": 497040, + "device_nvme0n1_critical_comp_time": 0, + "device_nvme0n1_critical_warning_available_spare": 0, + "device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0, + "device_nvme0n1_critical_warning_persistent_memory_read_only": 0, + "device_nvme0n1_critical_warning_read_only": 0, + "device_nvme0n1_critical_warning_temp_threshold": 0, + "device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0, + "device_nvme0n1_data_units_read": 5068041216000, + "device_nvme0n1_data_units_written": 69712734208000, + "device_nvme0n1_host_read_commands": 313528805, + "device_nvme0n1_host_write_commands": 1928062610, + "device_nvme0n1_media_errors": 0, + "device_nvme0n1_num_err_log_entries": 110, + "device_nvme0n1_percentage_used": 2, + "device_nvme0n1_power_cycles": 64, + "device_nvme0n1_power_on_time": 17906400, + "device_nvme0n1_temperature": 36, + "device_nvme0n1_thm_temp1_total_time": 0, + "device_nvme0n1_thm_temp1_trans_count": 0, + 
"device_nvme0n1_thm_temp2_total_time": 0, + "device_nvme0n1_thm_temp2_trans_count": 0, + "device_nvme0n1_unsafe_shutdowns": 39, + "device_nvme0n1_warning_temp_time": 0, + "device_nvme1n1_available_spare": 100, + "device_nvme1n1_controller_busy_time": 497040, + "device_nvme1n1_critical_comp_time": 0, + "device_nvme1n1_critical_warning_available_spare": 0, + "device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0, + "device_nvme1n1_critical_warning_persistent_memory_read_only": 0, + "device_nvme1n1_critical_warning_read_only": 0, + "device_nvme1n1_critical_warning_temp_threshold": 0, + "device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0, + "device_nvme1n1_data_units_read": 5068041216000, + "device_nvme1n1_data_units_written": 69712734208000, + "device_nvme1n1_host_read_commands": 313528805, + "device_nvme1n1_host_write_commands": 1928062610, + "device_nvme1n1_media_errors": 0, + "device_nvme1n1_num_err_log_entries": 110, + "device_nvme1n1_percentage_used": 2, + "device_nvme1n1_power_cycles": 64, + "device_nvme1n1_power_on_time": 17906400, + "device_nvme1n1_temperature": 36, + "device_nvme1n1_thm_temp1_total_time": 0, + "device_nvme1n1_thm_temp1_trans_count": 0, + "device_nvme1n1_thm_temp2_total_time": 0, + "device_nvme1n1_thm_temp2_trans_count": 0, + "device_nvme1n1_unsafe_shutdowns": 39, + "device_nvme1n1_warning_temp_time": 0, + } + + assert.Equal(t, expected, mx) + }, + }, + }, + "success if all calls successful with string values": { + { + prepare: prepareCaseStringValuesOK, + check: func(t *testing.T, n *NVMe) { + mx := n.Collect() + + expected := map[string]int64{ + "device_nvme0n1_available_spare": 100, + "device_nvme0n1_controller_busy_time": 497040, + "device_nvme0n1_critical_comp_time": 0, + "device_nvme0n1_critical_warning_available_spare": 0, + "device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0, + "device_nvme0n1_critical_warning_persistent_memory_read_only": 0, + "device_nvme0n1_critical_warning_read_only": 0, + 
"device_nvme0n1_critical_warning_temp_threshold": 0, + "device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0, + "device_nvme0n1_data_units_read": 5068041216000, + "device_nvme0n1_data_units_written": 69712734208000, + "device_nvme0n1_host_read_commands": 313528805, + "device_nvme0n1_host_write_commands": 1928062610, + "device_nvme0n1_media_errors": 0, + "device_nvme0n1_num_err_log_entries": 110, + "device_nvme0n1_percentage_used": 2, + "device_nvme0n1_power_cycles": 64, + "device_nvme0n1_power_on_time": 17906400, + "device_nvme0n1_temperature": 36, + "device_nvme0n1_thm_temp1_total_time": 0, + "device_nvme0n1_thm_temp1_trans_count": 0, + "device_nvme0n1_thm_temp2_total_time": 0, + "device_nvme0n1_thm_temp2_trans_count": 0, + "device_nvme0n1_unsafe_shutdowns": 39, + "device_nvme0n1_warning_temp_time": 0, + "device_nvme1n1_available_spare": 100, + "device_nvme1n1_controller_busy_time": 497040, + "device_nvme1n1_critical_comp_time": 0, + "device_nvme1n1_critical_warning_available_spare": 0, + "device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0, + "device_nvme1n1_critical_warning_persistent_memory_read_only": 0, + "device_nvme1n1_critical_warning_read_only": 0, + "device_nvme1n1_critical_warning_temp_threshold": 0, + "device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0, + "device_nvme1n1_data_units_read": 5068041216000, + "device_nvme1n1_data_units_written": 69712734208000, + "device_nvme1n1_host_read_commands": 313528805, + "device_nvme1n1_host_write_commands": 1928062610, + "device_nvme1n1_media_errors": 0, + "device_nvme1n1_num_err_log_entries": 110, + "device_nvme1n1_percentage_used": 2, + "device_nvme1n1_power_cycles": 64, + "device_nvme1n1_power_on_time": 17906400, + "device_nvme1n1_temperature": 36, + "device_nvme1n1_thm_temp1_total_time": 0, + "device_nvme1n1_thm_temp1_trans_count": 0, + "device_nvme1n1_thm_temp2_total_time": 0, + "device_nvme1n1_thm_temp2_trans_count": 0, + "device_nvme1n1_unsafe_shutdowns": 39, + 
"device_nvme1n1_warning_temp_time": 0, + } + + assert.Equal(t, expected, mx) + }, + }, + }, + "success if all calls successful with float values": { + { + prepare: prepareCaseFloatValuesOK, + check: func(t *testing.T, n *NVMe) { + mx := n.Collect() + + expected := map[string]int64{ + "device_nvme0n1_available_spare": 100, + "device_nvme0n1_controller_busy_time": 497040, + "device_nvme0n1_critical_comp_time": 0, + "device_nvme0n1_critical_warning_available_spare": 0, + "device_nvme0n1_critical_warning_nvm_subsystem_reliability": 0, + "device_nvme0n1_critical_warning_persistent_memory_read_only": 0, + "device_nvme0n1_critical_warning_read_only": 0, + "device_nvme0n1_critical_warning_temp_threshold": 0, + "device_nvme0n1_critical_warning_volatile_mem_backup_failed": 0, + "device_nvme0n1_data_units_read": 5068041216000, + "device_nvme0n1_data_units_written": 69712734208000, + "device_nvme0n1_host_read_commands": 313528805, + "device_nvme0n1_host_write_commands": 1928062610, + "device_nvme0n1_media_errors": 0, + "device_nvme0n1_num_err_log_entries": 110, + "device_nvme0n1_percentage_used": 2, + "device_nvme0n1_power_cycles": 64, + "device_nvme0n1_power_on_time": 17906400, + "device_nvme0n1_temperature": 36, + "device_nvme0n1_thm_temp1_total_time": 0, + "device_nvme0n1_thm_temp1_trans_count": 0, + "device_nvme0n1_thm_temp2_total_time": 0, + "device_nvme0n1_thm_temp2_trans_count": 0, + "device_nvme0n1_unsafe_shutdowns": 39, + "device_nvme0n1_warning_temp_time": 0, + "device_nvme1n1_available_spare": 100, + "device_nvme1n1_controller_busy_time": 497040, + "device_nvme1n1_critical_comp_time": 0, + "device_nvme1n1_critical_warning_available_spare": 0, + "device_nvme1n1_critical_warning_nvm_subsystem_reliability": 0, + "device_nvme1n1_critical_warning_persistent_memory_read_only": 0, + "device_nvme1n1_critical_warning_read_only": 0, + "device_nvme1n1_critical_warning_temp_threshold": 0, + "device_nvme1n1_critical_warning_volatile_mem_backup_failed": 0, + 
"device_nvme1n1_data_units_read": 5068041216000, + "device_nvme1n1_data_units_written": 69712734208000, + "device_nvme1n1_host_read_commands": 313528805, + "device_nvme1n1_host_write_commands": 1928062610, + "device_nvme1n1_media_errors": 0, + "device_nvme1n1_num_err_log_entries": 110, + "device_nvme1n1_percentage_used": 2, + "device_nvme1n1_power_cycles": 64, + "device_nvme1n1_power_on_time": 17906400, + "device_nvme1n1_temperature": 36, + "device_nvme1n1_thm_temp1_total_time": 0, + "device_nvme1n1_thm_temp1_trans_count": 0, + "device_nvme1n1_thm_temp2_total_time": 0, + "device_nvme1n1_thm_temp2_trans_count": 0, + "device_nvme1n1_unsafe_shutdowns": 39, + "device_nvme1n1_warning_temp_time": 0, + } + + assert.Equal(t, expected, mx) + }, + }, + }, + "fail if 'nvme list' returns an empty list": { + { + prepare: prepareCaseEmptyList, + check: func(t *testing.T, n *NVMe) { + mx := n.Collect() + + assert.Equal(t, (map[string]int64)(nil), mx) + }, + }, + }, + "fail if 'nvme list' returns an error": { + { + prepare: prepareCaseErrOnList, + check: func(t *testing.T, n *NVMe) { + mx := n.Collect() + + assert.Equal(t, (map[string]int64)(nil), mx) + }, + }, + }, + "fail if 'nvme smart-log' returns an error": { + { + prepare: prepareCaseErrOnSmartLog, + check: func(t *testing.T, n *NVMe) { + mx := n.Collect() + + assert.Equal(t, (map[string]int64)(nil), mx) + }, + }, + }, + } + + for name, test := range tests { + t.Run(name, func(t *testing.T) { + n := New() + + for i, step := range test { + t.Run(fmt.Sprintf("step[%d]", i), func(t *testing.T) { + step.prepare(n) + step.check(t, n) + }) + } + }) + } +} + +func prepareCaseOK(n *NVMe) { + n.exec = &mockNVMeCLIExec{} +} + +func prepareCaseStringValuesOK(n *NVMe) { + n.exec = &mockNVMeCLIExec{smartLogString: true} +} + +func prepareCaseFloatValuesOK(n *NVMe) { + n.exec = &mockNVMeCLIExec{smartLogFloat: true} +} + +func prepareCaseEmptyList(n *NVMe) { + n.exec = &mockNVMeCLIExec{emptyList: true} +} + +func prepareCaseErrOnList(n 
*NVMe) { + n.exec = &mockNVMeCLIExec{errOnList: true} +} + +func prepareCaseErrOnSmartLog(n *NVMe) { + n.exec = &mockNVMeCLIExec{errOnSmartLog: true} +} + +type mockNVMeCLIExec struct { + errOnList bool + errOnSmartLog bool + emptyList bool + smartLogString bool + smartLogFloat bool +} + +func (m *mockNVMeCLIExec) list() (*nvmeDeviceList, error) { + if m.errOnList { + return nil, errors.New("mock.list() error") + } + + data := dataNVMeListJSON + if m.emptyList { + data = dataNVMeListEmptyJSON + } + + var v nvmeDeviceList + if err := json.Unmarshal(data, &v); err != nil { + return nil, err + } + + return &v, nil +} + +func (m *mockNVMeCLIExec) smartLog(_ string) (*nvmeDeviceSmartLog, error) { + if m.errOnSmartLog { + return nil, errors.New("mock.smartLog() error") + } + if m.emptyList { + return nil, errors.New("mock.smartLog() no devices error") + } + + data := dataNVMeSmartLogJSON + if m.smartLogString { + data = dataNVMeSmartLogStringJSON + } + if m.smartLogFloat { + data = dataNVMeSmartLogFloatJSON + } + + var v nvmeDeviceSmartLog + if err := json.Unmarshal(data, &v); err != nil { + return nil, err + } + + return &v, nil +} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.json new file mode 100644 index 000000000..291ecee3d --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.json @@ -0,0 +1,4 @@ +{ + "update_every": 123, + "timeout": 123.123 +} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.yaml b/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.yaml new file mode 100644 index 000000000..25b0b4c78 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/config.yaml @@ -0,0 +1,2 @@ +update_every: 123 +timeout: 123.123 diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list-empty.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list-empty.json new file mode 100644 
index 000000000..e8da2407f --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list-empty.json @@ -0,0 +1,4 @@ +{ + "Devices": [ + ] +}
\ No newline at end of file diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list.json new file mode 100644 index 000000000..6bf159c4f --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-list.json @@ -0,0 +1,30 @@ +{ + "Devices": [ + { + "NameSpace": 1, + "DevicePath": "/dev/nvme0n1", + "Firmware": "SU6SM001", + "Index": 0, + "ModelNumber": "Seagate FireCuda 530 ZP4000GM30023", + "ProductName": "Non-Volatile memory controller: Seagate Technology PLC Device 0x5018", + "SerialNumber": "7VS00KNX", + "UsedBytes": 4000787030016, + "MaximumLBA": 7814037168, + "PhysicalSize": 4000787030016, + "SectorSize": 512 + }, + { + "NameSpace": 1, + "DevicePath": "/dev/nvme1n1", + "Firmware": "SU6SM001", + "Index": 1, + "ModelNumber": "Seagate FireCuda 530 ZP4000GM30023", + "ProductName": "Non-Volatile memory controller: Seagate Technology PLC Device 0x5018", + "SerialNumber": "7VS00J76", + "UsedBytes": 4000787030016, + "MaximumLBA": 7814037168, + "PhysicalSize": 4000787030016, + "SectorSize": 512 + } + ] +}
\ No newline at end of file diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-float.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-float.json new file mode 100644 index 000000000..f63dd9772 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-float.json @@ -0,0 +1,24 @@ +{ + "critical_warning": 0, + "temperature": 310.0, + "avail_spare": 100.0, + "spare_thresh": 5.0, + "percent_used": 2.0, + "endurance_grp_critical_warning_summary": 0, + "data_units_read": 9898518.0, + "data_units_written": 136157684.0, + "host_read_commands": 313528805.0, + "host_write_commands": 1928062610.0, + "controller_busy_time": 8284.0, + "power_cycles": 64.0, + "power_on_hours": 4974.0, + "unsafe_shutdowns": 39.0, + "media_errors": 0, + "num_err_log_entries": 110.0, + "warning_temp_time": 0, + "critical_comp_time": 0, + "thm_temp1_trans_count": 0, + "thm_temp2_trans_count": 0, + "thm_temp1_total_time": 0, + "thm_temp2_total_time": 0 +} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-string.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-string.json new file mode 100644 index 000000000..f582e7485 --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log-string.json @@ -0,0 +1,24 @@ +{ + "critical_warning": "0", + "temperature": "310", + "avail_spare": "100", + "spare_thresh": "5", + "percent_used": "2", + "endurance_grp_critical_warning_summary": "0", + "data_units_read": "9898518", + "data_units_written": "136157684", + "host_read_commands": "313528805", + "host_write_commands": "1928062610", + "controller_busy_time": "8284", + "power_cycles": "64", + "power_on_hours": "4974", + "unsafe_shutdowns": "39", + "media_errors": "0", + "num_err_log_entries": "110", + "warning_temp_time": "0", + "critical_comp_time": "0", + "thm_temp1_trans_count": "0", + "thm_temp2_trans_count": "0", + "thm_temp1_total_time": "0", + 
"thm_temp2_total_time": "0" +} diff --git a/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log.json b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log.json new file mode 100644 index 000000000..cbd0e4c7d --- /dev/null +++ b/src/go/collectors/go.d.plugin/modules/nvme/testdata/nvme-smart-log.json @@ -0,0 +1,24 @@ +{ + "critical_warning": 0, + "temperature": 310, + "avail_spare": 100, + "spare_thresh": 5, + "percent_used": 2, + "endurance_grp_critical_warning_summary": 0, + "data_units_read": 9898518, + "data_units_written": 136157684, + "host_read_commands": 313528805, + "host_write_commands": 1928062610, + "controller_busy_time": 8284, + "power_cycles": 64, + "power_on_hours": 4974, + "unsafe_shutdowns": 39, + "media_errors": 0, + "num_err_log_entries": 110, + "warning_temp_time": 0, + "critical_comp_time": 0, + "thm_temp1_trans_count": 0, + "thm_temp2_trans_count": 0, + "thm_temp1_total_time": 0, + "thm_temp2_total_time": 0 +} |