Diffstat:
 collectors/python.d.plugin/smartd_log/Makefile.inc        |  13
 collectors/python.d.plugin/smartd_log/README.md           | 125
 collectors/python.d.plugin/smartd_log/smartd_log.chart.py | 772
 collectors/python.d.plugin/smartd_log/smartd_log.conf     |  75
 4 files changed, 985 insertions(+), 0 deletions(-)
diff --git a/collectors/python.d.plugin/smartd_log/Makefile.inc b/collectors/python.d.plugin/smartd_log/Makefile.inc new file mode 100644 index 0000000..dc1d0f3 --- /dev/null +++ b/collectors/python.d.plugin/smartd_log/Makefile.inc @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_python_DATA += smartd_log/smartd_log.chart.py +dist_pythonconfig_DATA += smartd_log/smartd_log.conf + +# do not install these files, but include them in the distribution +dist_noinst_DATA += smartd_log/README.md smartd_log/Makefile.inc + diff --git a/collectors/python.d.plugin/smartd_log/README.md b/collectors/python.d.plugin/smartd_log/README.md new file mode 100644 index 0000000..eef34ce --- /dev/null +++ b/collectors/python.d.plugin/smartd_log/README.md @@ -0,0 +1,125 @@ +<!-- +title: "Storage devices monitoring with Netdata" +custom_edit_url: https://github.com/netdata/netdata/edit/master/collectors/python.d.plugin/smartd_log/README.md +sidebar_label: "S.M.A.R.T. attributes" +--> + +# Storage devices monitoring with Netdata + +Monitors `smartd` log files to collect HDD/SSD S.M.A.R.T attributes. + +## Requirements + +- `smartmontools` + +It produces following charts for SCSI devices: + +1. **Read Error Corrected** + +2. **Read Error Uncorrected** + +3. **Write Error Corrected** + +4. **Write Error Uncorrected** + +5. **Verify Error Corrected** + +6. **Verify Error Uncorrected** + +7. **Temperature** + +For ATA devices: + +1. **Read Error Rate** + +2. **Seek Error Rate** + +3. **Soft Read Error Rate** + +4. **Write Error Rate** + +5. **SATA Interface Downshift** + +6. **UDMA CRC Error Count** + +7. **Throughput Performance** + +8. **Seek Time Performance** + +9. **Start/Stop Count** + +10. **Power-On Hours Count** + +11. **Power Cycle Count** + +12. **Unexpected Power Loss** + +13. **Spin-Up Time** + +14. **Spin-up Retries** + +15. **Calibration Retries** + +16. **Temperature** + +17. **Reallocated Sectors Count** + +18. **Reserved Block Count** + +19. **Program Fail Count** + +20. **Erase Fail Count** + +21. **Wear Leveller Worst Case Erase Count** + +22. **Unused Reserved NAND Blocks** + +23. **Reallocation Event Count** + +24. **Current Pending Sector Count** + +25. **Offline Uncorrectable Sector Count** + +26. **Percent Lifetime Used** + +## prerequisite + +`smartd` must be running with `-A` option to write smartd attribute information to files. + +For this you need to set `smartd_opts` (or `SMARTD_ARGS`, check _smartd.service_ content) in `/etc/default/smartmontools`: + +``` +# dump smartd attrs info every 600 seconds +smartd_opts="-A /var/log/smartd/ -i 600" +``` + +You may need to create the smartd directory before smartd will write to it: + +```sh +mkdir -p /var/log/smartd +``` + +Otherwise, all the smartd `.csv` files may get written to `/var/lib/smartmontools` (default location). See also <https://linux.die.net/man/8/smartd> for more info on the `-A --attributelog=PREFIX` command. + +`smartd` appends logs at every run. It's strongly recommended to use `logrotate` for smartd files. + +## Configuration + +Edit the `python.d/smartd_log.conf` configuration file using `edit-config` from the Netdata [config +directory](/docs/configure/nodes.md), which is typically at `/etc/netdata`. 
+ +```bash +cd /etc/netdata # Replace this path with your Netdata config directory, if different +sudo ./edit-config python.d/smartd_log.conf +``` + +```yaml +local: + log_path : '/var/log/smartd/' +``` + +If no configuration is given, module will attempt to read log files in `/var/log/smartd/` directory. + +--- + + diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.chart.py b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py new file mode 100644 index 0000000..dc4e95d --- /dev/null +++ b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py @@ -0,0 +1,772 @@ +# -*- coding: utf-8 -*- +# Description: smart netdata python.d module +# Author: ilyam8, vorph1 +# SPDX-License-Identifier: GPL-3.0-or-later + +import os +import re +from copy import deepcopy +from time import time + +from bases.FrameworkServices.SimpleService import SimpleService +from bases.collection import read_last_line + +INCREMENTAL = 'incremental' +ABSOLUTE = 'absolute' + +ATA = 'ata' +SCSI = 'scsi' +CSV = '.csv' + +DEF_RESCAN_INTERVAL = 60 +DEF_AGE = 30 +DEF_PATH = '/var/log/smartd' + +ATTR1 = '1' +ATTR2 = '2' +ATTR3 = '3' +ATTR4 = '4' +ATTR5 = '5' +ATTR7 = '7' +ATTR8 = '8' +ATTR9 = '9' +ATTR10 = '10' +ATTR11 = '11' +ATTR12 = '12' +ATTR13 = '13' +ATTR170 = '170' +ATTR171 = '171' +ATTR172 = '172' +ATTR173 = '173' +ATTR174 = '174' +ATTR180 = '180' +ATTR183 = '183' +ATTR190 = '190' +ATTR194 = '194' +ATTR196 = '196' +ATTR197 = '197' +ATTR198 = '198' +ATTR199 = '199' +ATTR202 = '202' +ATTR206 = '206' +ATTR233 = '233' +ATTR249 = '249' +ATTR_READ_ERR_COR = 'read-total-err-corrected' +ATTR_READ_ERR_UNC = 'read-total-unc-errors' +ATTR_WRITE_ERR_COR = 'write-total-err-corrected' +ATTR_WRITE_ERR_UNC = 'write-total-unc-errors' +ATTR_VERIFY_ERR_COR = 'verify-total-err-corrected' +ATTR_VERIFY_ERR_UNC = 'verify-total-unc-errors' +ATTR_TEMPERATURE = 'temperature' + +RE_ATA = re.compile( + '(\d+);' # attribute + '(\d+);' # normalized value + '(\d+)', # raw value + re.X +) + +RE_SCSI = re.compile( + '([a-z-]+);' # attribute + '([0-9.]+)', # raw value + re.X +) + +ORDER = [ + # errors + 'read_error_rate', + 'seek_error_rate', + 'soft_read_error_rate', + 'write_error_rate', + 'read_total_err_corrected', + 'read_total_unc_errors', + 'write_total_err_corrected', + 'write_total_unc_errors', + 'verify_total_err_corrected', + 'verify_total_unc_errors', + # external failure + 'sata_interface_downshift', + 'udma_crc_error_count', + # performance + 'throughput_performance', + 'seek_time_performance', + # power + 'start_stop_count', + 'power_on_hours_count', + 'power_cycle_count', + 'unexpected_power_loss', + # spin + 'spin_up_time', + 'spin_up_retries', + 'calibration_retries', + # temperature + 'airflow_temperature_celsius', + 'temperature_celsius', + # wear + 'reallocated_sectors_count', + 'reserved_block_count', + 'program_fail_count', + 'erase_fail_count', + 'wear_leveller_worst_case_erase_count', + 'unused_reserved_nand_blocks', + 'reallocation_event_count', + 'current_pending_sector_count', + 'offline_uncorrectable_sector_count', + 'percent_lifetime_used', + 'media_wearout_indicator', +] + +CHARTS = { + 'read_error_rate': { + 'options': [None, 'Read Error Rate', 'value', 'errors', 'smartd_log.read_error_rate', 'line'], + 'lines': [], + 'attrs': [ATTR1], + 'algo': ABSOLUTE, + }, + 'seek_error_rate': { + 'options': [None, 'Seek Error Rate', 'value', 'errors', 'smartd_log.seek_error_rate', 'line'], + 'lines': [], + 'attrs': [ATTR7], + 'algo': ABSOLUTE, + }, + 'soft_read_error_rate': { + 'options': [None, 'Soft Read Error 
Rate', 'errors', 'errors', 'smartd_log.soft_read_error_rate', 'line'], + 'lines': [], + 'attrs': [ATTR13], + 'algo': INCREMENTAL, + }, + 'write_error_rate': { + 'options': [None, 'Write Error Rate', 'value', 'errors', 'smartd_log.write_error_rate', 'line'], + 'lines': [], + 'attrs': [ATTR206], + 'algo': ABSOLUTE, + }, + 'read_total_err_corrected': { + 'options': [None, 'Read Error Corrected', 'errors', 'errors', 'smartd_log.read_total_err_corrected', 'line'], + 'lines': [], + 'attrs': [ATTR_READ_ERR_COR], + 'algo': INCREMENTAL, + }, + 'read_total_unc_errors': { + 'options': [None, 'Read Error Uncorrected', 'errors', 'errors', 'smartd_log.read_total_unc_errors', 'line'], + 'lines': [], + 'attrs': [ATTR_READ_ERR_UNC], + 'algo': INCREMENTAL, + }, + 'write_total_err_corrected': { + 'options': [None, 'Write Error Corrected', 'errors', 'errors', 'smartd_log.write_total_err_corrected', 'line'], + 'lines': [], + 'attrs': [ATTR_WRITE_ERR_COR], + 'algo': INCREMENTAL, + }, + 'write_total_unc_errors': { + 'options': [None, 'Write Error Uncorrected', 'errors', 'errors', 'smartd_log.write_total_unc_errors', 'line'], + 'lines': [], + 'attrs': [ATTR_WRITE_ERR_UNC], + 'algo': INCREMENTAL, + }, + 'verify_total_err_corrected': { + 'options': [None, 'Verify Error Corrected', 'errors', 'errors', 'smartd_log.verify_total_err_corrected', + 'line'], + 'lines': [], + 'attrs': [ATTR_VERIFY_ERR_COR], + 'algo': INCREMENTAL, + }, + 'verify_total_unc_errors': { + 'options': [None, 'Verify Error Uncorrected', 'errors', 'errors', 'smartd_log.verify_total_unc_errors', 'line'], + 'lines': [], + 'attrs': [ATTR_VERIFY_ERR_UNC], + 'algo': INCREMENTAL, + }, + 'sata_interface_downshift': { + 'options': [None, 'SATA Interface Downshift', 'events', 'external failure', + 'smartd_log.sata_interface_downshift', 'line'], + 'lines': [], + 'attrs': [ATTR183], + 'algo': INCREMENTAL, + }, + 'udma_crc_error_count': { + 'options': [None, 'UDMA CRC Error Count', 'errors', 'external failure', 'smartd_log.udma_crc_error_count', + 'line'], + 'lines': [], + 'attrs': [ATTR199], + 'algo': INCREMENTAL, + }, + 'throughput_performance': { + 'options': [None, 'Throughput Performance', 'value', 'performance', 'smartd_log.throughput_performance', + 'line'], + 'lines': [], + 'attrs': [ATTR2], + 'algo': ABSOLUTE, + }, + 'seek_time_performance': { + 'options': [None, 'Seek Time Performance', 'value', 'performance', 'smartd_log.seek_time_performance', 'line'], + 'lines': [], + 'attrs': [ATTR8], + 'algo': ABSOLUTE, + }, + 'start_stop_count': { + 'options': [None, 'Start/Stop Count', 'events', 'power', 'smartd_log.start_stop_count', 'line'], + 'lines': [], + 'attrs': [ATTR4], + 'algo': ABSOLUTE, + }, + 'power_on_hours_count': { + 'options': [None, 'Power-On Hours Count', 'hours', 'power', 'smartd_log.power_on_hours_count', 'line'], + 'lines': [], + 'attrs': [ATTR9], + 'algo': ABSOLUTE, + }, + 'power_cycle_count': { + 'options': [None, 'Power Cycle Count', 'events', 'power', 'smartd_log.power_cycle_count', 'line'], + 'lines': [], + 'attrs': [ATTR12], + 'algo': ABSOLUTE, + }, + 'unexpected_power_loss': { + 'options': [None, 'Unexpected Power Loss', 'events', 'power', 'smartd_log.unexpected_power_loss', 'line'], + 'lines': [], + 'attrs': [ATTR174], + 'algo': ABSOLUTE, + }, + 'spin_up_time': { + 'options': [None, 'Spin-Up Time', 'ms', 'spin', 'smartd_log.spin_up_time', 'line'], + 'lines': [], + 'attrs': [ATTR3], + 'algo': ABSOLUTE, + }, + 'spin_up_retries': { + 'options': [None, 'Spin-up Retries', 'retries', 'spin', 'smartd_log.spin_up_retries', 'line'], + 
'lines': [], + 'attrs': [ATTR10], + 'algo': INCREMENTAL, + }, + 'calibration_retries': { + 'options': [None, 'Calibration Retries', 'retries', 'spin', 'smartd_log.calibration_retries', 'line'], + 'lines': [], + 'attrs': [ATTR11], + 'algo': INCREMENTAL, + }, + 'airflow_temperature_celsius': { + 'options': [None, 'Airflow Temperature Celsius', 'celsius', 'temperature', + 'smartd_log.airflow_temperature_celsius', 'line'], + 'lines': [], + 'attrs': [ATTR190], + 'algo': ABSOLUTE, + }, + 'temperature_celsius': { + 'options': [None, 'Temperature', 'celsius', 'temperature', 'smartd_log.temperature_celsius', 'line'], + 'lines': [], + 'attrs': [ATTR194, ATTR_TEMPERATURE], + 'algo': ABSOLUTE, + }, + 'reallocated_sectors_count': { + 'options': [None, 'Reallocated Sectors Count', 'sectors', 'wear', 'smartd_log.reallocated_sectors_count', + 'line'], + 'lines': [], + 'attrs': [ATTR5], + 'algo': ABSOLUTE, + }, + 'reserved_block_count': { + 'options': [None, 'Reserved Block Count', 'percentage', 'wear', 'smartd_log.reserved_block_count', 'line'], + 'lines': [], + 'attrs': [ATTR170], + 'algo': ABSOLUTE, + }, + 'program_fail_count': { + 'options': [None, 'Program Fail Count', 'errors', 'wear', 'smartd_log.program_fail_count', 'line'], + 'lines': [], + 'attrs': [ATTR171], + 'algo': INCREMENTAL, + }, + 'erase_fail_count': { + 'options': [None, 'Erase Fail Count', 'failures', 'wear', 'smartd_log.erase_fail_count', 'line'], + 'lines': [], + 'attrs': [ATTR172], + 'algo': INCREMENTAL, + }, + 'wear_leveller_worst_case_erase_count': { + 'options': [None, 'Wear Leveller Worst Case Erase Count', 'erases', 'wear', + 'smartd_log.wear_leveller_worst_case_erase_count', 'line'], + 'lines': [], + 'attrs': [ATTR173], + 'algo': ABSOLUTE, + }, + 'unused_reserved_nand_blocks': { + 'options': [None, 'Unused Reserved NAND Blocks', 'blocks', 'wear', 'smartd_log.unused_reserved_nand_blocks', + 'line'], + 'lines': [], + 'attrs': [ATTR180], + 'algo': ABSOLUTE, + }, + 'reallocation_event_count': { + 'options': [None, 'Reallocation Event Count', 'events', 'wear', 'smartd_log.reallocation_event_count', 'line'], + 'lines': [], + 'attrs': [ATTR196], + 'algo': INCREMENTAL, + }, + 'current_pending_sector_count': { + 'options': [None, 'Current Pending Sector Count', 'sectors', 'wear', 'smartd_log.current_pending_sector_count', + 'line'], + 'lines': [], + 'attrs': [ATTR197], + 'algo': ABSOLUTE, + }, + 'offline_uncorrectable_sector_count': { + 'options': [None, 'Offline Uncorrectable Sector Count', 'sectors', 'wear', + 'smartd_log.offline_uncorrectable_sector_count', 'line'], + 'lines': [], + 'attrs': [ATTR198], + 'algo': ABSOLUTE, + + }, + 'percent_lifetime_used': { + 'options': [None, 'Percent Lifetime Used', 'percentage', 'wear', 'smartd_log.percent_lifetime_used', 'line'], + 'lines': [], + 'attrs': [ATTR202], + 'algo': ABSOLUTE, + }, + 'media_wearout_indicator': { + 'options': [None, 'Media Wearout Indicator', 'percentage', 'wear', 'smartd_log.media_wearout_indicator', 'line'], + 'lines': [], + 'attrs': [ATTR233], + 'algo': ABSOLUTE, + }, + 'nand_writes_1gib': { + 'options': [None, 'NAND Writes', 'GiB', 'wear', 'smartd_log.nand_writes_1gib', 'line'], + 'lines': [], + 'attrs': [ATTR249], + 'algo': ABSOLUTE, + }, +} + +# NOTE: 'parse_temp' decodes ATA 194 raw value. Not heavily tested. 
Written by @Ferroin +# C code: +# https://github.com/smartmontools/smartmontools/blob/master/smartmontools/atacmds.cpp#L2051 +# +# Calling 'parse_temp' on the raw value will return a 4-tuple, containing +# * temperature +# * minimum +# * maximum +# * over-temperature count +# substituting None for values it can't decode. +# +# Example: +# >>> parse_temp(42952491042) +# >>> (34, 10, 43, None) +# +# +# def check_temp_word(i): +# if i <= 0x7F: +# return 0x11 +# elif i <= 0xFF: +# return 0x01 +# elif 0xFF80 <= i: +# return 0x10 +# return 0x00 +# +# +# def check_temp_range(t, b0, b1): +# if b0 > b1: +# t0, t1 = b1, b0 +# else: +# t0, t1 = b0, b1 +# +# if all([ +# -60 <= t0, +# t0 <= t, +# t <= t1, +# t1 <= 120, +# not (t0 == -1 and t1 <= 0) +# ]): +# return t0, t1 +# return None, None +# +# +# def parse_temp(raw): +# byte = list() +# word = list() +# for i in range(0, 6): +# byte.append(0xFF & (raw >> (i * 8))) +# for i in range(0, 3): +# word.append(0xFFFF & (raw >> (i * 16))) +# +# ctwd = check_temp_word(word[0]) +# +# if not word[2]: +# if ctwd and not word[1]: +# # byte[0] is temp, no other data +# return byte[0], None, None, None +# +# if ctwd and all(check_temp_range(byte[0], byte[2], byte[3])): +# # byte[0] is temp, byte[2] is max or min, byte[3] is min or max +# trange = check_temp_range(byte[0], byte[2], byte[3]) +# return byte[0], trange[0], trange[1], None +# +# if ctwd and all(check_temp_range(byte[0], byte[1], byte[2])): +# # byte[0] is temp, byte[1] is max or min, byte[2] is min or max +# trange = check_temp_range(byte[0], byte[1], byte[2]) +# return byte[0], trange[0], trange[1], None +# +# return None, None, None, None +# +# if ctwd: +# if all( +# [ +# ctwd & check_temp_word(word[1]) & check_temp_word(word[2]) != 0x00, +# all(check_temp_range(byte[0], byte[2], byte[4])), +# ] +# ): +# # byte[0] is temp, byte[2] is max or min, byte[4] is min or max +# trange = check_temp_range(byte[0], byte[2], byte[4]) +# return byte[0], trange[0], trange[1], None +# else: +# trange = check_temp_range(byte[0], byte[2], byte[3]) +# if word[2] < 0x7FFF and all(trange) and trange[1] >= 40: +# # byte[0] is temp, byte[2] is max or min, byte[3] is min or max, word[2] is overtemp count +# return byte[0], trange[0], trange[1], word[2] +# # no data +# return None, None, None, None + + +CHARTED_ATTRS = dict((attr, k) for k, v in CHARTS.items() for attr in v['attrs']) + + +class BaseAtaSmartAttribute: + def __init__(self, name, normalized_value, raw_value): + self.name = name + self.normalized_value = normalized_value + self.raw_value = raw_value + + def value(self): + raise NotImplementedError + + +class AtaRaw(BaseAtaSmartAttribute): + def value(self): + return self.raw_value + + +class AtaNormalized(BaseAtaSmartAttribute): + def value(self): + return self.normalized_value + + +class Ata3(BaseAtaSmartAttribute): + def value(self): + value = int(self.raw_value) + # https://github.com/netdata/netdata/issues/5919 + # + # 3;151;38684000679; + # 423 (Average 447) + # 38684000679 & 0xFFF -> 423 + # (38684000679 & 0xFFF0000) >> 16 -> 447 + if value > 1e6: + return value & 0xFFF + return value + + +class Ata9(BaseAtaSmartAttribute): + def value(self): + value = int(self.raw_value) + if value > 1e6: + return value & 0xFFFF + return value + + +class Ata190(BaseAtaSmartAttribute): + def value(self): + return 100 - int(self.normalized_value) + + +class Ata194(BaseAtaSmartAttribute): + # https://github.com/netdata/netdata/issues/3041 + # https://github.com/netdata/netdata/issues/5919 + # + # The low byte is the 
current temperature, the third lowest is the maximum, and the fifth lowest is the minimum + def value(self): + value = int(self.raw_value) + if value > 1e6: + return value & 0xFF + return min(int(self.normalized_value), int(self.raw_value)) + + +class BaseSCSISmartAttribute: + def __init__(self, name, raw_value): + self.name = name + self.raw_value = raw_value + + def value(self): + raise NotImplementedError + + +class SCSIRaw(BaseSCSISmartAttribute): + def value(self): + return self.raw_value + + +def ata_attribute_factory(value): + name = value[0] + + if name == ATTR3: + return Ata3(*value) + elif name == ATTR9: + return Ata9(*value) + elif name == ATTR190: + return Ata190(*value) + elif name == ATTR194: + return Ata194(*value) + elif name in [ + ATTR1, + ATTR7, + ATTR202, + ATTR206, + ATTR233, + ]: + return AtaNormalized(*value) + + return AtaRaw(*value) + + +def scsi_attribute_factory(value): + return SCSIRaw(*value) + + +def attribute_factory(value): + name = value[0] + if name.isdigit(): + return ata_attribute_factory(value) + return scsi_attribute_factory(value) + + +def handle_error(*errors): + def on_method(method): + def on_call(*args): + try: + return method(*args) + except errors: + return None + + return on_call + + return on_method + + +class DiskLogFile: + def __init__(self, full_path): + self.path = full_path + self.size = os.path.getsize(full_path) + + @handle_error(OSError) + def is_changed(self): + return self.size != os.path.getsize(self.path) + + @handle_error(OSError) + def is_active(self, current_time, limit): + return (current_time - os.path.getmtime(self.path)) / 60 < limit + + @handle_error(OSError) + def read(self): + self.size = os.path.getsize(self.path) + return read_last_line(self.path) + + +class BaseDisk: + def __init__(self, name, log_file): + self.raw_name = name + self.name = re.sub(r'_+', '_', name) + self.log_file = log_file + self.attrs = list() + self.alive = True + self.charted = False + + def __eq__(self, other): + if isinstance(other, BaseDisk): + return self.raw_name == other.raw_name + return self.raw_name == other + + def __ne__(self, other): + return not self == other + + def __hash__(self): + return hash(repr(self)) + + def parser(self, data): + raise NotImplementedError + + @handle_error(TypeError) + def populate_attrs(self): + self.attrs = list() + line = self.log_file.read() + for value in self.parser(line): + self.attrs.append(attribute_factory(value)) + + return len(self.attrs) + + def data(self): + data = dict() + for attr in self.attrs: + data['{0}_{1}'.format(self.name, attr.name)] = attr.value() + return data + + +class ATADisk(BaseDisk): + def parser(self, data): + return RE_ATA.findall(data) + + +class SCSIDisk(BaseDisk): + def parser(self, data): + return RE_SCSI.findall(data) + + +class Service(SimpleService): + def __init__(self, configuration=None, name=None): + SimpleService.__init__(self, configuration=configuration, name=name) + self.order = ORDER + self.definitions = deepcopy(CHARTS) + self.log_path = configuration.get('log_path', DEF_PATH) + self.age = configuration.get('age', DEF_AGE) + self.exclude = configuration.get('exclude_disks', str()).split() + self.disks = list() + self.runs = 0 + self.do_force_rescan = False + + def check(self): + return self.scan() > 0 + + def get_data(self): + self.runs += 1 + + if self.do_force_rescan or self.runs % DEF_RESCAN_INTERVAL == 0: + self.cleanup() + self.scan() + self.do_force_rescan = False + + data = dict() + + for disk in self.disks: + if not disk.alive: + continue + + if not 
disk.charted: + self.add_disk_to_charts(disk) + + changed = disk.log_file.is_changed() + + if changed is None: + disk.alive = False + self.do_force_rescan = True + continue + + if changed and disk.populate_attrs() is None: + disk.alive = False + self.do_force_rescan = True + continue + + data.update(disk.data()) + + return data + + def cleanup(self): + current_time = time() + for disk in self.disks[:]: + if any( + [ + not disk.alive, + not disk.log_file.is_active(current_time, self.age), + ] + ): + self.disks.remove(disk.raw_name) + self.remove_disk_from_charts(disk) + + def scan(self): + self.debug('scanning {0}'.format(self.log_path)) + current_time = time() + + for full_name in os.listdir(self.log_path): + disk = self.create_disk_from_file(full_name, current_time) + if not disk: + continue + self.disks.append(disk) + + return len(self.disks) + + def create_disk_from_file(self, full_name, current_time): + if not full_name.endswith(CSV): + self.debug('skipping {0}: not a csv file'.format(full_name)) + return None + + name = os.path.basename(full_name).split('.')[-3] + path = os.path.join(self.log_path, full_name) + + if name in self.disks: + self.debug('skipping {0}: already in disks'.format(full_name)) + return None + + if [p for p in self.exclude if p in name]: + self.debug('skipping {0}: filtered by `exclude` option'.format(full_name)) + return None + + if not os.access(path, os.R_OK): + self.debug('skipping {0}: not readable'.format(full_name)) + return None + + if os.path.getsize(path) == 0: + self.debug('skipping {0}: zero size'.format(full_name)) + return None + + if (current_time - os.path.getmtime(path)) / 60 > self.age: + self.debug('skipping {0}: haven\'t been updated for last {1} minutes'.format(full_name, self.age)) + return None + + if ATA in full_name: + disk = ATADisk(name, DiskLogFile(path)) + elif SCSI in full_name: + disk = SCSIDisk(name, DiskLogFile(path)) + else: + self.debug('skipping {0}: unknown type'.format(full_name)) + return None + + disk.populate_attrs() + if not disk.attrs: + self.error('skipping {0}: parsing failed'.format(full_name)) + return None + + self.debug('added {0}'.format(full_name)) + return disk + + def add_disk_to_charts(self, disk): + if len(self.charts) == 0 or disk.charted: + return + disk.charted = True + + for attr in disk.attrs: + chart_id = CHARTED_ATTRS.get(attr.name) + + if not chart_id or chart_id not in self.charts: + continue + + chart = self.charts[chart_id] + dim = [ + '{0}_{1}'.format(disk.name, attr.name), + disk.name, + CHARTS[chart_id]['algo'], + ] + + if dim[0] in self.charts[chart_id].dimensions: + chart.hide_dimension(dim[0], reverse=True) + else: + chart.add_dimension(dim) + + def remove_disk_from_charts(self, disk): + if len(self.charts) == 0 or not disk.charted: + return + + for attr in disk.attrs: + chart_id = CHARTED_ATTRS.get(attr.name) + + if not chart_id or chart_id not in self.charts: + continue + + self.charts[chart_id].del_dimension('{0}_{1}'.format(disk.name, attr.name)) diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.conf b/collectors/python.d.plugin/smartd_log/smartd_log.conf new file mode 100644 index 0000000..6c01d95 --- /dev/null +++ b/collectors/python.d.plugin/smartd_log/smartd_log.conf @@ -0,0 +1,75 @@ +# netdata python.d.plugin configuration for smartd log +# +# This file is in YaML format. Generally the format is: +# +# name: value +# +# There are 2 sections: +# - global variables +# - one or more JOBS +# +# JOBS allow you to collect values from multiple sources. 
+# Each source will have its own set of charts. +# +# JOB parameters have to be indented (using spaces only, example below). + +# ---------------------------------------------------------------------- +# Global Variables +# These variables set the defaults for all JOBs, however each JOB +# may define its own, overriding the defaults. + +# update_every sets the default data collection frequency. +# If unset, the python.d.plugin default is used. +# update_every: 1 + +# priority controls the order of charts at the netdata dashboard. +# Lower numbers move the charts towards the top of the page. +# If unset, the default for python.d.plugin is used. +# priority: 60000 + +# penalty indicates whether to apply penalty to update_every in case of failures. +# Penalty will increase every 5 failed updates in a row. Maximum penalty is 10 minutes. +# penalty: yes + +# autodetection_retry sets the job re-check interval in seconds. +# The job is not deleted if check fails. +# Attempts to start the job are made once every autodetection_retry. +# This feature is disabled by default. +# autodetection_retry: 0 + +# ---------------------------------------------------------------------- +# JOBS (data collection sources) +# +# The default JOBS share the same *name*. JOBS with the same name +# are mutually exclusive. Only one of them will be allowed running at +# any time. This allows autodetection to try several alternatives and +# pick the one that works. +# +# Any number of jobs is supported. +# +# All python.d.plugin JOBS (for all its modules) support a set of +# predefined parameters. These are: +# +# job_name: +# name: myname # the JOB's name as it will appear at the +# # dashboard (by default is the job_name) +# # JOBs sharing a name are mutually exclusive +# update_every: 1 # the JOB's data collection frequency +# priority: 60000 # the JOB's order on the dashboard +# penalty: yes # the JOB's penalty +# autodetection_retry: 0 # the JOB's re-check interval in seconds +# +# Additionally to the above, smartd_log also supports the following: +# +# log_path: '/path/to/smartd_logs' # path to smartd log files. Default is /var/log/smartd +# exclude_disks: 'PATTERN1 PATTERN2' # space separated patterns. If the pattern is in the drive name, the module will not collect data for it. +# +# ---------------------------------------------------------------------- + +custom: + name: smartd_log + log_path: '/var/log/smartd/' + +debian: + name: smartd_log + log_path: '/var/lib/smartmontools/' |
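
For reference, here is a minimal sketch (not part of the patch) of how the module's regular expressions pull attribute values out of a smartd attrlog line. The sample lines are illustrative only — the exact field layout written by `smartd -A` can vary between smartmontools versions — while the patterns are equivalent to `RE_ATA` and `RE_SCSI` in `smartd_log.chart.py` above:

```python
import re

# Equivalent to RE_ATA / RE_SCSI in smartd_log.chart.py:
# ATA entries are "id;normalized;raw", SCSI entries are "attribute-name;raw".
RE_ATA = re.compile(r'(\d+);(\d+);(\d+)')
RE_SCSI = re.compile(r'([a-z-]+);([0-9.]+)')

# Illustrative attrlog lines (field layout assumed, not copied from a real log).
ata_line = '2023-01-01 12:00:00;\t1;100;0;\t9;98;1200;\t194;35;35;'
scsi_line = '2023-01-01 12:00:00;\tread-total-err-corrected;123;\ttemperature;36;'

# findall() yields the tuples that ATADisk.parser() / SCSIDisk.parser()
# hand to attribute_factory() in the module above.
print(RE_ATA.findall(ata_line))    # [('1', '100', '0'), ('9', '98', '1200'), ('194', '35', '35')]
print(RE_SCSI.findall(scsi_line))  # [('read-total-err-corrected', '123'), ('temperature', '36')]
```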