From 12b9efaebb6d008437af4a72a98d05c4319fc825 Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Fri, 28 Dec 2018 15:42:52 +0100 Subject: Merging upstream version 1.11.1+dfsg Signed-off-by: Daniel Baumann --- collectors/python.d.plugin/smartd_log/README.md | 89 +- .../python.d.plugin/smartd_log/smartd_log.chart.py | 911 +++++++++++++++------ .../python.d.plugin/smartd_log/smartd_log.conf | 25 +- 3 files changed, 719 insertions(+), 306 deletions(-) (limited to 'collectors/python.d.plugin/smartd_log') diff --git a/collectors/python.d.plugin/smartd_log/README.md b/collectors/python.d.plugin/smartd_log/README.md index 121a63573..a31ad0c7a 100644 --- a/collectors/python.d.plugin/smartd_log/README.md +++ b/collectors/python.d.plugin/smartd_log/README.md @@ -2,29 +2,92 @@ Module monitor `smartd` log files to collect HDD/SSD S.M.A.R.T attributes. -It produces following charts (you can add additional attributes in the module configuration file): +**Requirements:** +* `smartmontools` -1. **Read Error Rate** attribute 1 +It produces following charts for SCSI devices: -2. **Start/Stop Count** attribute 4 +1. **Read Error Corrected** -3. **Reallocated Sectors Count** attribute 5 +2. **Read Error Uncorrected** -4. **Seek Error Rate** attribute 7 +3. **Write Error Corrected** -5. **Power-On Hours Count** attribute 9 +4. **Write Error Uncorrected** -6. **Power Cycle Count** attribute 12 +5. **Verify Error Corrected** -7. **Load/Unload Cycles** attribute 193 +6. **Verify Error Uncorrected** -8. **Temperature** attribute 194 +7. **Temperature** -9. **Current Pending Sectors** attribute 197 -10. **Off-Line Uncorrectable** attribute 198 +For ATA devices: +1. **Read Error Rate** -11. **Write Error Rate** attribute 200 +2. **Seek Error Rate** + +3. **Soft Read Error Rate** + +4. **Write Error Rate** + +5. **SATA Interface Downshift** + +6. **UDMA CRC Error Count** + +7. **Throughput Performance** + +8. **Seek Time Performance** + +9. **Start/Stop Count** + +10. **Power-On Hours Count** + +11. **Power Cycle Count** + +12. **Unexpected Power Loss** + +13. **Spin-Up Time** + +14. **Spin-up Retries** + +15. **Calibration Retries** + +16. **Temperature** + +17. **Reallocated Sectors Count** + +18. **Reserved Block Count** + +19. **Program Fail Count** + +20. **Erase Fail Count** + +21. **Wear Leveller Worst Case Erase Count** + +22. **Unused Reserved NAND Blocks** + +23. **Reallocation Event Count** + +24. **Current Pending Sector Count** + +25. **Offline Uncorrectable Sector Count** + +26. **Percent Lifetime Used** + +### prerequisite +`smartd` must be running with `-A` option to write smartd attribute information to files. + +For this you need to set `smartd_opts` (or `SMARTD_ARGS`, check _smartd.service_ content) in `/etc/default/smartmontools`: + + +``` +# dump smartd attrs info every 600 seconds +smartd_opts="-A /var/log/smartd/ -i 600" +``` + + +`smartd` appends logs at every run. It's strongly recommended to use `logrotate` for smartd files. ### configuration @@ -33,6 +96,6 @@ local: log_path : '/var/log/smartd/' ``` -If no configuration is given, module will attempt to read log files in /var/log/smartd/ directory. +If no configuration is given, module will attempt to read log files in `/var/log/smartd/` directory. --- diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.chart.py b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py index 21dbccecc..13762fabe 100644 --- a/collectors/python.d.plugin/smartd_log/smartd_log.chart.py +++ b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py @@ -6,182 +6,537 @@ import os import re -from collections import namedtuple +from copy import deepcopy from time import time from bases.collection import read_last_line from bases.FrameworkServices.SimpleService import SimpleService -# charts order (can be overridden if you want less charts, or different order) -ORDER = ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200'] - -SMART_ATTR = { - '1': 'Read Error Rate', - '2': 'Throughput Performance', - '3': 'Spin-Up Time', - '4': 'Start/Stop Count', - '5': 'Reallocated Sectors Count', - '6': 'Read Channel Margin', - '7': 'Seek Error Rate', - '8': 'Seek Time Performance', - '9': 'Power-On Hours Count', - '10': 'Spin-up Retries', - '11': 'Calibration Retries', - '12': 'Power Cycle Count', - '13': 'Soft Read Error Rate', - '100': 'Erase/Program Cycles', - '103': 'Translation Table Rebuild', - '108': 'Unknown (108)', - '170': 'Reserved Block Count', - '171': 'Program Fail Count', - '172': 'Erase Fail Count', - '173': 'Wear Leveller Worst Case Erase Count', - '174': 'Unexpected Power Loss', - '175': 'Program Fail Count', - '176': 'Erase Fail Count', - '177': 'Wear Leveling Count', - '178': 'Used Reserved Block Count', - '179': 'Used Reserved Block Count', - '180': 'Unused Reserved Block Count', - '181': 'Program Fail Count', - '182': 'Erase Fail Count', - '183': 'SATA Downshifts', - '184': 'End-to-End error', - '185': 'Head Stability', - '186': 'Induced Op-Vibration Detection', - '187': 'Reported Uncorrectable Errors', - '188': 'Command Timeout', - '189': 'High Fly Writes', - '190': 'Temperature', - '191': 'G-Sense Errors', - '192': 'Power-Off Retract Cycles', - '193': 'Load/Unload Cycles', - '194': 'Temperature', - '195': 'Hardware ECC Recovered', - '196': 'Reallocation Events', - '197': 'Current Pending Sectors', - '198': 'Off-line Uncorrectable', - '199': 'UDMA CRC Error Rate', - '200': 'Write Error Rate', - '201': 'Soft Read Errors', - '202': 'Data Address Mark Errors', - '203': 'Run Out Cancel', - '204': 'Soft ECC Corrections', - '205': 'Thermal Asperity Rate', - '206': 'Flying Height', - '207': 'Spin High Current', - '209': 'Offline Seek Performance', - '220': 'Disk Shift', - '221': 'G-Sense Error Rate', - '222': 'Loaded Hours', - '223': 'Load/Unload Retries', - '224': 'Load Friction', - '225': 'Load/Unload Cycles', - '226': 'Load-in Time', - '227': 'Torque Amplification Count', - '228': 'Power-Off Retracts', - '230': 'GMR Head Amplitude', - '231': 'Temperature', - '232': 'Available Reserved Space', - '233': 'Media Wearout Indicator', - '240': 'Head Flying Hours', - '241': 'Total LBAs Written', - '242': 'Total LBAs Read', - '250': 'Read Error Retry Rate' -} - -LIMIT = namedtuple('LIMIT', ['min', 'max']) - -LIMITS = { - '194': LIMIT(0, 200) -} -RESCAN_INTERVAL = 60 - -REGEX = re.compile( +INCREMENTAL = 'incremental' +ABSOLUTE = 'absolute' + +ATA = 'ata' +SCSI = 'scsi' +CSV = '.csv' + +DEF_RESCAN_INTERVAL = 60 +DEF_AGE = 30 +DEF_PATH = '/var/log/smartd' + +ATTR1 = '1' +ATTR2 = '2' +ATTR3 = '3' +ATTR4 = '4' +ATTR5 = '5' +ATTR7 = '7' +ATTR8 = '8' +ATTR9 = '9' +ATTR10 = '10' +ATTR11 = '11' +ATTR12 = '12' +ATTR13 = '13' +ATTR170 = '170' +ATTR171 = '171' +ATTR172 = '172' +ATTR173 = '173' +ATTR174 = '174' +ATTR180 = '180' +ATTR183 = '183' +ATTR190 = '190' +ATTR194 = '194' +ATTR196 = '196' +ATTR197 = '197' +ATTR198 = '198' +ATTR199 = '199' +ATTR202 = '202' +ATTR206 = '206' +ATTR_READ_ERR_COR = 'read-total-err-corrected' +ATTR_READ_ERR_UNC = 'read-total-unc-errors' +ATTR_WRITE_ERR_COR = 'write-total-err-corrected' +ATTR_WRITE_ERR_UNC = 'write-total-unc-errors' +ATTR_VERIFY_ERR_COR = 'verify-total-err-corrected' +ATTR_VERIFY_ERR_UNC = 'verify-total-unc-errors' +ATTR_TEMPERATURE = 'temperature' + + +RE_ATA = re.compile( '(\d+);' # attribute '(\d+);' # normalized value '(\d+)', # raw value re.X ) +RE_SCSI = re.compile( + '([a-z-]+);' # attribute + '([0-9.]+)', # raw value + re.X +) -def chart_template(chart_name): - units, attr_id = chart_name.split('_')[-2:] - title = '{value_type} {description}'.format(value_type=units.capitalize(), - description=SMART_ATTR[attr_id]) - family = SMART_ATTR[attr_id].lower() - - return { - chart_name: { - 'options': [None, title, units, family, 'smartd_log.' + chart_name, 'line'], - 'lines': [] - } +ORDER = [ + # errors + 'read_error_rate', + 'seek_error_rate', + 'soft_read_error_rate', + 'write_error_rate', + 'read_total_err_corrected', + 'read_total_unc_errors', + 'write_total_err_corrected', + 'write_total_unc_errors', + 'verify_total_err_corrected', + 'verify_total_unc_errors', + # external failure + 'sata_interface_downshift', + 'udma_crc_error_count', + # performance + 'throughput_performance', + 'seek_time_performance', + # power + 'start_stop_count', + 'power_on_hours_count', + 'power_cycle_count', + 'unexpected_power_loss', + # spin + 'spin_up_time', + 'spin_up_retries', + 'calibration_retries', + # temperature + 'airflow_temperature_celsius', + 'temperature_celsius', + # wear + 'reallocated_sectors_count', + 'reserved_block_count', + 'program_fail_count', + 'erase_fail_count', + 'wear_leveller_worst_case_erase_count', + 'unused_reserved_nand_blocks', + 'reallocation_event_count', + 'current_pending_sector_count', + 'offline_uncorrectable_sector_count', + 'percent_lifetime_used', +] + +CHARTS = { + 'read_error_rate': { + 'options': [None, 'Read Error Rate', 'value', 'errors', 'smartd_log.read_error_rate', 'line'], + 'lines': [], + 'attrs': [ATTR1], + 'algo': ABSOLUTE, + }, + 'seek_error_rate': { + 'options': [None, 'Seek Error Rate', 'value', 'errors', 'smartd_log.seek_error_rate', 'line'], + 'lines': [], + 'attrs': [ATTR7], + 'algo': ABSOLUTE, + }, + 'soft_read_error_rate': { + 'options': [None, 'Soft Read Error Rate', 'errors', 'errors', 'smartd_log.soft_read_error_rate', 'line'], + 'lines': [], + 'attrs': [ATTR13], + 'algo': INCREMENTAL, + }, + 'write_error_rate': { + 'options': [None, 'Write Error Rate', 'value', 'errors', 'smartd_log.write_error_rate', 'line'], + 'lines': [], + 'attrs': [ATTR206], + 'algo': ABSOLUTE, + }, + 'read_total_err_corrected': { + 'options': [None, 'Read Error Corrected', 'errors', 'errors', 'smartd_log.read_total_err_corrected', 'line'], + 'lines': [], + 'attrs': [ATTR_READ_ERR_COR], + 'algo': INCREMENTAL, + }, + 'read_total_unc_errors': { + 'options': [None, 'Read Error Uncorrected', 'errors', 'errors', 'smartd_log.read_total_unc_errors', 'line'], + 'lines': [], + 'attrs': [ATTR_READ_ERR_UNC], + 'algo': INCREMENTAL, + }, + 'write_total_err_corrected': { + 'options': [None, 'Write Error Corrected', 'errors', 'errors', 'smartd_log.read_total_err_corrected', 'line'], + 'lines': [], + 'attrs': [ATTR_WRITE_ERR_COR], + 'algo': INCREMENTAL, + }, + 'write_total_unc_errors': { + 'options': [None, 'Write Error Uncorrected', 'errors', 'errors', 'smartd_log.write_total_unc_errors', 'line'], + 'lines': [], + 'attrs': [ATTR_WRITE_ERR_UNC], + 'algo': INCREMENTAL, + }, + 'verify_total_err_corrected': { + 'options': [None, 'Verify Error Corrected', 'errors', 'errors', 'smartd_log.verify_total_err_corrected', + 'line'], + 'lines': [], + 'attrs': [ATTR_VERIFY_ERR_COR], + 'algo': INCREMENTAL, + }, + 'verify_total_unc_errors': { + 'options': [None, 'Verify Error Uncorrected', 'errors', 'errors', 'smartd_log.verify_total_unc_errors', 'line'], + 'lines': [], + 'attrs': [ATTR_VERIFY_ERR_UNC], + 'algo': INCREMENTAL, + }, + 'sata_interface_downshift': { + 'options': [None, 'SATA Interface Downshift', 'events', 'external failure', + 'smartd_log.sata_interface_downshift', 'line'], + 'lines': [], + 'attrs': [ATTR183], + 'algo': INCREMENTAL, + }, + 'udma_crc_error_count': { + 'options': [None, 'UDMA CRC Error Count', 'errors', 'external failure', 'smartd_log.udma_crc_error_count', + 'line'], + 'lines': [], + 'attrs': [ATTR199], + 'algo': INCREMENTAL, + }, + 'throughput_performance': { + 'options': [None, 'Throughput Performance', 'value', 'performance', 'smartd_log.throughput_performance', + 'line'], + 'lines': [], + 'attrs': [ATTR2], + 'algo': ABSOLUTE, + }, + 'seek_time_performance': { + 'options': [None, 'Seek Time Performance', 'value', 'performance', 'smartd_log.seek_time_performance', 'line'], + 'lines': [], + 'attrs': [ATTR8], + 'algo': ABSOLUTE, + }, + 'start_stop_count': { + 'options': [None, 'Start/Stop Count', 'events', 'power', 'smartd_log.start_stop_count', 'line'], + 'lines': [], + 'attrs': [ATTR4], + 'algo': ABSOLUTE, + }, + 'power_on_hours_count': { + 'options': [None, 'Power-On Hours Count', 'hours', 'power', 'smartd_log.power_on_hours_count', 'line'], + 'lines': [], + 'attrs': [ATTR9], + 'algo': ABSOLUTE, + }, + 'power_cycle_count': { + 'options': [None, 'Power Cycle Count', 'events', 'power', 'smartd_log.power_cycle_count', 'line'], + 'lines': [], + 'attrs': [ATTR12], + 'algo': ABSOLUTE, + }, + 'unexpected_power_loss': { + 'options': [None, 'Unexpected Power Loss', 'events', 'power', 'smartd_log.unexpected_power_loss', 'line'], + 'lines': [], + 'attrs': [ATTR174], + 'algo': ABSOLUTE, + }, + 'spin_up_time': { + 'options': [None, 'Spin-Up Time', 'ms', 'spin', 'smartd_log.spin_up_time', 'line'], + 'lines': [], + 'attrs': [ATTR3], + 'algo': ABSOLUTE, + }, + 'spin_up_retries': { + 'options': [None, 'Spin-up Retries', 'retries', 'spin', 'smartd_log.spin_up_retries', 'line'], + 'lines': [], + 'attrs': [ATTR10], + 'algo': INCREMENTAL, + }, + 'calibration_retries': { + 'options': [None, 'Calibration Retries', 'retries', 'spin', 'smartd_log.calibration_retries', 'line'], + 'lines': [], + 'attrs': [ATTR11], + 'algo': INCREMENTAL, + }, + 'airflow_temperature_celsius': { + 'options': [None, 'Airflow Temperature Celsius', 'celsius', 'temperature', + 'smartd_log.airflow_temperature_celsius', 'line'], + 'lines': [], + 'attrs': [ATTR190], + 'algo': ABSOLUTE, + }, + 'temperature_celsius': { + 'options': [None, 'Temperature', 'celsius', 'temperature', 'smartd_log.temperature_celsius', 'line'], + 'lines': [], + 'attrs': [ATTR194, ATTR_TEMPERATURE], + 'algo': ABSOLUTE, + }, + 'reallocated_sectors_count': { + 'options': [None, 'Reallocated Sectors Count', 'sectors', 'wear', 'smartd_log.reallocated_sectors_count', + 'line'], + 'lines': [], + 'attrs': [ATTR5], + 'algo': INCREMENTAL, + }, + 'reserved_block_count': { + 'options': [None, 'Reserved Block Count', '%', 'wear', 'smartd_log.reserved_block_count', 'line'], + 'lines': [], + 'attrs': [ATTR170], + 'algo': ABSOLUTE, + }, + 'program_fail_count': { + 'options': [None, 'Program Fail Count', 'errors', 'wear', 'smartd_log.program_fail_count', 'line'], + 'lines': [], + 'attrs': [ATTR171], + 'algo': INCREMENTAL, + }, + 'erase_fail_count': { + 'options': [None, 'Erase Fail Count', 'failures', 'wear', 'smartd_log.erase_fail_count', 'line'], + 'lines': [], + 'attrs': [ATTR172], + 'algo': INCREMENTAL, + }, + 'wear_leveller_worst_case_erase_count': { + 'options': [None, 'Wear Leveller Worst Case Erase Count', 'erases', 'wear', + 'smartd_log.wear_leveller_worst_case_erase_count', 'line'], + 'lines': [], + 'attrs': [ATTR173], + 'algo': ABSOLUTE, + }, + 'unused_reserved_nand_blocks': { + 'options': [None, 'Unused Reserved NAND Blocks', 'blocks', 'wear', 'smartd_log.unused_reserved_nand_blocks', + 'line'], + 'lines': [], + 'attrs': [ATTR180], + 'algo': ABSOLUTE, + }, + 'reallocation_event_count': { + 'options': [None, 'Reallocation Event Count', 'events', 'wear', 'smartd_log.reallocation_event_count', 'line'], + 'lines': [], + 'attrs': [ATTR196], + 'algo': INCREMENTAL, + }, + 'current_pending_sector_count': { + 'options': [None, 'Current Pending Sector Count', 'sectors', 'wear', 'smartd_log.current_pending_sector_count', + 'line'], + 'lines': [], + 'attrs': [ATTR197], + 'algo': ABSOLUTE, + }, + 'offline_uncorrectable_sector_count': { + 'options': [None, 'Offline Uncorrectable Sector Count', 'sectors', 'wear', + 'smartd_log.offline_uncorrectable_sector_count', 'line'], + 'lines': [], + 'attrs': [ATTR198], + 'algo': ABSOLUTE, + + }, + 'percent_lifetime_used': { + 'options': [None, 'Percent Lifetime Used', '%', 'wear', 'smartd_log.percent_lifetime_used', 'line'], + 'lines': [], + 'attrs': [ATTR202], + 'algo': ABSOLUTE, } +} + +# NOTE: 'parse_temp' decodes ATA 194 raw value. Not heavily tested. Written by @Ferroin +# C code: +# https://github.com/smartmontools/smartmontools/blob/master/smartmontools/atacmds.cpp#L2051 +# +# Calling 'parse_temp' on the raw value will return a 4-tuple, containing +# * temperature +# * minimum +# * maximum +# * over-temperature count +# substituting None for values it can't decode. +# +# Example: +# >>> parse_temp(42952491042) +# >>> (34, 10, 43, None) +# +# +# def check_temp_word(i): +# if i <= 0x7F: +# return 0x11 +# elif i <= 0xFF: +# return 0x01 +# elif 0xFF80 <= i: +# return 0x10 +# return 0x00 +# +# +# def check_temp_range(t, b0, b1): +# if b0 > b1: +# t0, t1 = b1, b0 +# else: +# t0, t1 = b0, b1 +# +# if all([ +# -60 <= t0, +# t0 <= t, +# t <= t1, +# t1 <= 120, +# not (t0 == -1 and t1 <= 0) +# ]): +# return t0, t1 +# return None, None +# +# +# def parse_temp(raw): +# byte = list() +# word = list() +# for i in range(0, 6): +# byte.append(0xFF & (raw >> (i * 8))) +# for i in range(0, 3): +# word.append(0xFFFF & (raw >> (i * 16))) +# +# ctwd = check_temp_word(word[0]) +# +# if not word[2]: +# if ctwd and not word[1]: +# # byte[0] is temp, no other data +# return byte[0], None, None, None +# +# if ctwd and all(check_temp_range(byte[0], byte[2], byte[3])): +# # byte[0] is temp, byte[2] is max or min, byte[3] is min or max +# trange = check_temp_range(byte[0], byte[2], byte[3]) +# return byte[0], trange[0], trange[1], None +# +# if ctwd and all(check_temp_range(byte[0], byte[1], byte[2])): +# # byte[0] is temp, byte[1] is max or min, byte[2] is min or max +# trange = check_temp_range(byte[0], byte[1], byte[2]) +# return byte[0], trange[0], trange[1], None +# +# return None, None, None, None +# +# if ctwd: +# if all( +# [ +# ctwd & check_temp_word(word[1]) & check_temp_word(word[2]) != 0x00, +# all(check_temp_range(byte[0], byte[2], byte[4])), +# ] +# ): +# # byte[0] is temp, byte[2] is max or min, byte[4] is min or max +# trange = check_temp_range(byte[0], byte[2], byte[4]) +# return byte[0], trange[0], trange[1], None +# else: +# trange = check_temp_range(byte[0], byte[2], byte[3]) +# if word[2] < 0x7FFF and all(trange) and trange[1] >= 40: +# # byte[0] is temp, byte[2] is max or min, byte[3] is min or max, word[2] is overtemp count +# return byte[0], trange[0], trange[1], word[2] +# # no data +# return None, None, None, None + + +CHARTED_ATTRS = dict((attr, k) for k, v in CHARTS.items() for attr in v['attrs']) + + +class BaseAtaSmartAttribute: + def __init__(self, name, normalized_value, raw_value): + self.name = name + self.normalized_value = normalized_value + self.raw_value = raw_value + + def value(self): + raise NotImplementedError + + +class AtaRaw(BaseAtaSmartAttribute): + def value(self): + return self.raw_value + + +class AtaNormalized(BaseAtaSmartAttribute): + def value(self): + return self.normalized_value + + +class Ata9(BaseAtaSmartAttribute): + def value(self): + value = int(self.raw_value) + if value > 1e6: + return value & 0xFFFF + return value + + +class Ata190(BaseAtaSmartAttribute): + def value(self): + return 100 - int(self.normalized_value) + + +class BaseSCSISmartAttribute: + def __init__(self, name, raw_value): + self.name = name + self.raw_value = raw_value + + def value(self): + raise NotImplementedError + + +class SCSIRaw(BaseSCSISmartAttribute): + def value(self): + return self.raw_value + + +def ata_attribute_factory(value): + name = value[0] + + if name == ATTR9: + return Ata9(*value) + elif name == ATTR190: + return Ata190(*value) + elif name in [ + ATTR1, + ATTR7, + ATTR194, + ATTR202, + ATTR206, + ]: + return AtaNormalized(*value) + + return AtaRaw(*value) -def handle_os_error(method): - def on_call(*args): - try: - return method(*args) - except OSError: - return None - return on_call +def scsi_attribute_factory(value): + return SCSIRaw(*value) -class SmartAttribute(object): - def __init__(self, idx, normalized, raw): - self.id = idx - self.normalized = normalized - self._raw = raw +def attribute_factory(value): + name = value[0] + if name.isdigit(): + return ata_attribute_factory(value) + return scsi_attribute_factory(value) - @property - def raw(self): - if self.id in LIMITS: - limit = LIMITS[self.id] - if limit.min <= int(self._raw) <= limit.max: - return self._raw - return None - return self._raw - @raw.setter - def raw(self, value): - self._raw = value +def handle_error(*errors): + def on_method(method): + def on_call(*args): + try: + return method(*args) + except errors: + return None + return on_call + return on_method class DiskLogFile: - def __init__(self, path): - self.path = path - self.size = os.path.getsize(path) + def __init__(self, full_path): + self.path = full_path + self.size = os.path.getsize(full_path) - @handle_os_error + @handle_error(OSError) def is_changed(self): - new_size = os.path.getsize(self.path) - old_size, self.size = self.size, new_size - - return new_size != old_size and new_size + return self.size != os.path.getsize(self.path) - @staticmethod - @handle_os_error - def is_valid(log_file, exclude): - return all([log_file.endswith('.csv'), - not [p for p in exclude if p in log_file], - os.access(log_file, os.R_OK), - os.path.getsize(log_file)]) + @handle_error(OSError) + def is_active(self, current_time, limit): + return (current_time - os.path.getmtime(self.path)) / 60 < limit + @handle_error(OSError) + def read(self): + self.size = os.path.getsize(self.path) + return read_last_line(self.path) -class Disk: - def __init__(self, full_path, age): - self.log_file = DiskLogFile(full_path) - self.name = os.path.basename(full_path).split('.')[-3] - self.age = int(age) - self.status = True - self.attributes = dict() - self.get_attributes() +class BaseDisk: + def __init__(self, name, log_file): + self.name = re.sub(r'_+', '_', name) + self.log_file = log_file + self.attrs = list() + self.alive = True + self.charted = False def __eq__(self, other): - if isinstance(other, Disk): + if isinstance(other, BaseDisk): return self.name == other.name return self.name == other @@ -191,163 +546,179 @@ class Disk: def __hash__(self): return hash(repr(self)) - @handle_os_error - def is_active(self): - return (time() - os.path.getmtime(self.log_file.path)) / 60 < self.age + def parser(self, data): + raise NotImplementedError - @handle_os_error - def get_attributes(self): - last_line = read_last_line(self.log_file.path) - self.attributes = dict((attr, SmartAttribute(attr, normalized, raw)) for attr, normalized, raw - in REGEX.findall(last_line)) - return True + @handle_error(TypeError) + def populate_attrs(self): + self.attrs = list() + line = self.log_file.read() + for value in self.parser(line): + self.attrs.append(attribute_factory(value)) + + return len(self.attrs) def data(self): data = dict() - for attr in self.attributes.values(): - data['_'.join([self.name, 'normalized', attr.id])] = attr.normalized - if attr.raw is not None: - data['_'.join([self.name, 'raw', attr.id])] = attr.raw + for attr in self.attrs: + data['{0}_{1}'.format(self.name, attr.name)] = attr.value() return data -class Service(SimpleService): - def __init__(self, configuration=None, name=None): - SimpleService.__init__(self, configuration=configuration, name=name) - self.log_path = self.configuration.get('log_path', '/var/log/smartd') - self.raw = self.configuration.get('raw_values', True) - self.exclude = self.configuration.get('exclude_disks', str()).split() - self.age = self.configuration.get('age', 30) +class ATADisk(BaseDisk): + def parser(self, data): + return RE_ATA.findall(data) - self.runs = 0 - self.disks = list() - self.order = list() - self.definitions = dict() - def check(self): - self.disks = self.scan() +class SCSIDisk(BaseDisk): + def parser(self, data): + return RE_SCSI.findall(data) - if not self.disks: - return None - user_defined_sa = self.configuration.get('smart_attributes') +class Service(SimpleService): + def __init__(self, configuration=None, name=None): + SimpleService.__init__(self, configuration=configuration, name=name) + self.order = ORDER + self.definitions = deepcopy(CHARTS) - if user_defined_sa: - order = user_defined_sa.split() or ORDER - else: - order = ORDER + self.log_path = configuration.get('log_path', DEF_PATH) + self.age = configuration.get('age', DEF_AGE) + self.exclude = configuration.get('exclude_disks', str()).split() - self.create_charts(order) + self.disks = list() + self.runs = 0 - return True + def check(self): + return self.scan() > 0 def get_data(self): self.runs += 1 - if self.runs % RESCAN_INTERVAL == 0: - self.cleanup_and_rescan() + if self.runs % DEF_RESCAN_INTERVAL == 0: + self.cleanup() + self.scan() data = dict() for disk in self.disks: - - if not disk.status: + if not disk.alive: continue + if not disk.charted: + self.add_disk_to_charts(disk) + changed = disk.log_file.is_changed() - # True = changed, False = unchanged, None = Exception if changed is None: - disk.status = False + disk.alive = False continue - if changed: - success = disk.get_attributes() - if not success: - disk.status = False - continue + if changed and disk.populate_attrs() is None: + disk.alive = False + continue data.update(disk.data()) - return data or None - - def create_charts(self, order): - for attr in order: - raw_name, normalized_name = 'attr_id_raw_' + attr, 'attr_id_normalized_' + attr - raw, normalized = chart_template(raw_name), chart_template(normalized_name) - self.order.extend([normalized_name, raw_name]) - self.definitions.update(raw) - self.definitions.update(normalized) - - for disk in self.disks: - if attr not in disk.attributes: - self.debug("'{disk}' has no attribute '{attr_id}'".format(disk=disk.name, - attr_id=attr)) - continue - normalized[normalized_name]['lines'].append(['_'.join([disk.name, 'normalized', attr]), disk.name]) - - if not self.raw: - continue - - if disk.attributes[attr].raw is not None: - raw[raw_name]['lines'].append(['_'.join([disk.name, 'raw', attr]), disk.name]) - continue - self.debug("'{disk}' attribute '{attr_id}' value not in {limits}".format(disk=disk.name, - attr_id=attr, - limits=LIMITS[attr])) - - def cleanup_and_rescan(self): - self.cleanup() - new_disks = self.scan(only_new=True) - - for disk in new_disks: - valid = False - - for chart in self.charts: - value_type, idx = chart.id.split('_')[2:] - - if idx in disk.attributes: - valid = True - dimension_id = '_'.join([disk.name, value_type, idx]) - - if dimension_id in chart: - chart.hide_dimension(dimension_id=dimension_id, reverse=True) - else: - chart.add_dimension([dimension_id, disk.name]) - if valid: - self.disks.append(disk) + return data def cleanup(self): - for disk in self.disks: + current_time = time() + for disk in self.disks[:]: + if any( + [ + not disk.alive, + not disk.log_file.is_active(current_time, self.age), + ] + ): + self.disks.remove(disk.name) + self.remove_disk_from_charts(disk) + + def scan(self): + self.debug('scanning {0}'.format(self.log_path)) + current_time = time() + + for full_name in os.listdir(self.log_path): + disk = self.create_disk_from_file(full_name, current_time) + if not disk: + continue + self.disks.append(disk) + + return len(self.disks) + + def create_disk_from_file(self, full_name, current_time): + name = os.path.basename(full_name).split('.')[-3] + path = os.path.join(self.log_path, full_name) + + if name in self.disks: + return None + + if [p for p in self.exclude if p in name]: + return None + + if not full_name.endswith(CSV): + self.debug('skipping {0}: not a csv file'.format(full_name)) + return None + + if not os.access(path, os.R_OK): + self.debug('skipping {0}: not readable'.format(full_name)) + return None + + if os.path.getsize(path) == 0: + self.debug('skipping {0}: zero size'.format(full_name)) + return None + + if (current_time - os.path.getmtime(path)) / 60 > self.age: + self.debug('skipping {0}: haven\'t been updated for last {1} minutes'.format(full_name, self.age)) + return None + + if ATA in full_name: + disk = ATADisk(name, DiskLogFile(path)) + elif SCSI in full_name: + disk = SCSIDisk(name, DiskLogFile(path)) + else: + self.debug('skipping {0}: unknown type'.format(full_name)) + return None + + disk.populate_attrs() + if not disk.attrs: + self.error('skipping {0}: parsing failed'.format(full_name)) + return None + + self.debug('added {0}'.format(full_name)) + return disk + + def add_disk_to_charts(self, disk): + if len(self.charts) == 0 or disk.charted: + return + disk.charted = True + + for attr in disk.attrs: + chart_id = CHARTED_ATTRS.get(attr.name) + + if not chart_id or chart_id not in self.charts: + continue + + chart = self.charts[chart_id] + dim = [ + '{0}_{1}'.format(disk.name, attr.name), + disk.name, + CHARTS[chart_id]['algo'], + ] + + if dim[0] in self.charts[chart_id].dimensions: + chart.hide_dimension(dim[0], reverse=True) + else: + chart.add_dimension(dim) + + def remove_disk_from_charts(self, disk): + if len(self.charts) == 0 or not disk.charted: + return + + for attr in disk.attrs: + chart_id = CHARTED_ATTRS.get(attr.name) + + if not chart_id or chart_id not in self.charts: + continue - if not disk.is_active(): - disk.status = False - if not disk.status: - for chart in self.charts: - dimension_id = '_'.join([disk.name, chart.id[8:]]) - chart.hide_dimension(dimension_id=dimension_id) - - self.disks = [disk for disk in self.disks if disk.status] - - def scan(self, only_new=None): - new_disks = list() - for f in os.listdir(self.log_path): - full_path = os.path.join(self.log_path, f) - - if DiskLogFile.is_valid(full_path, self.exclude): - disk = Disk(full_path, self.age) - - active = disk.is_active() - if active is None: - continue - if active: - if not only_new: - new_disks.append(disk) - else: - if disk not in self.disks: - new_disks.append(disk) - else: - if not only_new: - self.debug("'{disk}' not updated in the last {age} minutes, " - "skipping it.".format(disk=disk.name, age=self.age)) - return new_disks + # TODO: can't delete dimension + self.charts[chart_id].hide_dimension('{0}_{1}'.format(disk.name, attr.name)) diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.conf b/collectors/python.d.plugin/smartd_log/smartd_log.conf index 3fab3f1c0..ab7f45b0f 100644 --- a/collectors/python.d.plugin/smartd_log/smartd_log.conf +++ b/collectors/python.d.plugin/smartd_log/smartd_log.conf @@ -63,28 +63,7 @@ # # Additionally to the above, smartd_log also supports the following: # -# log_path: '/path/to/smartdlogs' # path to smartd log files. Default is /var/log/smartd -# raw_values: yes # enable/disable raw values charts. Enabled by default. -# smart_attributes: '1 2 3 4 44' # smart attributes charts. Default are ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200']. -# exclude_disks: 'PATTERN1 PATTERN2' # space separated patterns. If the pattern is in the drive name, the module will not collect data for it. -# -# ---------------------------------------------------------------------- -# Additional information -# Plugin reads smartd log files (-A option). -# You need to add (man smartd) to /etc/default/smartmontools '-i 600 -A /var/log/smartd/' to pass additional options to smartd on startup -# Then restart smartd service and check /path/log/smartdlogs -# ls /var/log/smartd/ -# CDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv WDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv ZDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv -# -# Smartd APPEND logs at every run. Its NOT RECOMMENDED to set '-i' option below 60 sec. -# STRONGLY RECOMMENDED to create smartd conf file for logrotate -# -# RAW vs NORMALIZED values -# "Normalized value", commonly referred to as just "value". This is a most universal measurement, on the scale from 0 (bad) to some maximum (good) value. -# Maximum values are typically 100, 200 or 253. Rule of thumb is: high values are good, low values are bad. -# -# "Raw value" - the value of the attribute as it is tracked by the device, before any normalization takes place. -# Some raw numbers provide valuable insight when properly interpreted. These cases will be discussed later on. -# Raw values are typically listed in hexadecimal numbers. The raw value has different structure for different vendors and is often not meaningful as a decimal number. +# log_path: '/path/to/smartd_logs' # path to smartd log files. Default is /var/log/smartd +# exclude_disks: 'PATTERN1 PATTERN2' # space separated patterns. If the pattern is in the drive name, the module will not collect data for it. # # ---------------------------------------------------------------------- -- cgit v1.2.3