diff options
Diffstat (limited to 'collectors/python.d.plugin/smartd_log/smartd_log.chart.py')
-rw-r--r-- | collectors/python.d.plugin/smartd_log/smartd_log.chart.py | 790 |
1 files changed, 0 insertions, 790 deletions
# -*- coding: utf-8 -*-
# Description: smart netdata python.d module
# Author: ilyam8, vorph1
# SPDX-License-Identifier: GPL-3.0-or-later
#
# NOTE: recovered from the diff that deleted
# collectors/python.d.plugin/smartd_log/smartd_log.chart.py
# (deleted file mode 100644, index a896164df..000000000).

import os
import re
from copy import deepcopy
from time import time

from bases.FrameworkServices.SimpleService import SimpleService
from bases.collection import read_last_line

# Dimension algorithms referenced by the 'algo' key of every CHARTS entry.
INCREMENTAL = 'incremental'
ABSOLUTE = 'absolute'

# Markers looked for in log file names to pick the disk type / file type.
ATA = 'ata'
SCSI = 'scsi'
CSV = '.csv'

# Rescan the log directory every DEF_RESCAN_INTERVAL calls of Service.get_data().
DEF_RESCAN_INTERVAL = 60
# Default maximum age of a log file, in minutes (compared against mtime / 60).
DEF_AGE = 30
# Default directory scanned for smartd attribute logs.
DEF_PATH = '/var/log/smartd'

# ATA attribute ids, kept as strings because they come straight out of RE_ATA.
ATTR1 = '1'
ATTR2 = '2'
ATTR3 = '3'
ATTR4 = '4'
ATTR5 = '5'
ATTR7 = '7'
ATTR8 = '8'
ATTR9 = '9'
ATTR10 = '10'
ATTR11 = '11'
ATTR12 = '12'
ATTR13 = '13'
ATTR170 = '170'
ATTR171 = '171'
ATTR172 = '172'
ATTR173 = '173'
ATTR174 = '174'
ATTR177 = '177'
ATTR180 = '180'
ATTR183 = '183'
ATTR190 = '190'
ATTR194 = '194'
ATTR196 = '196'
ATTR197 = '197'
ATTR198 = '198'
ATTR199 = '199'
ATTR202 = '202'
ATTR206 = '206'
ATTR233 = '233'
ATTR241 = '241'
ATTR242 = '242'
ATTR249 = '249'
# SCSI attribute names, matched by RE_SCSI.
ATTR_READ_ERR_COR = 'read-total-err-corrected'
ATTR_READ_ERR_UNC = 'read-total-unc-errors'
ATTR_WRITE_ERR_COR = 'write-total-err-corrected'
ATTR_WRITE_ERR_UNC = 'write-total-unc-errors'
ATTR_VERIFY_ERR_COR = 'verify-total-err-corrected'
ATTR_VERIFY_ERR_UNC = 'verify-total-unc-errors'
ATTR_TEMPERATURE = 'temperature'

# Raw strings: '\d' in a plain literal is an invalid escape sequence and
# triggers a DeprecationWarning/SyntaxWarning on modern Python 3.
RE_ATA = re.compile(
    r'(\d+);'  # attribute
    r'(\d+);'  # normalized value
    r'(\d+)',  # raw value
    re.X
)

RE_SCSI = re.compile(
    r'([a-z-]+);'  # attribute
    r'([0-9.]+)',  # raw value
    re.X
)

ORDER = [
    # errors
    'read_error_rate',
    'seek_error_rate',
    'soft_read_error_rate',
    'write_error_rate',
    'read_total_err_corrected',
    'read_total_unc_errors',
    'write_total_err_corrected',
    'write_total_unc_errors',
    'verify_total_err_corrected',
    'verify_total_unc_errors',
    # external failure
    'sata_interface_downshift',
    'udma_crc_error_count',
    # performance
    'throughput_performance',
    'seek_time_performance',
    # power
    'start_stop_count',
    'power_on_hours_count',
    'power_cycle_count',
    'unexpected_power_loss',
    # spin
    'spin_up_time',
    'spin_up_retries',
    'calibration_retries',
    # temperature
    'airflow_temperature_celsius',
    'temperature_celsius',
    # wear
    'reallocated_sectors_count',
    'reserved_block_count',
    'program_fail_count',
    'erase_fail_count',
    'wear_leveller_worst_case_erase_count',
    'unused_reserved_nand_blocks',
    'reallocation_event_count',
    'current_pending_sector_count',
    'offline_uncorrectable_sector_count',
    'percent_lifetime_used',
    'media_wearout_indicator',
    # defined in CHARTS below; listed here so it is part of the chart order too
    'nand_writes_1gib',
    'total_lbas_written',
    'total_lbas_read',
]

# Chart definitions keyed by chart id. Besides the framework's 'options' and
# 'lines', every entry carries:
#   'attrs' - attribute names whose values are drawn on the chart
#   'algo'  - algorithm used for dimensions added to the chart
CHARTS = {
    'read_error_rate': {
        'options': [None, 'Read Error Rate', 'value', 'errors', 'smartd_log.read_error_rate', 'line'],
        'lines': [],
        'attrs': [ATTR1],
        'algo': ABSOLUTE,
    },
    'seek_error_rate': {
        'options': [None, 'Seek Error Rate', 'value', 'errors', 'smartd_log.seek_error_rate', 'line'],
        'lines': [],
        'attrs': [ATTR7],
        'algo': ABSOLUTE,
    },
    'soft_read_error_rate': {
        'options': [None, 'Soft Read Error Rate', 'errors', 'errors', 'smartd_log.soft_read_error_rate', 'line'],
        'lines': [],
        'attrs': [ATTR13],
        'algo': INCREMENTAL,
    },
    'write_error_rate': {
        'options': [None, 'Write Error Rate', 'value', 'errors', 'smartd_log.write_error_rate', 'line'],
        'lines': [],
        'attrs': [ATTR206],
        'algo': ABSOLUTE,
    },
    'read_total_err_corrected': {
        'options': [None, 'Read Error Corrected', 'errors', 'errors', 'smartd_log.read_total_err_corrected', 'line'],
        'lines': [],
        'attrs': [ATTR_READ_ERR_COR],
        'algo': INCREMENTAL,
    },
    'read_total_unc_errors': {
        'options': [None, 'Read Error Uncorrected', 'errors', 'errors', 'smartd_log.read_total_unc_errors', 'line'],
        'lines': [],
        'attrs': [ATTR_READ_ERR_UNC],
        'algo': INCREMENTAL,
    },
    'write_total_err_corrected': {
        'options': [None, 'Write Error Corrected', 'errors', 'errors', 'smartd_log.write_total_err_corrected', 'line'],
        'lines': [],
        'attrs': [ATTR_WRITE_ERR_COR],
        'algo': INCREMENTAL,
    },
    'write_total_unc_errors': {
        'options': [None, 'Write Error Uncorrected', 'errors', 'errors', 'smartd_log.write_total_unc_errors', 'line'],
        'lines': [],
        'attrs': [ATTR_WRITE_ERR_UNC],
        'algo': INCREMENTAL,
    },
    'verify_total_err_corrected': {
        'options': [None, 'Verify Error Corrected', 'errors', 'errors', 'smartd_log.verify_total_err_corrected',
                    'line'],
        'lines': [],
        'attrs': [ATTR_VERIFY_ERR_COR],
        'algo': INCREMENTAL,
    },
    'verify_total_unc_errors': {
        'options': [None, 'Verify Error Uncorrected', 'errors', 'errors', 'smartd_log.verify_total_unc_errors', 'line'],
        'lines': [],
        'attrs': [ATTR_VERIFY_ERR_UNC],
        'algo': INCREMENTAL,
    },
    'sata_interface_downshift': {
        'options': [None, 'SATA Interface Downshift', 'events', 'external failure',
                    'smartd_log.sata_interface_downshift', 'line'],
        'lines': [],
        'attrs': [ATTR183],
        'algo': INCREMENTAL,
    },
    'udma_crc_error_count': {
        'options': [None, 'UDMA CRC Error Count', 'errors', 'external failure', 'smartd_log.udma_crc_error_count',
                    'line'],
        'lines': [],
        'attrs': [ATTR199],
        'algo': INCREMENTAL,
    },
    'throughput_performance': {
        'options': [None, 'Throughput Performance', 'value', 'performance', 'smartd_log.throughput_performance',
                    'line'],
        'lines': [],
        'attrs': [ATTR2],
        'algo': ABSOLUTE,
    },
    'seek_time_performance': {
        'options': [None, 'Seek Time Performance', 'value', 'performance', 'smartd_log.seek_time_performance', 'line'],
        'lines': [],
        'attrs': [ATTR8],
        'algo': ABSOLUTE,
    },
    'start_stop_count': {
        'options': [None, 'Start/Stop Count', 'events', 'power', 'smartd_log.start_stop_count', 'line'],
        'lines': [],
        'attrs': [ATTR4],
        'algo': ABSOLUTE,
    },
    'power_on_hours_count': {
        'options': [None, 'Power-On Hours Count', 'hours', 'power', 'smartd_log.power_on_hours_count', 'line'],
        'lines': [],
        'attrs': [ATTR9],
        'algo': ABSOLUTE,
    },
    'power_cycle_count': {
        'options': [None, 'Power Cycle Count', 'events', 'power', 'smartd_log.power_cycle_count', 'line'],
        'lines': [],
        'attrs': [ATTR12],
        'algo': ABSOLUTE,
    },
    'unexpected_power_loss': {
        'options': [None, 'Unexpected Power Loss', 'events', 'power', 'smartd_log.unexpected_power_loss', 'line'],
        'lines': [],
        'attrs': [ATTR174],
        'algo': ABSOLUTE,
    },
    'spin_up_time': {
        'options': [None, 'Spin-Up Time', 'ms', 'spin', 'smartd_log.spin_up_time', 'line'],
        'lines': [],
        'attrs': [ATTR3],
        'algo': ABSOLUTE,
    },
    'spin_up_retries': {
        'options': [None, 'Spin-up Retries', 'retries', 'spin', 'smartd_log.spin_up_retries', 'line'],
        'lines': [],
        'attrs': [ATTR10],
        'algo': INCREMENTAL,
    },
    'calibration_retries': {
        'options': [None, 'Calibration Retries', 'retries', 'spin', 'smartd_log.calibration_retries', 'line'],
        'lines': [],
        'attrs': [ATTR11],
        'algo': INCREMENTAL,
    },
    'airflow_temperature_celsius': {
        'options': [None, 'Airflow Temperature Celsius', 'celsius', 'temperature',
                    'smartd_log.airflow_temperature_celsius', 'line'],
        'lines': [],
        'attrs': [ATTR190],
        'algo': ABSOLUTE,
    },
    'temperature_celsius': {
        'options': [None, 'Temperature', 'celsius', 'temperature', 'smartd_log.temperature_celsius', 'line'],
        'lines': [],
        'attrs': [ATTR194, ATTR_TEMPERATURE],
        'algo': ABSOLUTE,
    },
    'reallocated_sectors_count': {
        'options': [None, 'Reallocated Sectors Count', 'sectors', 'wear', 'smartd_log.reallocated_sectors_count',
                    'line'],
        'lines': [],
        'attrs': [ATTR5],
        'algo': ABSOLUTE,
    },
    'reserved_block_count': {
        'options': [None, 'Reserved Block Count', 'percentage', 'wear', 'smartd_log.reserved_block_count', 'line'],
        'lines': [],
        'attrs': [ATTR170],
        'algo': ABSOLUTE,
    },
    'program_fail_count': {
        'options': [None, 'Program Fail Count', 'errors', 'wear', 'smartd_log.program_fail_count', 'line'],
        'lines': [],
        'attrs': [ATTR171],
        'algo': INCREMENTAL,
    },
    'erase_fail_count': {
        'options': [None, 'Erase Fail Count', 'failures', 'wear', 'smartd_log.erase_fail_count', 'line'],
        'lines': [],
        'attrs': [ATTR172],
        'algo': INCREMENTAL,
    },
    'wear_leveller_worst_case_erase_count': {
        'options': [None, 'Wear Leveller Worst Case Erase Count', 'erases', 'wear',
                    'smartd_log.wear_leveller_worst_case_erase_count', 'line'],
        'lines': [],
        'attrs': [ATTR173],
        'algo': ABSOLUTE,
    },
    'unused_reserved_nand_blocks': {
        'options': [None, 'Unused Reserved NAND Blocks', 'blocks', 'wear', 'smartd_log.unused_reserved_nand_blocks',
                    'line'],
        'lines': [],
        'attrs': [ATTR180],
        'algo': ABSOLUTE,
    },
    'reallocation_event_count': {
        'options': [None, 'Reallocation Event Count', 'events', 'wear', 'smartd_log.reallocation_event_count', 'line'],
        'lines': [],
        'attrs': [ATTR196],
        'algo': INCREMENTAL,
    },
    'current_pending_sector_count': {
        'options': [None, 'Current Pending Sector Count', 'sectors', 'wear', 'smartd_log.current_pending_sector_count',
                    'line'],
        'lines': [],
        'attrs': [ATTR197],
        'algo': ABSOLUTE,
    },
    'offline_uncorrectable_sector_count': {
        'options': [None, 'Offline Uncorrectable Sector Count', 'sectors', 'wear',
                    'smartd_log.offline_uncorrectable_sector_count', 'line'],
        'lines': [],
        'attrs': [ATTR198],
        'algo': ABSOLUTE,

    },
    'percent_lifetime_used': {
        'options': [None, 'Percent Lifetime Used', 'percentage', 'wear', 'smartd_log.percent_lifetime_used', 'line'],
        'lines': [],
        'attrs': [ATTR202],
        'algo': ABSOLUTE,
    },
    'media_wearout_indicator': {
        'options': [None, 'Media Wearout Indicator', 'percentage', 'wear', 'smartd_log.media_wearout_indicator', 'line'],
        'lines': [],
        'attrs': [ATTR233, ATTR177],
        'algo': ABSOLUTE,
    },
    'nand_writes_1gib': {
        'options': [None, 'NAND Writes', 'GiB', 'wear', 'smartd_log.nand_writes_1gib', 'line'],
        'lines': [],
        'attrs': [ATTR249],
        'algo': ABSOLUTE,
    },
    'total_lbas_written': {
        'options': [None, 'Total LBAs Written', 'sectors', 'wear', 'smartd_log.total_lbas_written', 'line'],
        'lines': [],
        'attrs': [ATTR241],
        'algo': ABSOLUTE,
    },
    'total_lbas_read': {
        'options': [None, 'Total LBAs Read', 'sectors', 'wear', 'smartd_log.total_lbas_read', 'line'],
        'lines': [],
        'attrs': [ATTR242],
        'algo': ABSOLUTE,
    },
}

# NOTE: 'parse_temp' decodes ATA 194 raw value. Not heavily tested. Written by @Ferroin
# C code:
# https://github.com/smartmontools/smartmontools/blob/master/smartmontools/atacmds.cpp#L2051
#
# Calling 'parse_temp' on the raw value will return a 4-tuple, containing
#  * temperature
#  * minimum
#  * maximum
#  * over-temperature count
# substituting None for values it can't decode.
#
# Example:
# >>> parse_temp(42952491042)
# >>> (34, 10, 43, None)
#
#
# def check_temp_word(i):
#     if i <= 0x7F:
#         return 0x11
#     elif i <= 0xFF:
#         return 0x01
#     elif 0xFF80 <= i:
#         return 0x10
#     return 0x00
#
#
# def check_temp_range(t, b0, b1):
#     if b0 > b1:
#         t0, t1 = b1, b0
#     else:
#         t0, t1 = b0, b1
#
#     if all([
#         -60 <= t0,
#         t0 <= t,
#         t <= t1,
#         t1 <= 120,
#         not (t0 == -1 and t1 <= 0)
#     ]):
#         return t0, t1
#     return None, None
#
#
# def parse_temp(raw):
#     byte = list()
#     word = list()
#     for i in range(0, 6):
#         byte.append(0xFF & (raw >> (i * 8)))
#     for i in range(0, 3):
#         word.append(0xFFFF & (raw >> (i * 16)))
#
#     ctwd = check_temp_word(word[0])
#
#     if not word[2]:
#         if ctwd and not word[1]:
#             # byte[0] is temp, no other data
#             return byte[0], None, None, None
#
#         if ctwd and all(check_temp_range(byte[0], byte[2], byte[3])):
#             # byte[0] is temp, byte[2] is max or min, byte[3] is min or max
#             trange = check_temp_range(byte[0], byte[2], byte[3])
#             return byte[0], trange[0],
#                 trange[1], None
#
#         if ctwd and all(check_temp_range(byte[0], byte[1], byte[2])):
#             # byte[0] is temp, byte[1] is max or min, byte[2] is min or max
#             trange = check_temp_range(byte[0], byte[1], byte[2])
#             return byte[0], trange[0], trange[1], None
#
#         return None, None, None, None
#
#     if ctwd:
#         if all(
#                 [
#                     ctwd & check_temp_word(word[1]) & check_temp_word(word[2]) != 0x00,
#                     all(check_temp_range(byte[0], byte[2], byte[4])),
#                 ]
#         ):
#             # byte[0] is temp, byte[2] is max or min, byte[4] is min or max
#             trange = check_temp_range(byte[0], byte[2], byte[4])
#             return byte[0], trange[0], trange[1], None
#         else:
#             trange = check_temp_range(byte[0], byte[2], byte[3])
#             if word[2] < 0x7FFF and all(trange) and trange[1] >= 40:
#                 # byte[0] is temp, byte[2] is max or min, byte[3] is min or max, word[2] is overtemp count
#                 return byte[0], trange[0], trange[1], word[2]
#     # no data
#     return None, None, None, None


# Reverse index of CHARTS: attribute name -> id of the chart that draws it.
CHARTED_ATTRS = dict((attr, k) for k, v in CHARTS.items() for attr in v['attrs'])


class BaseAtaSmartAttribute:
    """One ATA attribute parsed from a log line: (name, normalized value, raw value).

    All three fields are stored exactly as passed in (strings when they come
    from RE_ATA). Subclasses decide which number is reported by value().
    """

    def __init__(self, name, normalized_value, raw_value):
        self.name = name
        self.normalized_value = normalized_value
        self.raw_value = raw_value

    def value(self):
        """Return the value to report for this attribute."""
        raise NotImplementedError


class AtaRaw(BaseAtaSmartAttribute):
    """ATA attribute reported by its raw value, unchanged."""

    def value(self):
        return self.raw_value


class AtaNormalized(BaseAtaSmartAttribute):
    """ATA attribute reported by its normalized value, unchanged."""

    def value(self):
        return self.normalized_value


class Ata3(BaseAtaSmartAttribute):
    """Attribute 3: the raw value may pack several numbers into one integer."""

    def value(self):
        value = int(self.raw_value)
        # https://github.com/netdata/netdata/issues/5919
        #
        # 3;151;38684000679;
        # 423 (Average 447)
        # 38684000679 & 0xFFF -> 423
        # (38684000679 & 0xFFF0000) >> 16 -> 447
        if value > 1e6:
            return value & 0xFFF
        return value


class Ata9(BaseAtaSmartAttribute):
    """Attribute 9: implausibly large raw values are reduced to their low 16 bits."""

    def value(self):
        value = int(self.raw_value)
        if value > 1e6:
            return value & 0xFFFF
        return value


class Ata190(BaseAtaSmartAttribute):
    """Attribute 190: reported as 100 minus the normalized value."""

    def value(self):
        return 100 - int(self.normalized_value)


class Ata194(BaseAtaSmartAttribute):
    """Attribute 194 (temperature)."""

    # https://github.com/netdata/netdata/issues/3041
    # https://github.com/netdata/netdata/issues/5919
    #
    # The low byte is the current temperature, the third lowest is the maximum, and the fifth lowest is the minimum
    def value(self):
        value = int(self.raw_value)
        if value > 1e6:
            return value & 0xFF
        return min(int(self.normalized_value), int(self.raw_value))


class BaseSCSISmartAttribute:
    """One SCSI attribute parsed from a log line: (name, raw value)."""

    def __init__(self, name, raw_value):
        self.name = name
        self.raw_value = raw_value

    def value(self):
        """Return the value to report for this attribute."""
        raise NotImplementedError


class SCSIRaw(BaseSCSISmartAttribute):
    """SCSI attribute reported by its raw value, unchanged."""

    def value(self):
        return self.raw_value


def ata_attribute_factory(value):
    """Build the ATA attribute object for a (name, normalized, raw) tuple.

    The attribute id in value[0] selects the class; ids without special
    handling are reported by their raw value.
    """
    name = value[0]

    if name == ATTR3:
        return Ata3(*value)
    elif name == ATTR9:
        return Ata9(*value)
    elif name == ATTR190:
        return Ata190(*value)
    elif name == ATTR194:
        return Ata194(*value)
    elif name in [
        ATTR1,
        ATTR7,
        ATTR177,
        ATTR202,
        ATTR206,
        ATTR233,
    ]:
        return AtaNormalized(*value)

    return AtaRaw(*value)


def scsi_attribute_factory(value):
    """Build the SCSI attribute object for a (name, raw) tuple."""
    return SCSIRaw(*value)


def attribute_factory(value):
    """Dispatch on the attribute name: all-digit names are ATA, the rest SCSI."""
    name = value[0]
    if name.isdigit():
        return ata_attribute_factory(value)
    return scsi_attribute_factory(value)


def handle_error(*errors):
    """Decorator factory: make the wrapped callable return None on `errors`.

    Any exception that is an instance of one of `errors` is swallowed and
    None is returned instead; every other exception propagates. Callers use
    the None result as an "operation failed" signal.
    """
    # Local import so the block does not depend on the module import section.
    from functools import wraps

    def on_method(method):
        @wraps(method)  # keep the wrapped method's name and docstring
        def on_call(*args, **kwargs):
            try:
                return method(*args, **kwargs)
            except errors:
                return None

        return on_call

    return on_method


class DiskLogFile:
    """A log file on disk together with the size seen at the last read.

    Every method returns None (via handle_error) when the underlying file
    system call raises OSError.
    """

    def __init__(self, full_path):
        self.path = full_path
        self.size = os.path.getsize(full_path)

    @handle_error(OSError)
    def is_changed(self):
        """Return True when the current file size differs from the remembered one."""
        return self.size != os.path.getsize(self.path)

    @handle_error(OSError)
    def is_active(self, current_time, limit):
        """Return True when the file was modified less than `limit` minutes before `current_time`."""
        return (current_time - os.path.getmtime(self.path)) / 60 < limit

    @handle_error(OSError)
    def read(self):
        """Remember the current size and return the result of read_last_line() for the file."""
        self.size = os.path.getsize(self.path)
        return read_last_line(self.path)


class BaseDisk:
    """A monitored disk: its name, its log file and the attributes last parsed.

    Disks compare equal by `raw_name`, both to other disks and to plain
    strings, which is what lets the service test `name in self.disks`.
    """

    def __init__(self, name, log_file):
        self.raw_name = name
        # runs of underscores are collapsed for the name used in dimension ids
        self.name = re.sub(r'_+', '_', name)
        self.log_file = log_file
        self.attrs = list()
        self.alive = True
        self.charted = False

    def __eq__(self, other):
        if isinstance(other, BaseDisk):
            return self.raw_name == other.raw_name
        return self.raw_name == other

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Must agree with __eq__: objects that compare equal (same raw_name,
        # or the raw_name string itself) have to hash equal as well.
        return hash(self.raw_name)

    def parser(self, data):
        """Return the list of attribute tuples found in the log line `data`."""
        raise NotImplementedError

    @handle_error(TypeError)
    def populate_attrs(self):
        """Re-read the log file and rebuild `self.attrs`.

        Returns the number of attributes parsed, or None when parsing raised
        TypeError (e.g. the log file read itself failed and yielded None).
        """
        self.attrs = list()
        line = self.log_file.read()
        for value in self.parser(line):
            self.attrs.append(attribute_factory(value))

        return len(self.attrs)

    def data(self):
        """Return {'<disk name>_<attribute name>': value} for every parsed attribute."""
        data = dict()
        for attr in self.attrs:
            data['{0}_{1}'.format(self.name, attr.name)] = attr.value()
        return data


class ATADisk(BaseDisk):
    """Disk whose log lines are parsed with RE_ATA."""

    def parser(self, data):
        return RE_ATA.findall(data)


class SCSIDisk(BaseDisk):
    """Disk whose log lines are parsed with RE_SCSI."""

    def parser(self, data):
        return RE_SCSI.findall(data)


class Service(SimpleService):
    """Collector that charts attributes read from per-disk csv logs in `log_path`."""

    def __init__(self, configuration=None, name=None):
        SimpleService.__init__(self, configuration=configuration, name=name)
        # `configuration` defaults to None; fall back to an empty mapping so
        # the option lookups below use their defaults instead of crashing.
        conf = configuration or dict()
        self.order = ORDER
        self.definitions = deepcopy(CHARTS)
        self.log_path = conf.get('log_path', DEF_PATH)
        self.age = conf.get('age', DEF_AGE)
        self.exclude = conf.get('exclude_disks', str()).split()
        self.disks = list()
        self.runs = 0
        self.do_force_rescan = False

    def check(self):
        """Return True when the initial scan found at least one usable disk log."""
        return self.scan() > 0

    def get_data(self):
        """Collect the current attribute values of all live disks.

        Every DEF_RESCAN_INTERVAL calls, or right after a disk failed, stale
        disks are dropped and the log directory is scanned again. A disk
        whose log cannot be checked or re-parsed is marked dead and a rescan
        is scheduled for the next call.
        """
        self.runs += 1

        if self.do_force_rescan or self.runs % DEF_RESCAN_INTERVAL == 0:
            self.cleanup()
            self.scan()
            self.do_force_rescan = False

        data = dict()

        for disk in self.disks:
            if not disk.alive:
                continue

            if not disk.charted:
                self.add_disk_to_charts(disk)

            changed = disk.log_file.is_changed()

            # None means the size check itself failed (OSError was swallowed)
            if changed is None:
                disk.alive = False
                self.do_force_rescan = True
                continue

            # only re-parse when the file changed; None means parsing failed
            if changed and disk.populate_attrs() is None:
                disk.alive = False
                self.do_force_rescan = True
                continue

            data.update(disk.data())

        return data

    def cleanup(self):
        """Drop disks that are dead or whose log is older than `self.age` minutes."""
        current_time = time()
        # iterate over a copy because disks are removed while looping
        for disk in self.disks[:]:
            if not disk.alive or not disk.log_file.is_active(current_time, self.age):
                self.disks.remove(disk)
                self.remove_disk_from_charts(disk)

    def scan(self):
        """Add a disk for every new usable log in `log_path`; return the disk count."""
        self.debug('scanning {0}'.format(self.log_path))
        current_time = time()

        try:
            names = os.listdir(self.log_path)
        except OSError as error:
            # missing/unreadable directory: report it and keep what we have
            self.error('cannot list {0}: {1}'.format(self.log_path, error))
            return len(self.disks)

        for full_name in names:
            disk = self.create_disk_from_file(full_name, current_time)
            if not disk:
                continue
            self.disks.append(disk)

        return len(self.disks)

    def create_disk_from_file(self, full_name, current_time):
        """Return a disk object for log file `full_name`, or None when it is skipped.

        The disk name is the third dot-separated component from the end of
        the file name. Files are skipped when they are not csv, do not have
        that many components, are already tracked, match `exclude`, are
        unreadable, empty, older than `self.age` minutes, of unknown type,
        or yield no attributes.
        """
        if not full_name.endswith(CSV):
            self.debug('skipping {0}: not a csv file'.format(full_name))
            return None

        parts = os.path.basename(full_name).split('.')
        if len(parts) < 3:
            # e.g. 'x.csv': there is no name component to index
            self.debug('skipping {0}: unexpected file name format'.format(full_name))
            return None

        name = parts[-3]
        path = os.path.join(self.log_path, full_name)

        if name in self.disks:
            self.debug('skipping {0}: already in disks'.format(full_name))
            return None

        if any(p in name for p in self.exclude):
            self.debug('skipping {0}: filtered by `exclude` option'.format(full_name))
            return None

        try:
            if not os.access(path, os.R_OK):
                self.debug('skipping {0}: not readable'.format(full_name))
                return None

            if os.path.getsize(path) == 0:
                self.debug('skipping {0}: zero size'.format(full_name))
                return None

            if (current_time - os.path.getmtime(path)) / 60 > self.age:
                self.debug('skipping {0}: haven\'t been updated for last {1} minutes'.format(full_name, self.age))
                return None

            # Prefer the explicit '<type>.csv' suffix so a model name that
            # merely contains 'ata' does not turn a SCSI log into an ATA one;
            # fall back to the substring check for other file names.
            if full_name.endswith('.' + ATA + CSV):
                disk = ATADisk(name, DiskLogFile(path))
            elif full_name.endswith('.' + SCSI + CSV):
                disk = SCSIDisk(name, DiskLogFile(path))
            elif ATA in full_name:
                disk = ATADisk(name, DiskLogFile(path))
            elif SCSI in full_name:
                disk = SCSIDisk(name, DiskLogFile(path))
            else:
                self.debug('skipping {0}: unknown type'.format(full_name))
                return None
        except OSError as error:
            # the file can disappear between listdir() and the stat calls
            self.debug('skipping {0}: {1}'.format(full_name, error))
            return None

        disk.populate_attrs()
        if not disk.attrs:
            self.error('skipping {0}: parsing failed'.format(full_name))
            return None

        self.debug('added {0}'.format(full_name))
        return disk

    def add_disk_to_charts(self, disk):
        """Add (or un-hide) one dimension per charted attribute of `disk`."""
        if len(self.charts) == 0 or disk.charted:
            return
        disk.charted = True

        for attr in disk.attrs:
            chart_id = CHARTED_ATTRS.get(attr.name)

            if not chart_id or chart_id not in self.charts:
                continue

            chart = self.charts[chart_id]
            dim = [
                '{0}_{1}'.format(disk.name, attr.name),
                disk.name,
                CHARTS[chart_id]['algo'],
            ]

            if dim[0] in chart.dimensions:
                # dimension already exists: make it visible again
                chart.hide_dimension(dim[0], reverse=True)
            else:
                chart.add_dimension(dim)

    def remove_disk_from_charts(self, disk):
        """Delete the dimensions of `disk` from every chart it was added to."""
        if len(self.charts) == 0 or not disk.charted:
            return

        for attr in disk.attrs:
            chart_id = CHARTED_ATTRS.get(attr.name)

            if not chart_id or chart_id not in self.charts:
                continue

            self.charts[chart_id].del_dimension('{0}_{1}'.format(disk.name, attr.name))