Diffstat (limited to 'python.d/smartd_log.chart.py')
-rw-r--r-- | python.d/smartd_log.chart.py | 508
1 file changed, 316 insertions, 192 deletions
diff --git a/python.d/smartd_log.chart.py b/python.d/smartd_log.chart.py
index 4039c153..07ad88cd 100644
--- a/python.d/smartd_log.chart.py
+++ b/python.d/smartd_log.chart.py
@@ -2,221 +2,345 @@
 # Description: smart netdata python.d module
 # Author: l2isbad, vorph1
 
-from re import compile as r_compile
-from os import listdir, access, R_OK
-from os.path import isfile, join, getsize, basename, isdir
-try:
-    from queue import Queue
-except ImportError:
-    from Queue import Queue
-from threading import Thread
-from base import SimpleService
+import os
+import re
+
 from collections import namedtuple
+from time import time
 
-# default module values (can be overridden per job in `config`)
-update_every = 5
-priority = 60000
+from bases.collection import read_last_line
+from bases.FrameworkServices.SimpleService import SimpleService
 
 # charts order (can be overridden if you want less charts, or different order)
 ORDER = ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200']
 
 SMART_ATTR = {
-   '1': 'Read Error Rate',
-   '2': 'Throughput Performance',
-   '3': 'Spin-Up Time',
-   '4': 'Start/Stop Count',
-   '5': 'Reallocated Sectors Count',
-   '6': 'Read Channel Margin',
-   '7': 'Seek Error Rate',
-   '8': 'Seek Time Performance',
-   '9': 'Power-On Hours Count',
-   '10': 'Spin-up Retries',
-   '11': 'Calibration Retries',
-   '12': 'Power Cycle Count',
-   '13': 'Soft Read Error Rate',
-   '100': 'Erase/Program Cycles',
-   '103': 'Translation Table Rebuild',
-   '108': 'Unknown (108)',
-   '170': 'Reserved Block Count',
-   '171': 'Program Fail Count',
-   '172': 'Erase Fail Count',
-   '173': 'Wear Leveller Worst Case Erase Count',
-   '174': 'Unexpected Power Loss',
-   '175': 'Program Fail Count',
-   '176': 'Erase Fail Count',
-   '177': 'Wear Leveling Count',
-   '178': 'Used Reserved Block Count',
-   '179': 'Used Reserved Block Count',
-   '180': 'Unused Reserved Block Count',
-   '181': 'Program Fail Count',
-   '182': 'Erase Fail Count',
-   '183': 'SATA Downshifts',
-   '184': 'End-to-End error',
-   '185': 'Head Stability',
-   '186': 'Induced Op-Vibration Detection',
-   '187': 'Reported Uncorrectable Errors',
-   '188': 'Command Timeout',
-   '189': 'High Fly Writes',
-   '190': 'Temperature',
-   '191': 'G-Sense Errors',
-   '192': 'Power-Off Retract Cycles',
-   '193': 'Load/Unload Cycles',
-   '194': 'Temperature',
-   '195': 'Hardware ECC Recovered',
-   '196': 'Reallocation Events',
-   '197': 'Current Pending Sectors',
-   '198': 'Off-line Uncorrectable',
-   '199': 'UDMA CRC Error Rate',
-   '200': 'Write Error Rate',
-   '201': 'Soft Read Errors',
-   '202': 'Data Address Mark Errors',
-   '203': 'Run Out Cancel',
-   '204': 'Soft ECC Corrections',
-   '205': 'Thermal Asperity Rate',
-   '206': 'Flying Height',
-   '207': 'Spin High Current',
-   '209': 'Offline Seek Performance',
-   '220': 'Disk Shift',
-   '221': 'G-Sense Error Rate',
-   '222': 'Loaded Hours',
-   '223': 'Load/Unload Retries',
-   '224': 'Load Friction',
-   '225': 'Load/Unload Cycles',
-   '226': 'Load-in Time',
-   '227': 'Torque Amplification Count',
-   '228': 'Power-Off Retracts',
-   '230': 'GMR Head Amplitude',
-   '231': 'Temperature',
-   '232': 'Available Reserved Space',
-   '233': 'Media Wearout Indicator',
-   '240': 'Head Flying Hours',
-   '241': 'Total LBAs Written',
-   '242': 'Total LBAs Read',
-   '250': 'Read Error Retry Rate'
+    '1': 'Read Error Rate',
+    '2': 'Throughput Performance',
+    '3': 'Spin-Up Time',
+    '4': 'Start/Stop Count',
+    '5': 'Reallocated Sectors Count',
+    '6': 'Read Channel Margin',
+    '7': 'Seek Error Rate',
+    '8': 'Seek Time Performance',
+    '9': 'Power-On Hours Count',
+    '10': 'Spin-up Retries',
+    '11': 'Calibration Retries',
+    '12': 'Power Cycle Count',
+    '13': 'Soft Read Error Rate',
+    '100': 'Erase/Program Cycles',
+    '103': 'Translation Table Rebuild',
+    '108': 'Unknown (108)',
+    '170': 'Reserved Block Count',
+    '171': 'Program Fail Count',
+    '172': 'Erase Fail Count',
+    '173': 'Wear Leveller Worst Case Erase Count',
+    '174': 'Unexpected Power Loss',
+    '175': 'Program Fail Count',
+    '176': 'Erase Fail Count',
+    '177': 'Wear Leveling Count',
+    '178': 'Used Reserved Block Count',
+    '179': 'Used Reserved Block Count',
+    '180': 'Unused Reserved Block Count',
+    '181': 'Program Fail Count',
+    '182': 'Erase Fail Count',
+    '183': 'SATA Downshifts',
+    '184': 'End-to-End error',
+    '185': 'Head Stability',
+    '186': 'Induced Op-Vibration Detection',
+    '187': 'Reported Uncorrectable Errors',
+    '188': 'Command Timeout',
+    '189': 'High Fly Writes',
+    '190': 'Temperature',
+    '191': 'G-Sense Errors',
+    '192': 'Power-Off Retract Cycles',
+    '193': 'Load/Unload Cycles',
+    '194': 'Temperature',
+    '195': 'Hardware ECC Recovered',
+    '196': 'Reallocation Events',
+    '197': 'Current Pending Sectors',
+    '198': 'Off-line Uncorrectable',
+    '199': 'UDMA CRC Error Rate',
+    '200': 'Write Error Rate',
+    '201': 'Soft Read Errors',
+    '202': 'Data Address Mark Errors',
+    '203': 'Run Out Cancel',
+    '204': 'Soft ECC Corrections',
+    '205': 'Thermal Asperity Rate',
+    '206': 'Flying Height',
+    '207': 'Spin High Current',
+    '209': 'Offline Seek Performance',
+    '220': 'Disk Shift',
+    '221': 'G-Sense Error Rate',
+    '222': 'Loaded Hours',
+    '223': 'Load/Unload Retries',
+    '224': 'Load Friction',
+    '225': 'Load/Unload Cycles',
+    '226': 'Load-in Time',
+    '227': 'Torque Amplification Count',
+    '228': 'Power-Off Retracts',
+    '230': 'GMR Head Amplitude',
+    '231': 'Temperature',
+    '232': 'Available Reserved Space',
+    '233': 'Media Wearout Indicator',
+    '240': 'Head Flying Hours',
+    '241': 'Total LBAs Written',
+    '242': 'Total LBAs Read',
+    '250': 'Read Error Retry Rate'
+}
+
+LIMIT = namedtuple('LIMIT', ['min', 'max'])
+
+LIMITS = {
+    '194': LIMIT(0, 200)
 }
 
-NAMED_DISKS = namedtuple('disks', ['name', 'size', 'number'])
+RESCAN_INTERVAL = 60
+
+REGEX = re.compile(
+    '(\d+);'  # attribute
+    '(\d+);'  # normalized value
+    '(\d+)',  # raw value
+    re.X
+)
+
+
+def chart_template(chart_name):
+    units, attr_id = chart_name.split('_')[-2:]
+    title = '{value_type} {description}'.format(value_type=units.capitalize(),
+                                                description=SMART_ATTR[attr_id])
+    family = SMART_ATTR[attr_id].lower()
+
+    return {
+        chart_name: {
+            'options': [None, title, units, family, 'smartd_log.' + chart_name, 'line'],
+            'lines': []
+        }
+    }
+
+
+def handle_os_error(method):
+    def on_call(*args):
+        try:
+            return method(*args)
+        except OSError:
+            return None
+    return on_call
+
+
+class SmartAttribute(object):
+    def __init__(self, idx, normalized, raw):
+        self.id = idx
+        self.normalized = normalized
+        self._raw = raw
+
+    @property
+    def raw(self):
+        if self.id in LIMITS:
+            limit = LIMITS[self.id]
+            if limit.min <= int(self._raw) <= limit.max:
+                return self._raw
+            return None
+        return self._raw
+
+    @raw.setter
+    def raw(self, value):
+        self._raw = value
+
+
+class DiskLogFile:
+    def __init__(self, path):
+        self.path = path
+        self.size = os.path.getsize(path)
+
+    @handle_os_error
+    def is_changed(self):
+        new_size = os.path.getsize(self.path)
+        old_size, self.size = self.size, new_size
+
+        return new_size != old_size and new_size
+
+    @staticmethod
+    @handle_os_error
+    def is_valid(log_file, exclude):
+        return all([log_file.endswith('.csv'),
+                    not [p for p in exclude if p in log_file],
+                    os.access(log_file, os.R_OK),
+                    os.path.getsize(log_file)])
+
+
+class Disk:
+    def __init__(self, full_path, age):
+        self.log_file = DiskLogFile(full_path)
+        self.name = os.path.basename(full_path).split('.')[-3]
+        self.age = int(age)
+        self.status = True
+        self.attributes = dict()
+
+        self.get_attributes()
+
+    def __eq__(self, other):
+        if isinstance(other, Disk):
+            return self.name == other.name
+        return self.name == other
+
+    @handle_os_error
+    def is_active(self):
+        return (time() - os.path.getmtime(self.log_file.path)) / 60 < self.age
+
+    @handle_os_error
+    def get_attributes(self):
+        last_line = read_last_line(self.log_file.path)
+        self.attributes = dict((attr, SmartAttribute(attr, normalized, raw)) for attr, normalized, raw
+                               in REGEX.findall(last_line))
+        return True
+
+    def data(self):
+        data = dict()
+        for attr in self.attributes.values():
+            data['_'.join([self.name, 'normalized', attr.id])] = attr.normalized
+            if attr.raw is not None:
+                data['_'.join([self.name, 'raw', attr.id])] = attr.raw
+        return data
 
 
 class Service(SimpleService):
     def __init__(self, configuration=None, name=None):
         SimpleService.__init__(self, configuration=configuration, name=name)
-        self.regex = r_compile(r'(\d+);(\d+);(\d+)')
         self.log_path = self.configuration.get('log_path', '/var/log/smartd')
-        self.raw_values = self.configuration.get('raw_values')
-        self.attr = self.configuration.get('smart_attributes', [])
-        self.previous_data = dict()
+        self.raw = self.configuration.get('raw_values', True)
+        self.exclude = self.configuration.get('exclude_disks', str()).split()
+        self.age = self.configuration.get('age', 30)
+
+        self.runs = 0
+        self.disks = list()
+        self.order = list()
+        self.definitions = dict()
 
     def check(self):
-        # Can\'t start without smartd readable diks log files
-        disks = find_disks_in_log_path(self.log_path)
-        if not disks:
-            self.error('Can\'t locate any smartd log files in %s' % self.log_path)
-            return False
-
-        # List of namedtuples to track smartd log file size
-        self.disks = [NAMED_DISKS(name=disks[i], size=0, number=i) for i in range(len(disks))]
-
-        if self._get_data():
-            self.create_charts()
-            return True
-        else:
-            self.error('Can\'t collect any data. Sorry.')
-            return False
-
-    def _get_raw_data(self, queue, disk):
-        # The idea is to open a file.
-        # Jump to the end.
-        # Seek backward until '\n' symbol appears
-        # If '\n' is found or it's the beginning of the file
-        # readline()! (last or first line)
-        with open(disk, 'rb') as f:
-            f.seek(-2, 2)
-            while f.read(1) != b'\n':
-                f.seek(-2, 1)
-                if f.tell() == 0:
-                    break
-            result = f.readline()
-
-        result = result.decode()
-        result = self.regex.findall(result)
-
-        queue.put([basename(disk), result])
-
-    def _get_data(self):
-        threads, result = list(), list()
-        queue = Queue()
-        to_netdata = dict()
-
-        # If the size has not changed there is no reason to poll log files.
-        disks = [disk for disk in self.disks if self.size_changed(disk)]
-        if disks:
-            for disk in disks:
-                th = Thread(target=self._get_raw_data, args=(queue, disk.name))
-                th.start()
-                threads.append(th)
-
-            for thread in threads:
-                thread.join()
-                result.append(queue.get())
+        self.disks = self.scan()
+
+        if not self.disks:
+            return None
+
+        user_defined_sa = self.configuration.get('smart_attributes')
+
+        if user_defined_sa:
+            order = user_defined_sa.split() or ORDER
         else:
-            # Data from last real poll
-            return self.previous_data or None
+            order = ORDER
 
-        for elem in result:
-            for a, n, r in elem[1]:
-                to_netdata.update({'_'.join([elem[0], a]): r if self.raw_values else n})
+        self.create_charts(order)
 
-        self.previous_data.update(to_netdata)
+        return True
 
-        return to_netdata or None
+    def get_data(self):
+        self.runs += 1
 
-    def size_changed(self, disk):
-        # We are not interested in log files:
-        # 1. zero size
-        # 2. size is not changed since last poll
-        try:
-            size = getsize(disk.name)
-            if size != disk.size and size:
-                self.disks[disk.number] = disk._replace(size=size)
-                return True
-            else:
-                return False
-        except OSError:
-            # Remove unreadable/nonexisting log files from list of disks and previous_data
-            self.disks.remove(disk)
-            self.previous_data = dict([(k, v) for k, v in self.previous_data.items() if basename(disk.name) not in k])
-            return False
+        if self.runs % RESCAN_INTERVAL == 0:
+            self.cleanup_and_rescan()
+
+        data = dict()
+
+        for disk in self.disks:
+
+            if not disk.status:
+                continue
+
+            changed = disk.log_file.is_changed()
+
+            # True = changed, False = unchanged, None = Exception
+            if changed is None:
+                disk.status = False
+                continue
+
+            if changed:
+                success = disk.get_attributes()
+                if not success:
+                    disk.status = False
+                    continue
+
+            data.update(disk.data())
+
+        return data or None
 
-    def create_charts(self):
+    def create_charts(self, order):
+        for attr in order:
+            raw_name, normalized_name = 'attr_id_raw_' + attr, 'attr_id_normalized_' + attr
+            raw, normalized = chart_template(raw_name), chart_template(normalized_name)
+            self.order.extend([normalized_name, raw_name])
+            self.definitions.update(raw)
+            self.definitions.update(normalized)
 
-        def create_lines(attrid):
-            result = list()
             for disk in self.disks:
-                name = basename(disk.name)
-                result.append(['_'.join([name, attrid]), name[:name.index('.')], 'absolute'])
-            return result
+                if attr not in disk.attributes:
+                    self.debug("'{disk}' has no attribute '{attr_id}'".format(disk=disk.name,
+                                                                              attr_id=attr))
+                    continue
+                normalized[normalized_name]['lines'].append(['_'.join([disk.name, 'normalized', attr]), disk.name])
 
-        # Use configured attributes, if present. If something goes wrong we don't care.
-        order = ORDER
-        try:
-            order = [attr for attr in self.attr.split() if attr in SMART_ATTR.keys()] or ORDER
-        except Exception:
-            pass
-        self.order = [''.join(['attrid', i]) for i in order]
-        self.definitions = dict()
-        units = 'raw' if self.raw_values else 'normalized'
-
-        for k, v in dict([(k, v) for k, v in SMART_ATTR.items() if k in ORDER]).items():
-            self.definitions.update({''.join(['attrid', k]): {
-                'options': [None, v, units, v.lower(), 'smartd.attrid' + k, 'line'],
-                'lines': create_lines(k)}})
-
-def find_disks_in_log_path(log_path):
-    # smartd log file is OK if:
-    # 1. it is a file
-    # 2. file name endswith with 'csv'
-    # 3. file is readable
-    if not isdir(log_path): return None
-    return [join(log_path, f) for f in listdir(log_path)
-            if all([isfile(join(log_path, f)), f.endswith('.csv'), access(join(log_path, f), R_OK)])]
+                if not self.raw:
+                    continue
+
+                if disk.attributes[attr].raw is not None:
+                    raw[raw_name]['lines'].append(['_'.join([disk.name, 'raw', attr]), disk.name])
+                    continue
+                self.debug("'{disk}' attribute '{attr_id}' value not in {limits}".format(disk=disk.name,
+                                                                                         attr_id=attr,
+                                                                                         limits=LIMITS[attr]))
+
+    def cleanup_and_rescan(self):
+        self.cleanup()
+        new_disks = self.scan(only_new=True)
+
+        for disk in new_disks:
+            valid = False
+
+            for chart in self.charts:
+                value_type, idx = chart.id.split('_')[2:]
+
+                if idx in disk.attributes:
+                    valid = True
+                    dimension_id = '_'.join([disk.name, value_type, idx])
+
+                    if dimension_id in chart:
+                        chart.hide_dimension(dimension_id=dimension_id, reverse=True)
+                    else:
+                        chart.add_dimension([dimension_id, disk.name])
+            if valid:
+                self.disks.append(disk)
+
+    def cleanup(self):
+        for disk in self.disks:
+
+            if not disk.is_active():
+                disk.status = False
+            if not disk.status:
+                for chart in self.charts:
+                    dimension_id = '_'.join([disk.name, chart.id[8:]])
+                    chart.hide_dimension(dimension_id=dimension_id)
+
+        self.disks = [disk for disk in self.disks if disk.status]
+
+    def scan(self, only_new=None):
+        new_disks = list()
+        for f in os.listdir(self.log_path):
+            full_path = os.path.join(self.log_path, f)
+
+            if DiskLogFile.is_valid(full_path, self.exclude):
+                disk = Disk(full_path, self.age)
+
+                active = disk.is_active()
+                if active is None:
+                    continue
+                if active:
+                    if not only_new:
+                        new_disks.append(disk)
+                    else:
+                        if disk not in self.disks:
+                            new_disks.append(disk)
+                else:
+                    if not only_new:
+                        self.debug("'{disk}' not updated in the last {age} minutes, "
+                                   "skipping it.".format(disk=disk.name, age=self.age))
+        return new_disks
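The new parsing path is compact enough to exercise on its own. The sketch below mirrors what Disk.get_attributes() does with the last attrlog line: REGEX.findall() yields (id, normalized, raw) triples, and SmartAttribute's raw property applies the LIMITS sanity check. The sample log line is invented for illustration (real smartd attrlogs hold a timestamp followed by 'id;normalized;raw;' groups), and the one-line regex here is a compact equivalent of the module's re.X version.

import re
from collections import namedtuple

LIMIT = namedtuple('LIMIT', ['min', 'max'])
LIMITS = {'194': LIMIT(0, 200)}

# Compact equivalent of the module's verbose re.X pattern
REGEX = re.compile(r'(\d+);(\d+);(\d+)')


class SmartAttribute(object):
    def __init__(self, idx, normalized, raw):
        self.id = idx
        self.normalized = normalized
        self._raw = raw

    @property
    def raw(self):
        # Report the raw value only if it passes the per-attribute limit check
        if self.id in LIMITS:
            limit = LIMITS[self.id]
            if limit.min <= int(self._raw) <= limit.max:
                return self._raw
            return None
        return self._raw


# Hypothetical last line of a smartd attrlog CSV
last_line = '2017-08-01 10:00:00;\t194;115;28;\t197;100;0;'

attributes = dict((attr, SmartAttribute(attr, normalized, raw))
                  for attr, normalized, raw in REGEX.findall(last_line))

print(attributes['194'].normalized)  # '115'
print(attributes['194'].raw)         # '28' -- inside LIMIT(0, 200), so it is kept
print(attributes['197'].raw)         # '0'  -- no limit defined for attribute 197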
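Chart ids follow a fixed naming scheme, 'attr_id_<units>_<attribute id>', and chart_template() depends on it: split('_')[-2:] recovers the units and the attribute id, while SMART_ATTR supplies the title and family. A standalone run, with SMART_ATTR trimmed to a single entry just so the snippet executes:

SMART_ATTR = {'194': 'Temperature'}


def chart_template(chart_name):
    # 'attr_id_raw_194' -> units='raw', attr_id='194'
    units, attr_id = chart_name.split('_')[-2:]
    title = '{value_type} {description}'.format(value_type=units.capitalize(),
                                                description=SMART_ATTR[attr_id])
    family = SMART_ATTR[attr_id].lower()
    return {
        chart_name: {
            'options': [None, title, units, family, 'smartd_log.' + chart_name, 'line'],
            'lines': []
        }
    }


print(chart_template('attr_id_raw_194'))
# {'attr_id_raw_194': {'options': [None, 'Raw Temperature', 'raw', 'temperature',
#                                  'smartd_log.attr_id_raw_194', 'line'], 'lines': []}}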
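A detail that is easy to miss: DiskLogFile.is_changed() is three-valued. It returns the (truthy) new size when the file size changed, a falsy value when it did not, and None when @handle_os_error swallowed an OSError -- which is why get_data() tests 'changed is None' before retiring a disk. A quick demonstration with a throwaway temp file (file name and contents invented for the demo):

import os
import tempfile


def handle_os_error(method):
    def on_call(*args):
        try:
            return method(*args)
        except OSError:
            return None
    return on_call


class DiskLogFile:
    def __init__(self, path):
        self.path = path
        self.size = os.path.getsize(path)

    @handle_os_error
    def is_changed(self):
        new_size = os.path.getsize(self.path)
        old_size, self.size = self.size, new_size
        # Truthy new size on change; note a change *to* size 0 also reads as falsy
        return new_size != old_size and new_size


with tempfile.NamedTemporaryFile('w', suffix='.csv', delete=False) as f:
    f.write('194;115;28;\n')

log = DiskLogFile(f.name)
print(log.is_changed())        # False: size unchanged since construction

with open(f.name, 'a') as tail:
    tail.write('194;115;29;\n')
print(bool(log.is_changed()))  # True: the appended line grew the file

os.remove(f.name)
print(log.is_changed())        # None: OSError (file gone) swallowed by the decorator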