diff options
author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2018-11-07 12:19:29 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2018-11-07 12:20:17 +0000 |
commit | a64a253794ac64cb40befee54db53bde17dd0d49 (patch) | |
tree | c1024acc5f6e508814b944d99f112259bb28b1be /collectors/python.d.plugin/smartd_log | |
parent | New upstream version 1.10.0+dfsg (diff) | |
download | netdata-a64a253794ac64cb40befee54db53bde17dd0d49.tar.xz netdata-a64a253794ac64cb40befee54db53bde17dd0d49.zip |
New upstream version 1.11.0+dfsgupstream/1.11.0+dfsg
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'collectors/python.d.plugin/smartd_log')
-rw-r--r-- | collectors/python.d.plugin/smartd_log/Makefile.inc | 13 | ||||
-rw-r--r-- | collectors/python.d.plugin/smartd_log/README.md | 38 | ||||
-rw-r--r-- | collectors/python.d.plugin/smartd_log/smartd_log.chart.py | 353 | ||||
-rw-r--r-- | collectors/python.d.plugin/smartd_log/smartd_log.conf | 90 |
4 files changed, 494 insertions, 0 deletions
diff --git a/collectors/python.d.plugin/smartd_log/Makefile.inc b/collectors/python.d.plugin/smartd_log/Makefile.inc new file mode 100644 index 000000000..dc1d0f3fb --- /dev/null +++ b/collectors/python.d.plugin/smartd_log/Makefile.inc @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: GPL-3.0-or-later + +# THIS IS NOT A COMPLETE Makefile +# IT IS INCLUDED BY ITS PARENT'S Makefile.am +# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT + +# install these files +dist_python_DATA += smartd_log/smartd_log.chart.py +dist_pythonconfig_DATA += smartd_log/smartd_log.conf + +# do not install these files, but include them in the distribution +dist_noinst_DATA += smartd_log/README.md smartd_log/Makefile.inc + diff --git a/collectors/python.d.plugin/smartd_log/README.md b/collectors/python.d.plugin/smartd_log/README.md new file mode 100644 index 000000000..121a63573 --- /dev/null +++ b/collectors/python.d.plugin/smartd_log/README.md @@ -0,0 +1,38 @@ +# smartd_log + +This module monitors `smartd` log files to collect HDD/SSD S.M.A.R.T. attributes. + +It produces the following charts (you can add additional attributes in the module configuration file): + +1. **Read Error Rate** attribute 1 + +2. **Start/Stop Count** attribute 4 + +3. **Reallocated Sectors Count** attribute 5 + +4. **Seek Error Rate** attribute 7 + +5. **Power-On Hours Count** attribute 9 + +6. **Power Cycle Count** attribute 12 + +7. **Load/Unload Cycles** attribute 193 + +8. **Temperature** attribute 194 + +9. **Current Pending Sectors** attribute 197 + +10. **Off-Line Uncorrectable** attribute 198 + +11. **Write Error Rate** attribute 200 + +### configuration + +```yaml +local: + log_path : '/var/log/smartd/' +``` + +If no configuration is given, the module will attempt to read log files in the /var/log/smartd/ directory. 
# patch header (kept from the original page): + +--- diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.chart.py b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py new file mode 100644 index 000000000..21dbccecc --- /dev/null +++ b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py @@ -0,0 +1,353 @@
# -*- coding: utf-8 -*-
# Description: smart netdata python.d module
# Author: l2isbad, vorph1
# SPDX-License-Identifier: GPL-3.0-or-later

import os
import re

from collections import namedtuple
from functools import wraps
from time import time

from bases.collection import read_last_line
from bases.FrameworkServices.SimpleService import SimpleService

# charts order (can be overridden if you want less charts, or different order)
ORDER = ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200']

# S.M.A.R.T. attribute id -> human readable description (used for chart titles/families)
SMART_ATTR = {
    '1': 'Read Error Rate',
    '2': 'Throughput Performance',
    '3': 'Spin-Up Time',
    '4': 'Start/Stop Count',
    '5': 'Reallocated Sectors Count',
    '6': 'Read Channel Margin',
    '7': 'Seek Error Rate',
    '8': 'Seek Time Performance',
    '9': 'Power-On Hours Count',
    '10': 'Spin-up Retries',
    '11': 'Calibration Retries',
    '12': 'Power Cycle Count',
    '13': 'Soft Read Error Rate',
    '100': 'Erase/Program Cycles',
    '103': 'Translation Table Rebuild',
    '108': 'Unknown (108)',
    '170': 'Reserved Block Count',
    '171': 'Program Fail Count',
    '172': 'Erase Fail Count',
    '173': 'Wear Leveller Worst Case Erase Count',
    '174': 'Unexpected Power Loss',
    '175': 'Program Fail Count',
    '176': 'Erase Fail Count',
    '177': 'Wear Leveling Count',
    '178': 'Used Reserved Block Count',
    '179': 'Used Reserved Block Count',
    '180': 'Unused Reserved Block Count',
    '181': 'Program Fail Count',
    '182': 'Erase Fail Count',
    '183': 'SATA Downshifts',
    '184': 'End-to-End error',
    '185': 'Head Stability',
    '186': 'Induced Op-Vibration Detection',
    '187': 'Reported Uncorrectable Errors',
    '188': 'Command Timeout',
    '189': 'High Fly Writes',
    '190': 'Temperature',
    '191': 'G-Sense Errors',
    '192': 'Power-Off Retract Cycles',
    '193': 'Load/Unload Cycles',
    '194': 'Temperature',
    '195': 'Hardware ECC Recovered',
    '196': 'Reallocation Events',
    '197': 'Current Pending Sectors',
    '198': 'Off-line Uncorrectable',
    '199': 'UDMA CRC Error Rate',
    '200': 'Write Error Rate',
    '201': 'Soft Read Errors',
    '202': 'Data Address Mark Errors',
    '203': 'Run Out Cancel',
    '204': 'Soft ECC Corrections',
    '205': 'Thermal Asperity Rate',
    '206': 'Flying Height',
    '207': 'Spin High Current',
    '209': 'Offline Seek Performance',
    '220': 'Disk Shift',
    '221': 'G-Sense Error Rate',
    '222': 'Loaded Hours',
    '223': 'Load/Unload Retries',
    '224': 'Load Friction',
    '225': 'Load/Unload Cycles',
    '226': 'Load-in Time',
    '227': 'Torque Amplification Count',
    '228': 'Power-Off Retracts',
    '230': 'GMR Head Amplitude',
    '231': 'Temperature',
    '232': 'Available Reserved Space',
    '233': 'Media Wearout Indicator',
    '240': 'Head Flying Hours',
    '241': 'Total LBAs Written',
    '242': 'Total LBAs Read',
    '250': 'Read Error Retry Rate'
}

LIMIT = namedtuple('LIMIT', ['min', 'max'])

# attribute id -> accepted range of the raw value; raw values outside it are dropped
LIMITS = {
    '194': LIMIT(0, 200)
}

# disks are re-scanned every RESCAN_INTERVAL calls of Service.get_data()
RESCAN_INTERVAL = 60

# One 'id;normalized;raw' triple of a smartd attribute log line.
# Raw strings: '\d' in a plain string literal is an invalid escape sequence.
REGEX = re.compile(
    r'(\d+);'  # attribute
    r'(\d+);'  # normalized value
    r'(\d+)',  # raw value
    re.X
)


def chart_template(chart_name):
    """Build the chart definition for one 'attr_id_<raw|normalized>_<id>' chart.

    :param chart_name: chart id; its last two '_'-separated fields are the
        value type ('raw' or 'normalized') and the S.M.A.R.T. attribute id.
    :return: dict mapping chart_name to its 'options' list and an empty 'lines' list.
    """
    units, attr_id = chart_name.split('_')[-2:]
    # Users may configure attribute ids that are missing from SMART_ATTR;
    # fall back to a generic description instead of raising KeyError.
    description = SMART_ATTR.get(attr_id, 'Unknown ({0})'.format(attr_id))
    title = '{value_type} {description}'.format(value_type=units.capitalize(),
                                                description=description)
    family = description.lower()

    return {
        chart_name: {
            'options': [None, title, units, family, 'smartd_log.' + chart_name, 'line'],
            'lines': []
        }
    }


def handle_os_error(method):
    """Decorator: return None instead of propagating OSError from *method*."""
    @wraps(method)
    def on_call(*args, **kwargs):
        try:
            return method(*args, **kwargs)
        except OSError:
            return None
    return on_call


class SmartAttribute(object):
    """A single S.M.A.R.T. attribute (id, normalized value, raw value) as parsed strings."""

    def __init__(self, idx, normalized, raw):
        self.id = idx
        self.normalized = normalized
        self._raw = raw

    @property
    def raw(self):
        """Raw value, or None when the attribute has a LIMITS entry and the value is outside it."""
        if self.id in LIMITS:
            limit = LIMITS[self.id]
            if limit.min <= int(self._raw) <= limit.max:
                return self._raw
            return None
        return self._raw

    @raw.setter
    def raw(self, value):
        self._raw = value


class DiskLogFile(object):
    """Tracks the size of one smartd attribute log file.

    The constructor raises OSError if the file size cannot be read.
    """

    def __init__(self, path):
        self.path = path
        self.size = os.path.getsize(path)

    @handle_os_error
    def is_changed(self):
        """Report whether the file size changed since the previous check.

        :return: a truthy value (the new size) if the size changed and is
            non-zero, a falsy value if not, None if the size could not be read.
        """
        new_size = os.path.getsize(self.path)
        old_size, self.size = self.size, new_size

        return new_size != old_size and new_size

    @staticmethod
    @handle_os_error
    def is_valid(log_file, exclude):
        """Check that *log_file* is a readable, non-empty, non-excluded '.csv' file.

        :param log_file: full path of the candidate file.
        :param exclude: list of patterns; a file whose name contains any of them is rejected.
        :return: True/False, or None if the file could not be inspected (OSError).
        """
        # Match exclude patterns against the file name only, so that a pattern
        # cannot accidentally match a directory component of the log path.
        file_name = os.path.basename(log_file)
        return all([log_file.endswith('.csv'),
                    not [p for p in exclude if p in file_name],
                    os.access(log_file, os.R_OK),
                    os.path.getsize(log_file)])


class Disk(object):
    """One disk, identified by the name derived from its smartd log file name."""

    def __init__(self, full_path, age):
        """
        :param full_path: full path of the disk's smartd csv log.
        :param age: maximum log file age in minutes for the disk to count as active.
        :raises OSError: if the log file size cannot be read.
        """
        self.log_file = DiskLogFile(full_path)
        self.name = self._name_from_path(full_path)
        self.age = int(age)
        self.status = True
        self.attributes = dict()

        self.get_attributes()

    @staticmethod
    def _name_from_path(full_path):
        """Derive the disk name from a log file name such as '<disk>.ata.csv'."""
        parts = os.path.basename(full_path).split('.')
        # Names with fewer than three dot-separated parts (e.g. 'sda.csv')
        # have no third-from-last field; use the first part instead of failing.
        if len(parts) >= 3:
            return parts[-3]
        return parts[0]

    def __eq__(self, other):
        if isinstance(other, Disk):
            return self.name == other.name
        return self.name == other

    def __ne__(self, other):
        return not self == other

    def __hash__(self):
        # Must agree with __eq__, which compares by name.
        return hash(self.name)

    @handle_os_error
    def is_active(self):
        """True if the log was modified less than `age` minutes ago; None on OSError."""
        return (time() - os.path.getmtime(self.log_file.path)) / 60 < self.age

    @handle_os_error
    def get_attributes(self):
        """Re-parse the attributes from the last line of the log.

        :return: True on success, None if the file could not be read (OSError).
        """
        last_line = read_last_line(self.log_file.path)
        self.attributes = dict((attr, SmartAttribute(attr, normalized, raw)) for attr, normalized, raw
                               in REGEX.findall(last_line))
        return True

    def data(self):
        """Return '<disk>_<normalized|raw>_<id>' -> value for every parsed attribute."""
        data = dict()
        for attr in self.attributes.values():
            data['_'.join([self.name, 'normalized', attr.id])] = attr.normalized
            if attr.raw is not None:
                data['_'.join([self.name, 'raw', attr.id])] = attr.raw
        return data


class Service(SimpleService):
    """netdata python.d service charting S.M.A.R.T. attributes from smartd csv logs."""

    def __init__(self, configuration=None, name=None):
        SimpleService.__init__(self, configuration=configuration, name=name)
        self.log_path = self.configuration.get('log_path', '/var/log/smartd')
        self.raw = self.configuration.get('raw_values', True)
        self.exclude = self.configuration.get('exclude_disks', str()).split()
        self.age = self.configuration.get('age', 30)

        self.runs = 0
        self.disks = list()
        self.order = list()
        self.definitions = dict()

    def check(self):
        """Scan for disks and create the charts.

        :return: True if at least one active disk was found, None otherwise.
        """
        self.disks = self.scan()

        if not self.disks:
            return None

        user_defined_sa = self.configuration.get('smart_attributes')

        order = ORDER
        if user_defined_sa:
            # str(): a single id written without quotes in YAML is parsed as an int
            order = str(user_defined_sa).split() or ORDER

        self.create_charts(order)

        return True

    def get_data(self):
        """Collect current values of all tracked disks; None if there is nothing to report."""
        self.runs += 1

        if self.runs % RESCAN_INTERVAL == 0:
            self.cleanup_and_rescan()

        data = dict()

        for disk in self.disks:

            if not disk.status:
                continue

            changed = disk.log_file.is_changed()

            # True = changed, False = unchanged, None = Exception
            if changed is None:
                disk.status = False
                continue

            if changed:
                success = disk.get_attributes()
                if not success:
                    disk.status = False
                    continue

            data.update(disk.data())

        return data or None

    def create_charts(self, order):
        """Create a raw and a normalized chart per attribute id in *order*,
        with one dimension per disk that reports the attribute."""
        for attr in order:
            raw_name, normalized_name = 'attr_id_raw_' + attr, 'attr_id_normalized_' + attr
            raw, normalized = chart_template(raw_name), chart_template(normalized_name)
            self.order.extend([normalized_name, raw_name])
            self.definitions.update(raw)
            self.definitions.update(normalized)

            for disk in self.disks:
                if attr not in disk.attributes:
                    self.debug("'{disk}' has no attribute '{attr_id}'".format(disk=disk.name,
                                                                              attr_id=attr))
                    continue
                normalized[normalized_name]['lines'].append(['_'.join([disk.name, 'normalized', attr]), disk.name])

                if not self.raw:
                    continue

                if disk.attributes[attr].raw is not None:
                    raw[raw_name]['lines'].append(['_'.join([disk.name, 'raw', attr]), disk.name])
                    continue
                # raw is None only for attributes that have a LIMITS entry
                self.debug("'{disk}' attribute '{attr_id}' value not in {limits}".format(disk=disk.name,
                                                                                         attr_id=attr,
                                                                                         limits=LIMITS[attr]))

    def cleanup_and_rescan(self):
        """Drop inactive disks, then add (or un-hide) dimensions for newly found ones."""
        self.cleanup()
        new_disks = self.scan(only_new=True)

        for disk in new_disks:
            valid = False

            for chart in self.charts:
                value_type, idx = chart.id.split('_')[2:]

                if idx not in disk.attributes:
                    continue
                valid = True

                # Same rules as create_charts(): no raw dimension when raw charts
                # are disabled or the raw value is outside its LIMITS range.
                if value_type == 'raw' and (not self.raw or disk.attributes[idx].raw is None):
                    continue

                dimension_id = '_'.join([disk.name, value_type, idx])

                if dimension_id in chart:
                    chart.hide_dimension(dimension_id=dimension_id, reverse=True)
                else:
                    chart.add_dimension([dimension_id, disk.name])
            if valid:
                self.disks.append(disk)

    def cleanup(self):
        """Hide the dimensions of disks that are no longer active and forget those disks."""
        for disk in self.disks:

            if not disk.is_active():
                disk.status = False
            if not disk.status:
                for chart in self.charts:
                    # 'attr_id_<type>_<id>' -> '<disk>_<type>_<id>'
                    dimension_id = '_'.join([disk.name] + chart.id.split('_')[2:])
                    if dimension_id in chart:
                        chart.hide_dimension(dimension_id=dimension_id)

        self.disks = [disk for disk in self.disks if disk.status]

    def scan(self, only_new=None):
        """Find active disks in the log directory.

        :param only_new: when truthy, return only disks not already tracked in self.disks.
        :return: list of Disk objects (empty if the directory cannot be listed).
        """
        new_disks = list()

        try:
            files = os.listdir(self.log_path)
        except OSError as error:
            # NOTE(review): logged with debug() because it is the only logging
            # method visible in this module; raise the level if the base class allows.
            self.debug("can't list '{path}': {error}".format(path=self.log_path, error=error))
            return new_disks

        for f in files:
            full_path = os.path.join(self.log_path, f)

            if not DiskLogFile.is_valid(full_path, self.exclude):
                continue

            try:
                disk = Disk(full_path, self.age)
            except OSError:
                # the file disappeared between validation and opening
                continue

            active = disk.is_active()
            if active is None:
                continue
            if active:
                if not only_new or disk not in self.disks:
                    new_disks.append(disk)
            elif not only_new:
                self.debug("'{disk}' not updated in the last {age} minutes, "
                           "skipping it.".format(disk=disk.name, age=self.age))
        return new_disks

# patch header (kept from the original page): diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.conf b/collectors/python.d.plugin/smartd_log/smartd_log.conf new file mode 100644 index 000000000..3fab3f1c0 --- /dev/null +++
b/collectors/python.d.plugin/smartd_log/smartd_log.conf @@ -0,0 +1,90 @@ +# netdata python.d.plugin configuration for smartd log +# +# This file is in YaML format. Generally the format is: +# +# name: value +# +# There are 2 sections: +# - global variables +# - one or more JOBS +# +# JOBS allow you to collect values from multiple sources. +# Each source will have its own set of charts. +# +# JOB parameters have to be indented (using spaces only, example below). + +# ---------------------------------------------------------------------- +# Global Variables +# These variables set the defaults for all JOBs, however each JOB +# may define its own, overriding the defaults. + +# update_every sets the default data collection frequency. +# If unset, the python.d.plugin default is used. +# update_every: 1 + +# priority controls the order of charts at the netdata dashboard. +# Lower numbers move the charts towards the top of the page. +# If unset, the default for python.d.plugin is used. +# priority: 60000 + +# retries sets the number of retries to be made in case of failures. +# If unset, the default for python.d.plugin is used. +# Attempts to restore the service are made once every update_every +# and only if the module has collected values in the past. +# retries: 60 + +# autodetection_retry sets the job re-check interval in seconds. +# The job is not deleted if check fails. +# Attempts to start the job are made once every autodetection_retry. +# This feature is disabled by default. +# autodetection_retry: 0 + +# ---------------------------------------------------------------------- +# JOBS (data collection sources) +# +# The default JOBS share the same *name*. JOBS with the same name +# are mutually exclusive. Only one of them will be allowed running at +# any time. This allows autodetection to try several alternatives and +# pick the one that works. +# +# Any number of jobs is supported. 
+# +# All python.d.plugin JOBS (for all its modules) support a set of +# predefined parameters. These are: +# +# job_name: +# name: myname # the JOB's name as it will appear at the +# # dashboard (by default is the job_name) +# # JOBs sharing a name are mutually exclusive +# update_every: 1 # the JOB's data collection frequency +# priority: 60000 # the JOB's order on the dashboard +# retries: 60 # the JOB's number of restoration attempts +# autodetection_retry: 0 # the JOB's re-check interval in seconds +# +# In addition to the above, smartd_log also supports the following: +# +# log_path: '/path/to/smartdlogs' # path to smartd log files. Default is /var/log/smartd +# raw_values: yes # enable/disable raw values charts. Enabled by default. +# smart_attributes: '1 2 3 4 5' # smart attributes charts. Defaults are ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200']. +# exclude_disks: 'PATTERN1 PATTERN2' # space separated patterns. If the pattern is in the drive name, the module will not collect data for it. +# +# ---------------------------------------------------------------------- +# Additional information +# Plugin reads smartd log files (-A option). +# You need to add (man smartd) to /etc/default/smartmontools '-i 600 -A /var/log/smartd/' to pass additional options to smartd on startup +# Then restart the smartd service and check the log directory: +# ls /var/log/smartd/ +# CDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv WDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv ZDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv +# +# Smartd APPENDS to the logs at every run. It is NOT RECOMMENDED to set the '-i' option below 60 sec. +# It is STRONGLY RECOMMENDED to create a logrotate conf file for the smartd logs +# +# RAW vs NORMALIZED values +# "Normalized value", commonly referred to as just "value". This is the most universal measurement, on the scale from 0 (bad) to some maximum (good) value. +# Maximum values are typically 100, 200 or 253. Rule of thumb is: high values are good, low values are bad. 
+# +# "Raw value" - the value of the attribute as it is tracked by the device, before any normalization takes place. +# Some raw numbers provide valuable insight when properly interpreted. These cases will be discussed later on. +# Raw values are typically listed in hexadecimal numbers. The raw value has different structure for different vendors and is often not meaningful as a decimal number. +# +# ---------------------------------------------------------------------- |