summaryrefslogtreecommitdiffstats
path: root/collectors/python.d.plugin/smartd_log
diff options
context:
space:
mode:
Diffstat (limited to 'collectors/python.d.plugin/smartd_log')
-rw-r--r--collectors/python.d.plugin/smartd_log/Makefile.inc13
-rw-r--r--collectors/python.d.plugin/smartd_log/README.md38
-rw-r--r--collectors/python.d.plugin/smartd_log/smartd_log.chart.py353
-rw-r--r--collectors/python.d.plugin/smartd_log/smartd_log.conf90
4 files changed, 494 insertions, 0 deletions
diff --git a/collectors/python.d.plugin/smartd_log/Makefile.inc b/collectors/python.d.plugin/smartd_log/Makefile.inc
new file mode 100644
index 000000000..dc1d0f3fb
--- /dev/null
+++ b/collectors/python.d.plugin/smartd_log/Makefile.inc
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# THIS IS NOT A COMPLETE Makefile
+# IT IS INCLUDED BY ITS PARENT'S Makefile.am
+# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
+
+# install these files
+dist_python_DATA += smartd_log/smartd_log.chart.py
+dist_pythonconfig_DATA += smartd_log/smartd_log.conf
+
+# do not install these files, but include them in the distribution
+dist_noinst_DATA += smartd_log/README.md smartd_log/Makefile.inc
+
diff --git a/collectors/python.d.plugin/smartd_log/README.md b/collectors/python.d.plugin/smartd_log/README.md
new file mode 100644
index 000000000..121a63573
--- /dev/null
+++ b/collectors/python.d.plugin/smartd_log/README.md
@@ -0,0 +1,38 @@
+# smartd_log
+
+This module monitors `smartd` log files to collect HDD/SSD S.M.A.R.T. attributes.
+
+It produces the following charts (you can add additional attributes in the module configuration file):
+
+1. **Read Error Rate** attribute 1
+
+2. **Start/Stop Count** attribute 4
+
+3. **Reallocated Sectors Count** attribute 5
+
+4. **Seek Error Rate** attribute 7
+
+5. **Power-On Hours Count** attribute 9
+
+6. **Power Cycle Count** attribute 12
+
+7. **Load/Unload Cycles** attribute 193
+
+8. **Temperature** attribute 194
+
+9. **Current Pending Sectors** attribute 197
+
+10. **Off-Line Uncorrectable** attribute 198
+
+11. **Write Error Rate** attribute 200
+
+### configuration
+
+```yaml
+local:
+ log_path : '/var/log/smartd/'
+```
+
+If no configuration is given, the module will attempt to read log files in the `/var/log/smartd/` directory.
+
+---
diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.chart.py b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py
new file mode 100644
index 000000000..21dbccecc
--- /dev/null
+++ b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py
@@ -0,0 +1,353 @@
+# -*- coding: utf-8 -*-
+# Description: smart netdata python.d module
+# Author: l2isbad, vorph1
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import os
+import re
+
+from collections import namedtuple
+from time import time
+
+from bases.collection import read_last_line
+from bases.FrameworkServices.SimpleService import SimpleService
+
+# Default chart order, given as S.M.A.R.T. attribute IDs (strings).
+# Service.check() uses it when the job configuration has no 'smart_attributes'
+# option; set that option if you want fewer charts or a different order.
+ORDER = ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200']
+
+# Human-readable descriptions of S.M.A.R.T. attributes, keyed by attribute ID
+# (as a string). chart_template() uses the description for the chart title and,
+# lower-cased, for the chart family.
+SMART_ATTR = {
+ '1': 'Read Error Rate',
+ '2': 'Throughput Performance',
+ '3': 'Spin-Up Time',
+ '4': 'Start/Stop Count',
+ '5': 'Reallocated Sectors Count',
+ '6': 'Read Channel Margin',
+ '7': 'Seek Error Rate',
+ '8': 'Seek Time Performance',
+ '9': 'Power-On Hours Count',
+ '10': 'Spin-up Retries',
+ '11': 'Calibration Retries',
+ '12': 'Power Cycle Count',
+ '13': 'Soft Read Error Rate',
+ '100': 'Erase/Program Cycles',
+ '103': 'Translation Table Rebuild',
+ '108': 'Unknown (108)',
+ '170': 'Reserved Block Count',
+ '171': 'Program Fail Count',
+ '172': 'Erase Fail Count',
+ '173': 'Wear Leveller Worst Case Erase Count',
+ '174': 'Unexpected Power Loss',
+ '175': 'Program Fail Count',
+ '176': 'Erase Fail Count',
+ '177': 'Wear Leveling Count',
+ '178': 'Used Reserved Block Count',
+ '179': 'Used Reserved Block Count',
+ '180': 'Unused Reserved Block Count',
+ '181': 'Program Fail Count',
+ '182': 'Erase Fail Count',
+ '183': 'SATA Downshifts',
+ '184': 'End-to-End error',
+ '185': 'Head Stability',
+ '186': 'Induced Op-Vibration Detection',
+ '187': 'Reported Uncorrectable Errors',
+ '188': 'Command Timeout',
+ '189': 'High Fly Writes',
+ '190': 'Temperature',
+ '191': 'G-Sense Errors',
+ '192': 'Power-Off Retract Cycles',
+ '193': 'Load/Unload Cycles',
+ '194': 'Temperature',
+ '195': 'Hardware ECC Recovered',
+ '196': 'Reallocation Events',
+ '197': 'Current Pending Sectors',
+ '198': 'Off-line Uncorrectable',
+ '199': 'UDMA CRC Error Rate',
+ '200': 'Write Error Rate',
+ '201': 'Soft Read Errors',
+ '202': 'Data Address Mark Errors',
+ '203': 'Run Out Cancel',
+ '204': 'Soft ECC Corrections',
+ '205': 'Thermal Asperity Rate',
+ '206': 'Flying Height',
+ '207': 'Spin High Current',
+ '209': 'Offline Seek Performance',
+ '220': 'Disk Shift',
+ '221': 'G-Sense Error Rate',
+ '222': 'Loaded Hours',
+ '223': 'Load/Unload Retries',
+ '224': 'Load Friction',
+ '225': 'Load/Unload Cycles',
+ '226': 'Load-in Time',
+ '227': 'Torque Amplification Count',
+ '228': 'Power-Off Retracts',
+ '230': 'GMR Head Amplitude',
+ '231': 'Temperature',
+ '232': 'Available Reserved Space',
+ '233': 'Media Wearout Indicator',
+ '240': 'Head Flying Hours',
+ '241': 'Total LBAs Written',
+ '242': 'Total LBAs Read',
+ '250': 'Read Error Retry Rate'
+}
+
+# Inclusive lower/upper bound for an acceptable raw attribute value.
+LIMIT = namedtuple('LIMIT', ['min', 'max'])
+
+# Raw value bounds per attribute ID. SmartAttribute.raw yields None for an
+# attribute listed here whose raw value falls outside its bounds.
+LIMITS = {
+ '194': LIMIT(0, 200)
+}
+
+# Service.get_data() drops inactive disks and looks for new ones on every
+# RESCAN_INTERVAL-th call (a call count, not a number of seconds).
+RESCAN_INTERVAL = 60
+
+# Matches one 'attribute;normalized;raw' triple of decimal integers. It is used
+# with findall() on the last line of a smartd attribute log.
+# Raw string literals keep '\d' from being read as an (invalid) string escape.
+# re.X is not needed: the comments below are Python comments, not pattern text.
+REGEX = re.compile(
+    r'(\d+);'  # attribute
+    r'(\d+);'  # normalized value
+    r'(\d+)'   # raw value
+)
+
+
+def chart_template(chart_name):
+    """
+    Build the definition of a single chart.
+
+    :param chart_name: chart id whose last two '_'-separated fields are the
+        value type and the attribute id, e.g. 'attr_id_raw_194'
+    :return: dict mapping chart_name to its 'options' list and an empty
+        'lines' list (dimensions are appended by the caller)
+    """
+    units, attr_id = chart_name.split('_')[-2:]
+    # Attributes that are not listed in SMART_ATTR (e.g. user-configured ones)
+    # get a generic description instead of raising KeyError.
+    description = SMART_ATTR.get(attr_id, 'Unknown ({0})'.format(attr_id))
+    title = '{value_type} {description}'.format(value_type=units.capitalize(),
+                                                description=description)
+    family = description.lower()
+
+    return {
+        chart_name: {
+            'options': [None, title, units, family, 'smartd_log.' + chart_name, 'line'],
+            'lines': []
+        }
+    }
+
+
+def handle_os_error(method):
+    """
+    Decorator that turns an OSError raised by `method` into a None result.
+
+    :param method: callable to wrap
+    :return: wrapper that forwards all positional and keyword arguments to
+        `method` and returns its result, or None if OSError was raised
+    """
+    from functools import wraps
+
+    # wraps() keeps the name and docstring of the decorated callable
+    @wraps(method)
+    def on_call(*args, **kwargs):
+        try:
+            return method(*args, **kwargs)
+        except OSError:
+            return None
+    return on_call
+
+
+class SmartAttribute(object):
+    """
+    A single S.M.A.R.T. attribute sample: attribute id, normalized value and
+    raw value. The raw value is only exposed while it is within the bounds
+    registered for the attribute id in LIMITS, if any.
+    """
+    def __init__(self, idx, normalized, raw):
+        self._raw = raw
+        self.normalized = normalized
+        self.id = idx
+
+    @property
+    def raw(self):
+        """Raw value, or None when LIMITS has bounds for this id and the value is outside them."""
+        bounds = LIMITS.get(self.id)
+        if bounds is None:
+            # no bounds registered for this attribute
+            return self._raw
+
+        within_bounds = bounds.min <= int(self._raw) <= bounds.max
+        return self._raw if within_bounds else None
+
+    @raw.setter
+    def raw(self, value):
+        self._raw = value
+
+
+class DiskLogFile(object):
+    """
+    A log file path together with the file size seen at the last check.
+    """
+    def __init__(self, path):
+        """
+        :param path: path to the log file
+        :raises OSError: if the file size cannot be read
+        """
+        self.path = path
+        self.size = os.path.getsize(path)
+
+    @handle_os_error
+    def is_changed(self):
+        """
+        Store the current file size and compare it with the previous one.
+
+        :return: True if the size differs from the previous one and the file
+            is not empty, False otherwise, None if the size could not be read
+        """
+        new_size = os.path.getsize(self.path)
+        old_size, self.size = self.size, new_size
+
+        return bool(new_size) and new_size != old_size
+
+    @staticmethod
+    @handle_os_error
+    def is_valid(log_file, exclude):
+        """
+        Check whether a path is a log file data should be collected from.
+
+        :param log_file: path to the candidate file
+        :param exclude: iterable of patterns; a file whose name contains any
+            of them is rejected
+        :return: True for a readable, non-empty '.csv' file that is not
+            excluded, False otherwise, None if the file size could not be read
+        """
+        # cheap string checks first, so unrelated directory entries are not stat'ed
+        if not log_file.endswith('.csv'):
+            return False
+
+        # match the patterns against the file name only: a pattern must not
+        # exclude every disk just because it occurs in the directory path
+        file_name = os.path.basename(log_file)
+        if any(pattern in file_name for pattern in exclude):
+            return False
+
+        return os.access(log_file, os.R_OK) and os.path.getsize(log_file) > 0
+
+
+class Disk(object):
+    """
+    A disk represented by its log file and the attributes parsed from the
+    last line of that file. Disks compare (and hash) by name; a Disk also
+    compares equal to a plain string holding its name.
+    """
+    def __init__(self, full_path, age):
+        """
+        :param full_path: path to the disk log file
+        :param age: number of minutes without a log file modification after
+            which the disk is no longer considered active
+        :raises OSError: if the log file size cannot be read
+        """
+        self.log_file = DiskLogFile(full_path)
+        # '<name>.<type>.csv' -> '<name>'; fall back to the first field for
+        # file names with fewer than three dot-separated fields
+        name_fields = os.path.basename(full_path).split('.')
+        self.name = name_fields[-3] if len(name_fields) >= 3 else name_fields[0]
+        self.age = int(age)
+        self.status = True
+        self.attributes = dict()
+
+        self.get_attributes()
+
+    def __eq__(self, other):
+        if isinstance(other, Disk):
+            return self.name == other.name
+        return self.name == other
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        # must agree with __eq__: objects that compare equal need equal hashes
+        return hash(self.name)
+
+    @handle_os_error
+    def is_active(self):
+        """
+        :return: True if the log file was modified less than `age` minutes
+            ago, False otherwise, None if the modification time could not be read
+        """
+        return (time() - os.path.getmtime(self.log_file.path)) / 60 < self.age
+
+    @handle_os_error
+    def get_attributes(self):
+        """
+        Replace `attributes` with the ones found in the last line of the log file.
+
+        :return: True on success, None if OSError was raised
+        """
+        last_line = read_last_line(self.log_file.path)
+        self.attributes = dict((attr, SmartAttribute(attr, normalized, raw)) for attr, normalized, raw
+                               in REGEX.findall(last_line))
+        return True
+
+    def data(self):
+        """
+        :return: dict of '<disk>_normalized_<id>' and '<disk>_raw_<id>' keys
+            mapped to the current values; raw values that are None are left out
+        """
+        data = dict()
+        for attr in self.attributes.values():
+            data['_'.join([self.name, 'normalized', attr.id])] = attr.normalized
+            if attr.raw is not None:
+                data['_'.join([self.name, 'raw', attr.id])] = attr.raw
+        return data
+
+
+class Service(SimpleService):
+    """
+    Collects S.M.A.R.T. attribute values from the per-disk '.csv' log files
+    found in the configured directory.
+
+    Job options read from the configuration:
+      log_path          directory with the log files (default '/var/log/smartd')
+      raw_values        add dimensions to the raw value charts (default True)
+      smart_attributes  space separated attribute IDs to chart (default ORDER)
+      exclude_disks     space separated patterns of log files to skip
+      age               minutes without a log file update after which a disk
+                        is considered inactive (default 30)
+    """
+    def __init__(self, configuration=None, name=None):
+        SimpleService.__init__(self, configuration=configuration, name=name)
+        self.log_path = self.configuration.get('log_path', '/var/log/smartd')
+        self.raw = self.configuration.get('raw_values', True)
+        self.exclude = self.configuration.get('exclude_disks', str()).split()
+        self.age = self.configuration.get('age', 30)
+
+        self.runs = 0  # number of get_data() calls so far
+        self.disks = list()
+        self.order = list()
+        self.definitions = dict()
+
+    def check(self):
+        """
+        Find the disks to monitor and create the charts for them.
+
+        :return: True if at least one active disk was found, None otherwise
+        """
+        self.disks = self.scan()
+
+        if not self.disks:
+            return None
+
+        user_defined_sa = self.configuration.get('smart_attributes')
+
+        if user_defined_sa:
+            # str(): the option may not be a string, e.g. a single unquoted id
+            order = str(user_defined_sa).split() or ORDER
+        else:
+            order = ORDER
+
+        self.create_charts(order)
+
+        return True
+
+    def get_data(self):
+        """
+        Collect the current attribute values of all monitored disks.
+
+        :return: dict of dimension id -> value, or None if nothing was collected
+        """
+        self.runs += 1
+
+        if self.runs % RESCAN_INTERVAL == 0:
+            self.cleanup_and_rescan()
+
+        data = dict()
+
+        for disk in self.disks:
+
+            if not disk.status:
+                continue
+
+            changed = disk.log_file.is_changed()
+
+            # truthy = changed, falsy = unchanged, None = OSError
+            if changed is None:
+                disk.status = False
+                continue
+
+            if changed:
+                success = disk.get_attributes()
+                if not success:
+                    disk.status = False
+                    continue
+
+            # unchanged files still report their last parsed values
+            data.update(disk.data())
+
+        return data or None
+
+    def create_charts(self, order):
+        """
+        Create a normalized and a raw chart per attribute and add one
+        dimension per disk that reports the attribute.
+
+        :param order: list of attribute IDs (strings)
+        """
+        for attr in order:
+            raw_name, normalized_name = 'attr_id_raw_' + attr, 'attr_id_normalized_' + attr
+            raw, normalized = chart_template(raw_name), chart_template(normalized_name)
+            self.order.extend([normalized_name, raw_name])
+            self.definitions.update(raw)
+            self.definitions.update(normalized)
+
+            for disk in self.disks:
+                if attr not in disk.attributes:
+                    self.debug("'{disk}' has no attribute '{attr_id}'".format(disk=disk.name,
+                                                                              attr_id=attr))
+                    continue
+                normalized[normalized_name]['lines'].append(['_'.join([disk.name, 'normalized', attr]), disk.name])
+
+                if not self.raw:
+                    continue
+
+                if disk.attributes[attr].raw is not None:
+                    raw[raw_name]['lines'].append(['_'.join([disk.name, 'raw', attr]), disk.name])
+                    continue
+                # raw is None only for attributes that have an entry in LIMITS
+                self.debug("'{disk}' attribute '{attr_id}' value not in {limits}".format(disk=disk.name,
+                                                                                         attr_id=attr,
+                                                                                         limits=LIMITS[attr]))
+
+    def cleanup_and_rescan(self):
+        """
+        Drop inactive disks, then start monitoring newly found disks that
+        report at least one charted attribute.
+        """
+        self.cleanup()
+        new_disks = self.scan(only_new=True)
+
+        for disk in new_disks:
+            valid = False
+
+            for chart in self.charts:
+                # chart ids look like 'attr_id_<value type>_<attribute id>'
+                value_type, idx = chart.id.split('_')[2:]
+
+                if idx not in disk.attributes:
+                    continue
+                valid = True
+
+                # same rules as create_charts(): no raw dimension when raw
+                # values are disabled or the raw value is outside its limits
+                if value_type == 'raw' and (not self.raw or disk.attributes[idx].raw is None):
+                    continue
+
+                dimension_id = '_'.join([disk.name, value_type, idx])
+
+                if dimension_id in chart:
+                    # the dimension exists already (disk seen before): un-hide it
+                    chart.hide_dimension(dimension_id=dimension_id, reverse=True)
+                else:
+                    chart.add_dimension([dimension_id, disk.name])
+            if valid:
+                self.disks.append(disk)
+
+    def cleanup(self):
+        """
+        Hide the dimensions of disks that are inactive or failed, and stop
+        monitoring those disks.
+        """
+        for disk in self.disks:
+
+            if not disk.is_active():
+                disk.status = False
+            if not disk.status:
+                for chart in self.charts:
+                    # chart.id[8:] strips the 'attr_id_' prefix
+                    dimension_id = '_'.join([disk.name, chart.id[8:]])
+                    chart.hide_dimension(dimension_id=dimension_id)
+
+        self.disks = [disk for disk in self.disks if disk.status]
+
+    def scan(self, only_new=None):
+        """
+        Look for active disk log files in `log_path`.
+
+        :param only_new: if true, return only disks that are not monitored yet
+        :return: list of Disk objects (empty if the directory cannot be read)
+        """
+        new_disks = list()
+
+        try:
+            file_names = os.listdir(self.log_path)
+        except OSError as error:
+            self.debug("can't list '{path}': {error}".format(path=self.log_path, error=error))
+            return new_disks
+
+        for f in file_names:
+            full_path = os.path.join(self.log_path, f)
+
+            if not DiskLogFile.is_valid(full_path, self.exclude):
+                continue
+
+            try:
+                disk = Disk(full_path, self.age)
+            except (OSError, IndexError):
+                # the file vanished after validation, or its name could not be parsed
+                self.debug("skipping '{path}'".format(path=full_path))
+                continue
+
+            active = disk.is_active()
+            if active is None:
+                continue
+            if active:
+                if not only_new:
+                    new_disks.append(disk)
+                else:
+                    if disk not in self.disks:
+                        new_disks.append(disk)
+            else:
+                if not only_new:
+                    self.debug("'{disk}' not updated in the last {age} minutes, "
+                               "skipping it.".format(disk=disk.name, age=self.age))
+        return new_disks
diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.conf b/collectors/python.d.plugin/smartd_log/smartd_log.conf
new file mode 100644
index 000000000..3fab3f1c0
--- /dev/null
+++ b/collectors/python.d.plugin/smartd_log/smartd_log.conf
@@ -0,0 +1,90 @@
+# netdata python.d.plugin configuration for smartd log
+#
+# This file is in YAML format. Generally the format is:
+#
+# name: value
+#
+# There are 2 sections:
+# - global variables
+# - one or more JOBS
+#
+# JOBS allow you to collect values from multiple sources.
+# Each source will have its own set of charts.
+#
+# JOB parameters have to be indented (using spaces only, example below).
+
+# ----------------------------------------------------------------------
+# Global Variables
+# These variables set the defaults for all JOBs, however each JOB
+# may define its own, overriding the defaults.
+
+# update_every sets the default data collection frequency.
+# If unset, the python.d.plugin default is used.
+# update_every: 1
+
+# priority controls the order of charts at the netdata dashboard.
+# Lower numbers move the charts towards the top of the page.
+# If unset, the default for python.d.plugin is used.
+# priority: 60000
+
+# retries sets the number of retries to be made in case of failures.
+# If unset, the default for python.d.plugin is used.
+# Attempts to restore the service are made once every update_every
+# and only if the module has collected values in the past.
+# retries: 60
+
+# autodetection_retry sets the job re-check interval in seconds.
+# The job is not deleted if check fails.
+# Attempts to start the job are made once every autodetection_retry.
+# This feature is disabled by default.
+# autodetection_retry: 0
+
+# ----------------------------------------------------------------------
+# JOBS (data collection sources)
+#
+# The default JOBS share the same *name*. JOBS with the same name
+# are mutually exclusive. Only one of them will be allowed running at
+# any time. This allows autodetection to try several alternatives and
+# pick the one that works.
+#
+# Any number of jobs is supported.
+#
+# All python.d.plugin JOBS (for all its modules) support a set of
+# predefined parameters. These are:
+#
+# job_name:
+# name: myname # the JOB's name as it will appear at the
+# # dashboard (by default is the job_name)
+# # JOBs sharing a name are mutually exclusive
+# update_every: 1 # the JOB's data collection frequency
+# priority: 60000 # the JOB's order on the dashboard
+# retries: 60 # the JOB's number of restoration attempts
+# autodetection_retry: 0 # the JOB's re-check interval in seconds
+#
+# In addition to the above, smartd_log also supports the following:
+#
+# log_path: '/path/to/smartdlogs' # path to smartd log files. Default is /var/log/smartd
+# raw_values: yes # enable/disable raw values charts. Enabled by default.
+# smart_attributes: '1 2 3 4 44' # space separated smart attributes to chart. Default is ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200'].
+# exclude_disks: 'PATTERN1 PATTERN2' # space separated patterns. If the pattern is in the drive name, the module will not collect data for it.
+# age: 30 # time in minutes since the last log file update after which the disk is skipped. Default is 30.
+#
+# ----------------------------------------------------------------------
+# Additional information
+# The plugin reads the attribute log files written by smartd (-A option, see man smartd).
+# To pass the required options to smartd on startup, add '-i 600 -A /var/log/smartd/' to /etc/default/smartmontools.
+# Then restart the smartd service and check that the log files are created in the log directory:
+# ls /var/log/smartd/
+# CDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv WDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv ZDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv
+#
+# smartd APPENDS to the log files at every run. It is NOT RECOMMENDED to set the '-i' option below 60 sec.
+# It is STRONGLY RECOMMENDED to create a logrotate configuration file for the smartd logs.
+#
+# RAW vs NORMALIZED values
+# "Normalized value", commonly referred to as just "value". This is the most universal measurement, on the scale from 0 (bad) to some maximum (good) value.
+# Maximum values are typically 100, 200 or 253. Rule of thumb is: high values are good, low values are bad.
+#
+# "Raw value" - the value of the attribute as it is tracked by the device, before any normalization takes place.
+# Some raw numbers provide valuable insight when properly interpreted. These cases will be discussed later on.
+# Raw values are typically listed in hexadecimal numbers. The raw value has different structure for different vendors and is often not meaningful as a decimal number.
+#
+# ----------------------------------------------------------------------