New upstream version 1.11.0+dfsg (upstream/1.11.0+dfsg)

Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:19:29 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2018-11-07 12:20:17 +0000
commit: a64a253794ac64cb40befee54db53bde17dd0d49 (patch)
tree: c1024acc5f6e508814b944d99f112259bb28b1be /collectors/python.d.plugin/smartd_log
parent: New upstream version 1.10.0+dfsg (diff)
download: netdata-a64a253794ac64cb40befee54db53bde17dd0d49.tar.xz
netdata-a64a253794ac64cb40befee54db53bde17dd0d49.zip
4 files changed, 494 insertions, 0 deletions
diff --git a/collectors/python.d.plugin/smartd_log/Makefile.inc b/collectors/python.d.plugin/smartd_log/Makefile.inc
new file mode 100644
index 000000000..dc1d0f3fb
--- /dev/null
+++ b/collectors/python.d.plugin/smartd_log/Makefile.inc
@@ -0,0 +1,13 @@
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+# THIS IS NOT A COMPLETE Makefile
+# IT IS INCLUDED BY ITS PARENT'S Makefile.am
+# IT IS REQUIRED TO REFERENCE ALL FILES RELATIVE TO THE PARENT
+
+# install these files
+dist_python_DATA       += smartd_log/smartd_log.chart.py
+dist_pythonconfig_DATA += smartd_log/smartd_log.conf
+
+# do not install these files, but include them in the distribution
+dist_noinst_DATA       += smartd_log/README.md smartd_log/Makefile.inc
+
diff --git a/collectors/python.d.plugin/smartd_log/README.md b/collectors/python.d.plugin/smartd_log/README.md
new file mode 100644
index 000000000..121a63573
--- /dev/null
+++ b/collectors/python.d.plugin/smartd_log/README.md
@@ -0,0 +1,38 @@
+# smartd_log
+
+This module monitors `smartd` log files to collect HDD/SSD S.M.A.R.T. attributes.
+
+It produces the following charts (you can add additional attributes in the module configuration file):
+
+1. **Read Error Rate** attribute 1
+
+2. **Start/Stop Count** attribute 4
+
+3. **Reallocated Sectors Count** attribute 5
+
+4. **Seek Error Rate** attribute 7
+
+5. **Power-On Hours Count** attribute 9
+
+6. **Power Cycle Count** attribute 12
+
+7. **Load/Unload Cycles** attribute 193
+
+8. **Temperature** attribute 194
+
+9. **Current Pending Sectors** attribute 197
+
+10. **Off-Line Uncorrectable** attribute 198
+
+11. **Write Error Rate** attribute 200
+
+### configuration
+
+```yaml
+local:
+  log_path : '/var/log/smartd/'
+```
+
+If no configuration is given, the module will attempt to read log files in the `/var/log/smartd/` directory.
+
+---
diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.chart.py b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py
new file mode 100644
index 000000000..21dbccecc
--- /dev/null
+++ b/collectors/python.d.plugin/smartd_log/smartd_log.chart.py
@@ -0,0 +1,353 @@
+# -*- coding: utf-8 -*-
+# Description: smart netdata python.d module
+# Author: l2isbad, vorph1
+# SPDX-License-Identifier: GPL-3.0-or-later
+
+import os
+import re
+
+from collections import namedtuple
+from time import time
+
+from bases.collection import read_last_line
+from bases.FrameworkServices.SimpleService import SimpleService
+
+# Default charts order: the attribute ids charted when the job does not set 'smart_attributes'
+# (can be overridden if you want fewer charts, or a different order).
+ORDER = ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200']
+
+# S.M.A.R.T. attribute id (as a string) -> human readable name.
+# chart_template() uses the name for the chart title and, lower-cased, for the chart family,
+# so ids sharing a name (e.g. '190', '194' and '231' are all 'Temperature') share a family.
+SMART_ATTR = {
+    '1': 'Read Error Rate',
+    '2': 'Throughput Performance',
+    '3': 'Spin-Up Time',
+    '4': 'Start/Stop Count',
+    '5': 'Reallocated Sectors Count',
+    '6': 'Read Channel Margin',
+    '7': 'Seek Error Rate',
+    '8': 'Seek Time Performance',
+    '9': 'Power-On Hours Count',
+    '10': 'Spin-up Retries',
+    '11': 'Calibration Retries',
+    '12': 'Power Cycle Count',
+    '13': 'Soft Read Error Rate',
+    '100': 'Erase/Program Cycles',
+    '103': 'Translation Table Rebuild',
+    '108': 'Unknown (108)',
+    '170': 'Reserved Block Count',
+    '171': 'Program Fail Count',
+    '172': 'Erase Fail Count',
+    '173': 'Wear Leveller Worst Case Erase Count',
+    '174': 'Unexpected Power Loss',
+    '175': 'Program Fail Count',
+    '176': 'Erase Fail Count',
+    '177': 'Wear Leveling Count',
+    '178': 'Used Reserved Block Count',
+    '179': 'Used Reserved Block Count',
+    '180': 'Unused Reserved Block Count',
+    '181': 'Program Fail Count',
+    '182': 'Erase Fail Count',
+    '183': 'SATA Downshifts',
+    '184': 'End-to-End error',
+    '185': 'Head Stability',
+    '186': 'Induced Op-Vibration Detection',
+    '187': 'Reported Uncorrectable Errors',
+    '188': 'Command Timeout',
+    '189': 'High Fly Writes',
+    '190': 'Temperature',
+    '191': 'G-Sense Errors',
+    '192': 'Power-Off Retract Cycles',
+    '193': 'Load/Unload Cycles',
+    '194': 'Temperature',
+    '195': 'Hardware ECC Recovered',
+    '196': 'Reallocation Events',
+    '197': 'Current Pending Sectors',
+    '198': 'Off-line Uncorrectable',
+    '199': 'UDMA CRC Error Rate',
+    '200': 'Write Error Rate',
+    '201': 'Soft Read Errors',
+    '202': 'Data Address Mark Errors',
+    '203': 'Run Out Cancel',
+    '204': 'Soft ECC Corrections',
+    '205': 'Thermal Asperity Rate',
+    '206': 'Flying Height',
+    '207': 'Spin High Current',
+    '209': 'Offline Seek Performance',
+    '220': 'Disk Shift',
+    '221': 'G-Sense Error Rate',
+    '222': 'Loaded Hours',
+    '223': 'Load/Unload Retries',
+    '224': 'Load Friction',
+    '225': 'Load/Unload Cycles',
+    '226': 'Load-in Time',
+    '227': 'Torque Amplification Count',
+    '228': 'Power-Off Retracts',
+    '230': 'GMR Head Amplitude',
+    '231': 'Temperature',
+    '232': 'Available Reserved Space',
+    '233': 'Media Wearout Indicator',
+    '240': 'Head Flying Hours',
+    '241': 'Total LBAs Written',
+    '242': 'Total LBAs Read',
+    '250': 'Read Error Retry Rate'
+}
+
+# Inclusive bounds for a raw value.
+LIMIT = namedtuple('LIMIT', ['min', 'max'])
+
+# Attribute id -> accepted raw value range. SmartAttribute.raw returns None for a raw value
+# outside the range, and Disk.data() then leaves that raw value out of the collected data.
+LIMITS = {
+    '194': LIMIT(0, 200)
+}
+
+# Service.get_data() cleans up and rescans the log directory on every RESCAN_INTERVAL-th call.
+RESCAN_INTERVAL = 60
+
+# Matches one '<attribute id>;<normalized value>;<raw value>' triple; Disk.get_attributes()
+# applies it with findall() to the last line of a log file.
+# Raw string literals are required: '\d' in a plain string literal is an invalid escape sequence.
+REGEX = re.compile(
+    r'(\d+);'  # attribute
+    r'(\d+);'  # normalized value
+    r'(\d+)',  # raw value
+    re.X
+)
+
+
+def chart_template(chart_name):
+    """
+    Build the chart definition for one chart id.
+
+    :param chart_name: chart id whose last two '_'-separated parts are the value type
+                       and the attribute id, e.g. 'attr_id_raw_194'
+    :return: dict mapping chart_name to its 'options' list and an initially empty 'lines' list
+    """
+    units, attr_id = chart_name.split('_')[-2:]
+    # Users can configure attribute ids that are missing from SMART_ATTR (the example in
+    # smartd_log.conf lists '44'); use a generic label instead of failing with KeyError.
+    description = SMART_ATTR.get(attr_id, 'Unknown ({0})'.format(attr_id))
+    title = '{value_type} {description}'.format(value_type=units.capitalize(),
+                                                description=description)
+    family = description.lower()
+
+    return {
+        chart_name: {
+            'options': [None, title, units, family, 'smartd_log.' + chart_name, 'line'],
+            'lines': []
+        }
+    }
+
+
+def handle_os_error(method):
+    """
+    Decorator that turns an OSError raised by 'method' into a None return value.
+
+    Positional and keyword arguments are forwarded unchanged, and the wrapper keeps
+    the name and docstring of the wrapped callable.
+
+    :param method: callable to wrap
+    :return: wrapper returning whatever 'method' returns, or None if it raised OSError
+    """
+    from functools import wraps
+
+    @wraps(method)
+    def on_call(*args, **kwargs):
+        try:
+            return method(*args, **kwargs)
+        except OSError:
+            return None
+    return on_call
+
+
+class SmartAttribute(object):
+    """A single S.M.A.R.T. attribute: its id together with its normalized and raw values."""
+
+    def __init__(self, idx, normalized, raw):
+        self.id = idx
+        self.normalized = normalized
+        self._raw = raw
+
+    @property
+    def raw(self):
+        """
+        :return: the stored raw value, or None when LIMITS has an entry for this attribute id
+                 and the raw value (converted with int()) lies outside of it
+        """
+        bounds = LIMITS.get(self.id)
+        # ids without an entry in LIMITS are never range-checked
+        if bounds is None or bounds.min <= int(self._raw) <= bounds.max:
+            return self._raw
+        return None
+
+    @raw.setter
+    def raw(self, value):
+        self._raw = value
+
+
+class DiskLogFile(object):
+    """Path and last seen size of one smartd attribute log file."""
+
+    def __init__(self, path):
+        """
+        :param path: path to the log file
+        :raises OSError: if the size of the file cannot be read
+        """
+        self.path = path
+        self.size = os.path.getsize(path)
+
+    @handle_os_error
+    def is_changed(self):
+        """
+        Read the current file size and store it for the next comparison.
+
+        :return: a truthy value if the size differs from the stored one and the file is not
+                 empty, a falsy value otherwise; None if the size could not be read (OSError)
+        """
+        new_size = os.path.getsize(self.path)
+        old_size, self.size = self.size, new_size
+
+        return new_size != old_size and new_size
+
+    @staticmethod
+    @handle_os_error
+    def is_valid(log_file, exclude):
+        """
+        Check whether a file should be monitored.
+
+        :param log_file: path to the candidate file
+        :param exclude: list of patterns; a file whose name contains any of them is rejected
+        :return: True if the file is a readable, non-empty '.csv' file that is not excluded,
+                 False otherwise; None if the file could not be inspected (OSError)
+        """
+        # Match the patterns against the file name only: matching against the whole path
+        # would let a pattern such as 'log' exclude every disk through the directory name.
+        file_name = os.path.basename(log_file)
+        return all([file_name.endswith('.csv'),
+                    not [p for p in exclude if p in file_name],
+                    os.access(log_file, os.R_OK),
+                    os.path.getsize(log_file)])
+
+
+class Disk(object):
+    """A disk as seen through its smartd attribute log file."""
+
+    def __init__(self, full_path, age):
+        """
+        :param full_path: path to the smartd log file of the disk
+        :param age: number of minutes since the last modification of the log file after which
+                    the disk is no longer considered active
+        :raises OSError: if the size of the log file cannot be read
+        """
+        self.log_file = DiskLogFile(full_path)
+        # Log files are named like '<disk name>.ata.csv'. Fall back to the first part when the
+        # name has fewer than three dot separated parts instead of raising IndexError.
+        parts = os.path.basename(full_path).split('.')
+        self.name = parts[-3] if len(parts) >= 3 else parts[0]
+        self.age = int(age)
+        self.status = True
+        self.attributes = dict()
+
+        self.get_attributes()
+
+    def __eq__(self, other):
+        # disks compare equal by name; comparing with a plain name is supported as well
+        if isinstance(other, Disk):
+            return self.name == other.name
+        return self.name == other
+
+    def __ne__(self, other):
+        return not self == other
+
+    def __hash__(self):
+        # must agree with __eq__: objects that compare equal need equal hashes
+        return hash(self.name)
+
+    @handle_os_error
+    def is_active(self):
+        """
+        :return: True if the log file was modified less than 'age' minutes ago, False otherwise;
+                 None if the modification time could not be read (OSError)
+        """
+        return (time() - os.path.getmtime(self.log_file.path)) / 60 < self.age
+
+    @handle_os_error
+    def get_attributes(self):
+        """
+        Replace 'attributes' with the ones parsed from the last line of the log file.
+
+        :return: True on success, None if reading the file raised OSError
+        """
+        last_line = read_last_line(self.log_file.path)
+        self.attributes = dict((attr, SmartAttribute(attr, normalized, raw)) for attr, normalized, raw
+                               in REGEX.findall(last_line))
+        return True
+
+    def data(self):
+        """
+        :return: dict of '<disk name>_normalized_<id>' and '<disk name>_raw_<id>' values;
+                 raw values reported as None by SmartAttribute.raw are left out
+        """
+        data = dict()
+        for attr in self.attributes.values():
+            data['_'.join([self.name, 'normalized', attr.id])] = attr.normalized
+            if attr.raw is not None:
+                data['_'.join([self.name, 'raw', attr.id])] = attr.raw
+        return data
+
+
+class Service(SimpleService):
+    """Collects S.M.A.R.T. attributes from the smartd attribute log files found in 'log_path'."""
+
+    def __init__(self, configuration=None, name=None):
+        SimpleService.__init__(self, configuration=configuration, name=name)
+        # job options and their defaults
+        self.log_path = self.configuration.get('log_path', '/var/log/smartd')
+        self.raw = self.configuration.get('raw_values', True)
+        self.exclude = self.configuration.get('exclude_disks', str()).split()
+        self.age = self.configuration.get('age', 30)  # minutes, see Disk.is_active()
+
+        self.runs = 0  # number of get_data() calls, drives the periodic rescan
+        self.disks = list()
+        self.order = list()
+        self.definitions = dict()
+
+    def check(self):
+        """
+        Find the disks to monitor and build the chart definitions.
+
+        :return: True if at least one active disk was found, None otherwise
+        """
+        self.disks = self.scan()
+
+        if not self.disks:
+            return None
+
+        user_defined_sa = self.configuration.get('smart_attributes')
+
+        if isinstance(user_defined_sa, (list, tuple)):
+            # a YAML sequence instead of the documented space separated string
+            order = [str(attr) for attr in user_defined_sa] or ORDER
+        elif user_defined_sa:
+            # str(): a single id such as 'smart_attributes: 194' is parsed by YAML as an int
+            order = str(user_defined_sa).split() or ORDER
+        else:
+            order = ORDER
+
+        self.create_charts(order)
+
+        return True
+
+    def get_data(self):
+        """
+        Collect the current attribute values of all monitored disks.
+
+        :return: dict of dimension id -> value, or None if nothing was collected
+        """
+        self.runs += 1
+
+        if self.runs % RESCAN_INTERVAL == 0:
+            self.cleanup_and_rescan()
+
+        data = dict()
+
+        for disk in self.disks:
+
+            if not disk.status:
+                continue
+
+            changed = disk.log_file.is_changed()
+
+            # truthy = changed, falsy = unchanged, None = Exception
+            if changed is None:
+                disk.status = False
+                continue
+
+            if changed:
+                success = disk.get_attributes()
+                if not success:
+                    disk.status = False
+                    continue
+
+            # an unchanged log file keeps reporting the previously parsed values
+            data.update(disk.data())
+
+        return data or None
+
+    def create_charts(self, order):
+        """
+        Fill 'order' and 'definitions' with a normalized chart and, if raw values are enabled,
+        a raw chart per attribute id, each with one dimension per disk having that attribute.
+
+        :param order: list of attribute ids (strings)
+        """
+        for attr in order:
+            raw_name, normalized_name = 'attr_id_raw_' + attr, 'attr_id_normalized_' + attr
+            raw, normalized = chart_template(raw_name), chart_template(normalized_name)
+            self.order.append(normalized_name)
+            self.definitions.update(normalized)
+            # Register the raw chart only when raw values are enabled; otherwise an empty chart
+            # is created and cleanup_and_rescan() later adds raw dimensions to it.
+            if self.raw:
+                self.order.append(raw_name)
+                self.definitions.update(raw)
+
+            for disk in self.disks:
+                if attr not in disk.attributes:
+                    self.debug("'{disk}' has no attribute '{attr_id}'".format(disk=disk.name,
+                                                                              attr_id=attr))
+                    continue
+                normalized[normalized_name]['lines'].append(['_'.join([disk.name, 'normalized', attr]), disk.name])
+
+                if not self.raw:
+                    continue
+
+                if disk.attributes[attr].raw is not None:
+                    raw[raw_name]['lines'].append(['_'.join([disk.name, 'raw', attr]), disk.name])
+                    continue
+                # raw is None only for ids present in LIMITS, so the lookup below cannot fail
+                self.debug("'{disk}' attribute '{attr_id}' value not in {limits}".format(disk=disk.name,
+                                                                                         attr_id=attr,
+                                                                                         limits=LIMITS[attr]))
+
+    def cleanup_and_rescan(self):
+        """Drop inactive disks, then start monitoring newly found disks on the existing charts."""
+        self.cleanup()
+        new_disks = self.scan(only_new=True)
+
+        for disk in new_disks:
+            valid = False
+
+            for chart in self.charts:
+                # chart ids look like 'attr_id_<value type>_<attribute id>'
+                value_type, idx = chart.id.split('_')[2:]
+
+                if idx in disk.attributes:
+                    valid = True
+                    dimension_id = '_'.join([disk.name, value_type, idx])
+
+                    if dimension_id in chart:
+                        # the dimension already exists (the disk was seen before);
+                        # presumably reverse=True makes a hidden dimension visible again
+                        chart.hide_dimension(dimension_id=dimension_id, reverse=True)
+                    else:
+                        chart.add_dimension([dimension_id, disk.name])
+            # a disk without any charted attribute is not monitored
+            if valid:
+                self.disks.append(disk)
+
+    def cleanup(self):
+        """Hide the dimensions of disks that failed or are no longer active and forget those disks."""
+        for disk in self.disks:
+
+            if not disk.is_active():
+                disk.status = False
+            if not disk.status:
+                for chart in self.charts:
+                    # chart.id[8:] strips the 'attr_id_' prefix: '<value type>_<attribute id>'
+                    dimension_id = '_'.join([disk.name, chart.id[8:]])
+                    chart.hide_dimension(dimension_id=dimension_id)
+
+        self.disks = [disk for disk in self.disks if disk.status]
+
+    def scan(self, only_new=None):
+        """
+        Look for active disks in 'log_path'.
+
+        :param only_new: if true, return only disks that are not monitored yet
+        :return: list of Disk objects; empty if the directory cannot be listed
+        """
+        new_disks = list()
+        try:
+            log_files = os.listdir(self.log_path)
+        except OSError as error:
+            # a missing or unreadable directory must not break the periodic rescan
+            self.error("cannot list '{path}': {error}".format(path=self.log_path, error=error))
+            return new_disks
+
+        for f in log_files:
+            full_path = os.path.join(self.log_path, f)
+
+            if not DiskLogFile.is_valid(full_path, self.exclude):
+                continue
+
+            try:
+                disk = Disk(full_path, self.age)
+            except OSError:
+                # the file vanished between the validity check and reading its size
+                continue
+
+            active = disk.is_active()
+            if active is None:
+                continue
+            if not active:
+                if not only_new:
+                    self.debug("'{disk}' not updated in the last {age} minutes, "
+                               "skipping it.".format(disk=disk.name, age=self.age))
+                continue
+            if not only_new or disk not in self.disks:
+                new_disks.append(disk)
+        return new_disks
diff --git a/collectors/python.d.plugin/smartd_log/smartd_log.conf b/collectors/python.d.plugin/smartd_log/smartd_log.conf
new file mode 100644
index 000000000..3fab3f1c0
--- /dev/null
+++ b/collectors/python.d.plugin/smartd_log/smartd_log.conf
@@ -0,0 +1,90 @@
+# netdata python.d.plugin configuration for smartd log
+#
+# This file is in YAML format. Generally the format is:
+#
+# name: value
+#
+# There are 2 sections:
+#  - global variables
+#  - one or more JOBS
+#
+# JOBS allow you to collect values from multiple sources.
+# Each source will have its own set of charts.
+#
+# JOB parameters have to be indented (using spaces only, example below).
+
+# ----------------------------------------------------------------------
+# Global Variables
+# These variables set the defaults for all JOBs, however each JOB
+# may define its own, overriding the defaults.
+
+# update_every sets the default data collection frequency.
+# If unset, the python.d.plugin default is used.
+# update_every: 1
+
+# priority controls the order of charts at the netdata dashboard.
+# Lower numbers move the charts towards the top of the page.
+# If unset, the default for python.d.plugin is used.
+# priority: 60000
+
+# retries sets the number of retries to be made in case of failures.
+# If unset, the default for python.d.plugin is used.
+# Attempts to restore the service are made once every update_every
+# and only if the module has collected values in the past.
+# retries: 60
+
+# autodetection_retry sets the job re-check interval in seconds.
+# The job is not deleted if check fails.
+# Attempts to start the job are made once every autodetection_retry.
+# This feature is disabled by default.
+# autodetection_retry: 0
+
+# ----------------------------------------------------------------------
+# JOBS (data collection sources)
+#
+# The default JOBS share the same *name*. JOBS with the same name
+# are mutually exclusive. Only one of them will be allowed running at
+# any time. This allows autodetection to try several alternatives and
+# pick the one that works.
+#
+# Any number of jobs is supported.
+#
+# All python.d.plugin JOBS (for all its modules) support a set of
+# predefined parameters. These are:
+#
+# job_name:
+#     name: myname            # the JOB's name as it will appear at the
+#                             # dashboard (by default is the job_name)
+#                             # JOBs sharing a name are mutually exclusive
+#     update_every: 1         # the JOB's data collection frequency
+#     priority: 60000         # the JOB's order on the dashboard
+#     retries: 60             # the JOB's number of restoration attempts
+#     autodetection_retry: 0  # the JOB's re-check interval in seconds
+#
+# In addition to the above, smartd_log also supports the following:
+#
+#    log_path: '/path/to/smartdlogs'    # path to smartd log files. Default is /var/log/smartd
+#    raw_values: yes                    # enable/disable raw values charts. Enabled by default.
+#    smart_attributes: '1 2 3 4 44'     # smart attributes charts. Defaults are ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200'].
+#    exclude_disks: 'PATTERN1 PATTERN2' # space separated patterns. If the pattern is in the drive name, the module will not collect data for it.
+#
+# ----------------------------------------------------------------------
+# Additional information
+#  Plugin reads smartd log files (-A option).
+#  To pass the additional options to smartd on startup, add '-i 600 -A /var/log/smartd/' to /etc/default/smartmontools (see man smartd).
+#  Then restart the smartd service and check that log files appear in the log directory:
+#  ls /var/log/smartd/
+#  CDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv  WDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv  ZDC_WD10EZEX_00BN5A0-WD_WCC3F7FLVZS9.ata.csv
+#
+# smartd APPENDS to its logs at every run. It's NOT RECOMMENDED to set the '-i' option below 60 sec.
+# It is STRONGLY RECOMMENDED to create a logrotate configuration file for the smartd logs.
+#
+# RAW vs NORMALIZED values
+# "Normalized value", commonly referred to as just "value". This is the most universal measurement, on the scale from 0 (bad) to some maximum (good) value.
+# Maximum values are typically 100, 200 or 253. Rule of thumb is: high values are good, low values are bad.
+#
+# "Raw value" - the value of the attribute as it is tracked by the device, before any normalization takes place.
+# Some raw numbers provide valuable insight when properly interpreted. These cases will be discussed later on.
+# Raw values are typically listed in hexadecimal numbers. The raw value has different structure for different vendors and is often not meaningful as a decimal number.
+#
+# ----------------------------------------------------------------------
author	Daniel Baumann <daniel.baumann@progress-linux.org>	2018-11-07 12:19:29 +0000
committer	Daniel Baumann <daniel.baumann@progress-linux.org>	2018-11-07 12:20:17 +0000
commit	a64a253794ac64cb40befee54db53bde17dd0d49 (patch)
tree	c1024acc5f6e508814b944d99f112259bb28b1be /collectors/python.d.plugin/smartd_log
parent	New upstream version 1.10.0+dfsg (diff)
download	netdata-a64a253794ac64cb40befee54db53bde17dd0d49.tar.xz netdata-a64a253794ac64cb40befee54db53bde17dd0d49.zip