summaryrefslogtreecommitdiffstats
path: root/python.d/smartd_log.chart.py
diff options
context:
space:
mode:
authorFederico Ceratto <federico.ceratto@gmail.com>2017-12-19 23:39:21 +0000
committerFederico Ceratto <federico.ceratto@gmail.com>2017-12-19 23:39:21 +0000
commit61aedf201c2c4bf0e5aa4db32e74f4d860b88593 (patch)
treebcf4f9a0cd8bc2daf38b2ff9f29bfcc1e5ed8968 /python.d/smartd_log.chart.py
parentNew upstream version 1.8.0+dfsg (diff)
downloadnetdata-61aedf201c2c4bf0e5aa4db32e74f4d860b88593.tar.xz
netdata-61aedf201c2c4bf0e5aa4db32e74f4d860b88593.zip
New upstream version 1.9.0+dfsgupstream/1.9.0+dfsg
Diffstat (limited to 'python.d/smartd_log.chart.py')
-rw-r--r--python.d/smartd_log.chart.py508
1 files changed, 316 insertions, 192 deletions
diff --git a/python.d/smartd_log.chart.py b/python.d/smartd_log.chart.py
index 4039c153..07ad88cd 100644
--- a/python.d/smartd_log.chart.py
+++ b/python.d/smartd_log.chart.py
@@ -2,221 +2,345 @@
# Description: smart netdata python.d module
# Author: l2isbad, vorph1
-from re import compile as r_compile
-from os import listdir, access, R_OK
-from os.path import isfile, join, getsize, basename, isdir
-try:
- from queue import Queue
-except ImportError:
- from Queue import Queue
-from threading import Thread
-from base import SimpleService
+import os
+import re
+
from collections import namedtuple
+from time import time
-# default module values (can be overridden per job in `config`)
-update_every = 5
-priority = 60000
+from bases.collection import read_last_line
+from bases.FrameworkServices.SimpleService import SimpleService
# charts order (can be overridden if you want less charts, or different order)
ORDER = ['1', '4', '5', '7', '9', '12', '193', '194', '197', '198', '200']
SMART_ATTR = {
- '1': 'Read Error Rate',
- '2': 'Throughput Performance',
- '3': 'Spin-Up Time',
- '4': 'Start/Stop Count',
- '5': 'Reallocated Sectors Count',
- '6': 'Read Channel Margin',
- '7': 'Seek Error Rate',
- '8': 'Seek Time Performance',
- '9': 'Power-On Hours Count',
- '10': 'Spin-up Retries',
- '11': 'Calibration Retries',
- '12': 'Power Cycle Count',
- '13': 'Soft Read Error Rate',
- '100': 'Erase/Program Cycles',
- '103': 'Translation Table Rebuild',
- '108': 'Unknown (108)',
- '170': 'Reserved Block Count',
- '171': 'Program Fail Count',
- '172': 'Erase Fail Count',
- '173': 'Wear Leveller Worst Case Erase Count',
- '174': 'Unexpected Power Loss',
- '175': 'Program Fail Count',
- '176': 'Erase Fail Count',
- '177': 'Wear Leveling Count',
- '178': 'Used Reserved Block Count',
- '179': 'Used Reserved Block Count',
- '180': 'Unused Reserved Block Count',
- '181': 'Program Fail Count',
- '182': 'Erase Fail Count',
- '183': 'SATA Downshifts',
- '184': 'End-to-End error',
- '185': 'Head Stability',
- '186': 'Induced Op-Vibration Detection',
- '187': 'Reported Uncorrectable Errors',
- '188': 'Command Timeout',
- '189': 'High Fly Writes',
- '190': 'Temperature',
- '191': 'G-Sense Errors',
- '192': 'Power-Off Retract Cycles',
- '193': 'Load/Unload Cycles',
- '194': 'Temperature',
- '195': 'Hardware ECC Recovered',
- '196': 'Reallocation Events',
- '197': 'Current Pending Sectors',
- '198': 'Off-line Uncorrectable',
- '199': 'UDMA CRC Error Rate',
- '200': 'Write Error Rate',
- '201': 'Soft Read Errors',
- '202': 'Data Address Mark Errors',
- '203': 'Run Out Cancel',
- '204': 'Soft ECC Corrections',
- '205': 'Thermal Asperity Rate',
- '206': 'Flying Height',
- '207': 'Spin High Current',
- '209': 'Offline Seek Performance',
- '220': 'Disk Shift',
- '221': 'G-Sense Error Rate',
- '222': 'Loaded Hours',
- '223': 'Load/Unload Retries',
- '224': 'Load Friction',
- '225': 'Load/Unload Cycles',
- '226': 'Load-in Time',
- '227': 'Torque Amplification Count',
- '228': 'Power-Off Retracts',
- '230': 'GMR Head Amplitude',
- '231': 'Temperature',
- '232': 'Available Reserved Space',
- '233': 'Media Wearout Indicator',
- '240': 'Head Flying Hours',
- '241': 'Total LBAs Written',
- '242': 'Total LBAs Read',
- '250': 'Read Error Retry Rate'
+ '1': 'Read Error Rate',
+ '2': 'Throughput Performance',
+ '3': 'Spin-Up Time',
+ '4': 'Start/Stop Count',
+ '5': 'Reallocated Sectors Count',
+ '6': 'Read Channel Margin',
+ '7': 'Seek Error Rate',
+ '8': 'Seek Time Performance',
+ '9': 'Power-On Hours Count',
+ '10': 'Spin-up Retries',
+ '11': 'Calibration Retries',
+ '12': 'Power Cycle Count',
+ '13': 'Soft Read Error Rate',
+ '100': 'Erase/Program Cycles',
+ '103': 'Translation Table Rebuild',
+ '108': 'Unknown (108)',
+ '170': 'Reserved Block Count',
+ '171': 'Program Fail Count',
+ '172': 'Erase Fail Count',
+ '173': 'Wear Leveller Worst Case Erase Count',
+ '174': 'Unexpected Power Loss',
+ '175': 'Program Fail Count',
+ '176': 'Erase Fail Count',
+ '177': 'Wear Leveling Count',
+ '178': 'Used Reserved Block Count',
+ '179': 'Used Reserved Block Count',
+ '180': 'Unused Reserved Block Count',
+ '181': 'Program Fail Count',
+ '182': 'Erase Fail Count',
+ '183': 'SATA Downshifts',
+ '184': 'End-to-End error',
+ '185': 'Head Stability',
+ '186': 'Induced Op-Vibration Detection',
+ '187': 'Reported Uncorrectable Errors',
+ '188': 'Command Timeout',
+ '189': 'High Fly Writes',
+ '190': 'Temperature',
+ '191': 'G-Sense Errors',
+ '192': 'Power-Off Retract Cycles',
+ '193': 'Load/Unload Cycles',
+ '194': 'Temperature',
+ '195': 'Hardware ECC Recovered',
+ '196': 'Reallocation Events',
+ '197': 'Current Pending Sectors',
+ '198': 'Off-line Uncorrectable',
+ '199': 'UDMA CRC Error Rate',
+ '200': 'Write Error Rate',
+ '201': 'Soft Read Errors',
+ '202': 'Data Address Mark Errors',
+ '203': 'Run Out Cancel',
+ '204': 'Soft ECC Corrections',
+ '205': 'Thermal Asperity Rate',
+ '206': 'Flying Height',
+ '207': 'Spin High Current',
+ '209': 'Offline Seek Performance',
+ '220': 'Disk Shift',
+ '221': 'G-Sense Error Rate',
+ '222': 'Loaded Hours',
+ '223': 'Load/Unload Retries',
+ '224': 'Load Friction',
+ '225': 'Load/Unload Cycles',
+ '226': 'Load-in Time',
+ '227': 'Torque Amplification Count',
+ '228': 'Power-Off Retracts',
+ '230': 'GMR Head Amplitude',
+ '231': 'Temperature',
+ '232': 'Available Reserved Space',
+ '233': 'Media Wearout Indicator',
+ '240': 'Head Flying Hours',
+ '241': 'Total LBAs Written',
+ '242': 'Total LBAs Read',
+ '250': 'Read Error Retry Rate'
+}
+
+LIMIT = namedtuple('LIMIT', ['min', 'max'])
+
+LIMITS = {
+ '194': LIMIT(0, 200)
}
-NAMED_DISKS = namedtuple('disks', ['name', 'size', 'number'])
+RESCAN_INTERVAL = 60
+
+REGEX = re.compile(
+ '(\d+);' # attribute
+ '(\d+);' # normalized value
+ '(\d+)', # raw value
+ re.X
+)
+
+
+def chart_template(chart_name):
+ units, attr_id = chart_name.split('_')[-2:]
+ title = '{value_type} {description}'.format(value_type=units.capitalize(),
+ description=SMART_ATTR[attr_id])
+ family = SMART_ATTR[attr_id].lower()
+
+ return {
+ chart_name: {
+ 'options': [None, title, units, family, 'smartd_log.' + chart_name, 'line'],
+ 'lines': []
+ }
+ }
+
+
+def handle_os_error(method):
+ def on_call(*args):
+ try:
+ return method(*args)
+ except OSError:
+ return None
+ return on_call
+
+
+class SmartAttribute(object):
+ def __init__(self, idx, normalized, raw):
+ self.id = idx
+ self.normalized = normalized
+ self._raw = raw
+
+ @property
+ def raw(self):
+ if self.id in LIMITS:
+ limit = LIMITS[self.id]
+ if limit.min <= int(self._raw) <= limit.max:
+ return self._raw
+ return None
+ return self._raw
+
+ @raw.setter
+ def raw(self, value):
+ self._raw = value
+
+
+class DiskLogFile:
+ def __init__(self, path):
+ self.path = path
+ self.size = os.path.getsize(path)
+
+ @handle_os_error
+ def is_changed(self):
+ new_size = os.path.getsize(self.path)
+ old_size, self.size = self.size, new_size
+
+ return new_size != old_size and new_size
+
+ @staticmethod
+ @handle_os_error
+ def is_valid(log_file, exclude):
+ return all([log_file.endswith('.csv'),
+ not [p for p in exclude if p in log_file],
+ os.access(log_file, os.R_OK),
+ os.path.getsize(log_file)])
+
+
+class Disk:
+ def __init__(self, full_path, age):
+ self.log_file = DiskLogFile(full_path)
+ self.name = os.path.basename(full_path).split('.')[-3]
+ self.age = int(age)
+ self.status = True
+ self.attributes = dict()
+
+ self.get_attributes()
+
+ def __eq__(self, other):
+ if isinstance(other, Disk):
+ return self.name == other.name
+ return self.name == other
+
+ @handle_os_error
+ def is_active(self):
+ return (time() - os.path.getmtime(self.log_file.path)) / 60 < self.age
+
+ @handle_os_error
+ def get_attributes(self):
+ last_line = read_last_line(self.log_file.path)
+ self.attributes = dict((attr, SmartAttribute(attr, normalized, raw)) for attr, normalized, raw
+ in REGEX.findall(last_line))
+ return True
+
+ def data(self):
+ data = dict()
+ for attr in self.attributes.values():
+ data['_'.join([self.name, 'normalized', attr.id])] = attr.normalized
+ if attr.raw is not None:
+ data['_'.join([self.name, 'raw', attr.id])] = attr.raw
+ return data
class Service(SimpleService):
def __init__(self, configuration=None, name=None):
SimpleService.__init__(self, configuration=configuration, name=name)
- self.regex = r_compile(r'(\d+);(\d+);(\d+)')
self.log_path = self.configuration.get('log_path', '/var/log/smartd')
- self.raw_values = self.configuration.get('raw_values')
- self.attr = self.configuration.get('smart_attributes', [])
- self.previous_data = dict()
+ self.raw = self.configuration.get('raw_values', True)
+ self.exclude = self.configuration.get('exclude_disks', str()).split()
+ self.age = self.configuration.get('age', 30)
+
+ self.runs = 0
+ self.disks = list()
+ self.order = list()
+ self.definitions = dict()
def check(self):
- # Can\'t start without smartd readable diks log files
- disks = find_disks_in_log_path(self.log_path)
- if not disks:
- self.error('Can\'t locate any smartd log files in %s' % self.log_path)
- return False
-
- # List of namedtuples to track smartd log file size
- self.disks = [NAMED_DISKS(name=disks[i], size=0, number=i) for i in range(len(disks))]
-
- if self._get_data():
- self.create_charts()
- return True
- else:
- self.error('Can\'t collect any data. Sorry.')
- return False
-
- def _get_raw_data(self, queue, disk):
- # The idea is to open a file.
- # Jump to the end.
- # Seek backward until '\n' symbol appears
- # If '\n' is found or it's the beginning of the file
- # readline()! (last or first line)
- with open(disk, 'rb') as f:
- f.seek(-2, 2)
- while f.read(1) != b'\n':
- f.seek(-2, 1)
- if f.tell() == 0:
- break
- result = f.readline()
-
- result = result.decode()
- result = self.regex.findall(result)
-
- queue.put([basename(disk), result])
-
- def _get_data(self):
- threads, result = list(), list()
- queue = Queue()
- to_netdata = dict()
-
- # If the size has not changed there is no reason to poll log files.
- disks = [disk for disk in self.disks if self.size_changed(disk)]
- if disks:
- for disk in disks:
- th = Thread(target=self._get_raw_data, args=(queue, disk.name))
- th.start()
- threads.append(th)
-
- for thread in threads:
- thread.join()
- result.append(queue.get())
+ self.disks = self.scan()
+
+ if not self.disks:
+ return None
+
+ user_defined_sa = self.configuration.get('smart_attributes')
+
+ if user_defined_sa:
+ order = user_defined_sa.split() or ORDER
else:
- # Data from last real poll
- return self.previous_data or None
+ order = ORDER
- for elem in result:
- for a, n, r in elem[1]:
- to_netdata.update({'_'.join([elem[0], a]): r if self.raw_values else n})
+ self.create_charts(order)
- self.previous_data.update(to_netdata)
+ return True
- return to_netdata or None
+ def get_data(self):
+ self.runs += 1
- def size_changed(self, disk):
- # We are not interested in log files:
- # 1. zero size
- # 2. size is not changed since last poll
- try:
- size = getsize(disk.name)
- if size != disk.size and size:
- self.disks[disk.number] = disk._replace(size=size)
- return True
- else:
- return False
- except OSError:
- # Remove unreadable/nonexisting log files from list of disks and previous_data
- self.disks.remove(disk)
- self.previous_data = dict([(k, v) for k, v in self.previous_data.items() if basename(disk.name) not in k])
- return False
+ if self.runs % RESCAN_INTERVAL == 0:
+ self.cleanup_and_rescan()
+
+ data = dict()
+
+ for disk in self.disks:
+
+ if not disk.status:
+ continue
+
+ changed = disk.log_file.is_changed()
+
+ # True = changed, False = unchanged, None = Exception
+ if changed is None:
+ disk.status = False
+ continue
+
+ if changed:
+ success = disk.get_attributes()
+ if not success:
+ disk.status = False
+ continue
+
+ data.update(disk.data())
+
+ return data or None
- def create_charts(self):
+ def create_charts(self, order):
+ for attr in order:
+ raw_name, normalized_name = 'attr_id_raw_' + attr, 'attr_id_normalized_' + attr
+ raw, normalized = chart_template(raw_name), chart_template(normalized_name)
+ self.order.extend([normalized_name, raw_name])
+ self.definitions.update(raw)
+ self.definitions.update(normalized)
- def create_lines(attrid):
- result = list()
for disk in self.disks:
- name = basename(disk.name)
- result.append(['_'.join([name, attrid]), name[:name.index('.')], 'absolute'])
- return result
+ if attr not in disk.attributes:
+ self.debug("'{disk}' has no attribute '{attr_id}'".format(disk=disk.name,
+ attr_id=attr))
+ continue
+ normalized[normalized_name]['lines'].append(['_'.join([disk.name, 'normalized', attr]), disk.name])
- # Use configured attributes, if present. If something goes wrong we don't care.
- order = ORDER
- try:
- order = [attr for attr in self.attr.split() if attr in SMART_ATTR.keys()] or ORDER
- except Exception:
- pass
- self.order = [''.join(['attrid', i]) for i in order]
- self.definitions = dict()
- units = 'raw' if self.raw_values else 'normalized'
-
- for k, v in dict([(k, v) for k, v in SMART_ATTR.items() if k in ORDER]).items():
- self.definitions.update({''.join(['attrid', k]): {
- 'options': [None, v, units, v.lower(), 'smartd.attrid' + k, 'line'],
- 'lines': create_lines(k)}})
-
-def find_disks_in_log_path(log_path):
- # smartd log file is OK if:
- # 1. it is a file
- # 2. file name endswith with 'csv'
- # 3. file is readable
- if not isdir(log_path): return None
- return [join(log_path, f) for f in listdir(log_path)
- if all([isfile(join(log_path, f)), f.endswith('.csv'), access(join(log_path, f), R_OK)])]
+ if not self.raw:
+ continue
+
+ if disk.attributes[attr].raw is not None:
+ raw[raw_name]['lines'].append(['_'.join([disk.name, 'raw', attr]), disk.name])
+ continue
+ self.debug("'{disk}' attribute '{attr_id}' value not in {limits}".format(disk=disk.name,
+ attr_id=attr,
+ limits=LIMITS[attr]))
+
+ def cleanup_and_rescan(self):
+ self.cleanup()
+ new_disks = self.scan(only_new=True)
+
+ for disk in new_disks:
+ valid = False
+
+ for chart in self.charts:
+ value_type, idx = chart.id.split('_')[2:]
+
+ if idx in disk.attributes:
+ valid = True
+ dimension_id = '_'.join([disk.name, value_type, idx])
+
+ if dimension_id in chart:
+ chart.hide_dimension(dimension_id=dimension_id, reverse=True)
+ else:
+ chart.add_dimension([dimension_id, disk.name])
+ if valid:
+ self.disks.append(disk)
+
+ def cleanup(self):
+ for disk in self.disks:
+
+ if not disk.is_active():
+ disk.status = False
+ if not disk.status:
+ for chart in self.charts:
+ dimension_id = '_'.join([disk.name, chart.id[8:]])
+ chart.hide_dimension(dimension_id=dimension_id)
+
+ self.disks = [disk for disk in self.disks if disk.status]
+
+ def scan(self, only_new=None):
+ new_disks = list()
+ for f in os.listdir(self.log_path):
+ full_path = os.path.join(self.log_path, f)
+
+ if DiskLogFile.is_valid(full_path, self.exclude):
+ disk = Disk(full_path, self.age)
+
+ active = disk.is_active()
+ if active is None:
+ continue
+ if active:
+ if not only_new:
+ new_disks.append(disk)
+ else:
+ if disk not in self.disks:
+ new_disks.append(disk)
+ else:
+ if not only_new:
+ self.debug("'{disk}' not updated in the last {age} minutes, "
+ "skipping it.".format(disk=disk.name, age=self.age))
+ return new_disks