summary refs log tree commit diff stats
path: root/src/pybind/mgr/diskprediction_local/module.py
diff options
context:
space:
mode:
Diffstat (limited to 'src/pybind/mgr/diskprediction_local/module.py')
-rw-r--r-- src/pybind/mgr/diskprediction_local/module.py 319
1 files changed, 319 insertions, 0 deletions
diff --git a/src/pybind/mgr/diskprediction_local/module.py b/src/pybind/mgr/diskprediction_local/module.py
new file mode 100644
index 000000000..948c59bd7
--- /dev/null
+++ b/src/pybind/mgr/diskprediction_local/module.py
@@ -0,0 +1,319 @@
+"""
+diskprediction with local predictor
+"""
+import json
+import datetime
+import _strptime
+from threading import Event
+import time
+
+from mgr_module import MgrModule, CommandResult
+
+# Importing scipy early appears to avoid a future deadlock when
+# we try to do
+#
+# from .predictor import get_diskfailurepredictor_path
+#
+# in a command thread. See https://tracker.ceph.com/issues/42764
+import scipy
+
+
+# strftime/strptime pattern used to store and log the last prediction time
+TIME_FORMAT = '%Y%m%d-%H%M%S'
+# NOTE: despite the plural name, this is the number of seconds in ONE day
+TIME_DAYS = 24*60*60
+# number of seconds in one week
+TIME_WEEK = TIME_DAYS * 7
+
+
+class Module(MgrModule):
+    # Module options.  __init__ copies each 'default' onto an instance
+    # attribute named after 'name'; config_notify()/refresh_config() then
+    # overwrite it with the value returned by get_module_option().
+    # The numeric defaults are stored as strings; serve() converts them
+    # with int() before use.
+    MODULE_OPTIONS = [
+        {
+            # seconds serve() waits between loop iterations
+            'name': 'sleep_interval',
+            'default': str(600),
+        },
+        {
+            # seconds between two prediction runs
+            'name': 'predict_interval',
+            'default': str(86400),
+        },
+        {
+            # predictor to use; _predict_life_expentancy() accepts
+            # 'prophetstor' and 'redhat'
+            'name': 'predictor_model',
+            'default': 'prophetstor',
+        },
+    ]
+
+    # no commands are declared; handle_command() rejects everything
+    COMMANDS = []
+
+ def __init__(self, *args, **kwargs):
+ super(Module, self).__init__(*args, **kwargs)
+ # options
+ for opt in self.MODULE_OPTIONS:
+ setattr(self, opt['name'], opt['default'])
+ # other
+ self._run = True
+ self._event = Event()
+
+ def config_notify(self):
+ for opt in self.MODULE_OPTIONS:
+ setattr(self,
+ opt['name'],
+ self.get_module_option(opt['name']))
+ self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))
+ if self.get_ceph_option('device_failure_prediction_mode') == 'local':
+ self._event.set()
+
+ def refresh_config(self):
+ for opt in self.MODULE_OPTIONS:
+ setattr(self,
+ opt['name'],
+ self.get_module_option(opt['name']))
+ self.log.debug(' %s = %s', opt['name'], getattr(self, opt['name']))
+
+ def handle_command(self, _, cmd):
+ self.log.debug('handle_command cmd: %s', cmd)
+ raise NotImplementedError(cmd['prefix'])
+
+ def self_test(self):
+ self.log.debug('self_test enter')
+ ret, out, err = self.predict_all_devices()
+ assert ret == 0
+ return 0, 'self test succeed', ''
+
+    def serve(self):
+        """
+        Main loop of the module.
+
+        Restores the time of the last prediction run from the module store,
+        then loops until shutdown() sets ``_run`` to False.  On every
+        iteration, if 'device_failure_prediction_mode' is 'local' and a run
+        is due, predict_all_devices() is called and the new time is stored.
+        """
+        self.log.info('Starting diskprediction local module')
+        self.config_notify()
+        last_predicted = None
+        # time of the previous prediction run, stored in TIME_FORMAT
+        ls = self.get_store('last_predicted')
+        if ls:
+            try:
+                last_predicted = datetime.datetime.strptime(ls, TIME_FORMAT)
+            except ValueError:
+                # stored value does not match TIME_FORMAT: carry on as if no
+                # prediction had ever been made
+                pass
+        self.log.debug('Last predicted %s', last_predicted)
+
+        while self._run:
+            self.refresh_config()
+            mode = self.get_ceph_option('device_failure_prediction_mode')
+            if mode == 'local':
+                now = datetime.datetime.utcnow()
+                if not last_predicted:
+                    # no previous run known: a run is due immediately
+                    next_predicted = now
+                else:
+                    # a configured interval of 0 falls back to 86400 seconds
+                    predicted_frequency = int(self.predict_interval) or 86400
+                    # seconds between the epoch and the last prediction run
+                    seconds = (last_predicted - datetime.datetime.utcfromtimestamp(0)).total_seconds()
+                    # round down to a multiple of the interval, then step to
+                    # the following multiple: that is when the next run is due
+                    seconds -= seconds % predicted_frequency
+                    seconds += predicted_frequency
+                    next_predicted = datetime.datetime.utcfromtimestamp(seconds)
+                # the messages say "scrape", but the times logged are the
+                # prediction times computed above
+                if last_predicted:
+                    self.log.debug('Last scrape %s, next scrape due %s',
+                                   last_predicted.strftime(TIME_FORMAT),
+                                   next_predicted.strftime(TIME_FORMAT))
+                else:
+                    self.log.debug('Last scrape never, next scrape due %s',
+                                   next_predicted.strftime(TIME_FORMAT))
+                if now >= next_predicted:
+                    self.predict_all_devices()
+                    last_predicted = now
+                    # stored so that the next serve() start can read it back
+                    self.set_store('last_predicted', last_predicted.strftime(TIME_FORMAT))
+
+            # a configured interval of 0 falls back to 60 seconds
+            sleep_interval = int(self.sleep_interval) or 60
+            self.log.debug('Sleeping for %d seconds', sleep_interval)
+            # returns early when config_notify() or shutdown() sets the event
+            self._event.wait(sleep_interval)
+            self._event.clear()
+
+    def shutdown(self):
+        """
+        Ask the serve() loop to stop and wake it if it is currently waiting.
+        """
+        self.log.info('Stopping')
+        # clear the loop condition first, then wake the waiter so that serve()
+        # re-evaluates it
+        self._run = False
+        self._event.set()
+
+ @staticmethod
+ def _convert_timestamp(predicted_timestamp, life_expectancy_day):
+ """
+ :param predicted_timestamp: unit is nanoseconds
+ :param life_expectancy_day: unit is seconds
+ :return:
+ date format '%Y-%m-%d' ex. 2018-01-01
+ """
+ return datetime.datetime.fromtimestamp(
+ predicted_timestamp / (1000 ** 3) + life_expectancy_day).strftime('%Y-%m-%d')
+
+ def _predict_life_expentancy(self, devid):
+ predicted_result = ''
+ health_data = {}
+ predict_datas = []
+ try:
+ r, outb, outs = self.remote('devicehealth', 'show_device_metrics', devid=devid, sample='')
+ if r != 0:
+ self.log.error('failed to get device %s health', devid)
+ health_data = {}
+ else:
+ health_data = json.loads(outb)
+ except Exception as e:
+ self.log.error('failed to get device %s health data due to %s', devid, str(e))
+
+ # initialize appropriate disk failure predictor model
+ from .predictor import get_diskfailurepredictor_path
+ if self.predictor_model == 'prophetstor':
+ from .predictor import PSDiskFailurePredictor
+ obj_predictor = PSDiskFailurePredictor()
+ ret = obj_predictor.initialize("{}/models/{}".format(get_diskfailurepredictor_path(), self.predictor_model))
+ if ret is not None:
+ self.log.error('Error initializing predictor')
+ return predicted_result
+ elif self.predictor_model == 'redhat':
+ from .predictor import RHDiskFailurePredictor
+ obj_predictor = RHDiskFailurePredictor()
+ ret = obj_predictor.initialize("{}/models/{}".format(get_diskfailurepredictor_path(), self.predictor_model))
+ if ret is not None:
+ self.log.error('Error initializing predictor')
+ return predicted_result
+ else:
+ self.log.error('invalid value received for MODULE_OPTIONS.predictor_model')
+ return predicted_result
+
+ if len(health_data) >= 6:
+ o_keys = sorted(health_data.keys(), reverse=True)
+ for o_key in o_keys:
+ # get values for current day (?)
+ dev_smart = {}
+ s_val = health_data[o_key]
+
+ # add all smart attributes
+ ata_smart = s_val.get('ata_smart_attributes', {})
+ for attr in ata_smart.get('table', []):
+ # get raw smart values
+ if attr.get('raw', {}).get('string') is not None:
+ if str(attr.get('raw', {}).get('string', '0')).isdigit():
+ dev_smart['smart_%s_raw' % attr.get('id')] = \
+ int(attr.get('raw', {}).get('string', '0'))
+ else:
+ if str(attr.get('raw', {}).get('string', '0')).split(' ')[0].isdigit():
+ dev_smart['smart_%s_raw' % attr.get('id')] = \
+ int(attr.get('raw', {}).get('string',
+ '0').split(' ')[0])
+ else:
+ dev_smart['smart_%s_raw' % attr.get('id')] = \
+ attr.get('raw', {}).get('value', 0)
+ # get normalized smart values
+ if attr.get('value') is not None:
+ dev_smart['smart_%s_normalized' % attr.get('id')] = \
+ attr.get('value')
+ # add power on hours manually if not available in smart attributes
+ if s_val.get('power_on_time', {}).get('hours') is not None:
+ dev_smart['smart_9_raw'] = int(s_val['power_on_time']['hours'])
+ # add device capacity
+ if s_val.get('user_capacity') is not None:
+ if s_val.get('user_capacity').get('bytes') is not None:
+ dev_smart['user_capacity'] = s_val.get('user_capacity').get('bytes')
+ else:
+ self.log.debug('user_capacity not found in smart attributes list')
+ # add device model
+ if s_val.get('model_name') is not None:
+ dev_smart['model_name'] = s_val.get('model_name')
+ # add vendor
+ if s_val.get('vendor') is not None:
+ dev_smart['vendor'] = s_val.get('vendor')
+ # if smart data was found, then add that to list
+ if dev_smart:
+ predict_datas.append(dev_smart)
+ if len(predict_datas) >= 12:
+ break
+ else:
+ self.log.error('unable to predict device due to health data records less than 6 days')
+
+ if len(predict_datas) >= 6:
+ predicted_result = obj_predictor.predict(predict_datas)
+ return predicted_result
+
+ def predict_life_expectancy(self, devid):
+ result = self._predict_life_expentancy(devid)
+ if result.lower() == 'good':
+ return 0, '>6w', ''
+ elif result.lower() == 'warning':
+ return 0, '>=2w and <=6w', ''
+ elif result.lower() == 'bad':
+ return 0, '<2w', ''
+ else:
+ return 0, 'unknown', ''
+
+ def _reset_device_life_expectancy(self, device_id):
+ result = CommandResult('')
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'device rm-life-expectancy',
+ 'devid': device_id
+ }), '')
+ ret, _, outs = result.wait()
+ if ret != 0:
+ self.log.error(
+ 'failed to reset device life expectancy, %s' % outs)
+ return ret
+
+ def _set_device_life_expectancy(self, device_id, from_date, to_date=None):
+ result = CommandResult('')
+
+ if to_date is None:
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'device set-life-expectancy',
+ 'devid': device_id,
+ 'from': from_date
+ }), '')
+ else:
+ self.send_command(result, 'mon', '', json.dumps({
+ 'prefix': 'device set-life-expectancy',
+ 'devid': device_id,
+ 'from': from_date,
+ 'to': to_date
+ }), '')
+ ret, _, outs = result.wait()
+ if ret != 0:
+ self.log.error(
+ 'failed to set device life expectancy, %s' % outs)
+ return ret
+
+ def predict_all_devices(self):
+ self.log.debug('predict_all_devices')
+ devices = self.get('devices').get('devices', [])
+ for devInfo in devices:
+ if not devInfo.get('daemons'):
+ continue
+ if not devInfo.get('devid'):
+ continue
+ self.log.debug('%s' % devInfo)
+ result = self._predict_life_expentancy(devInfo['devid'])
+ if result == 'unknown':
+ self._reset_device_life_expectancy(devInfo['devid'])
+ continue
+ predicted = int(time.time() * (1000 ** 3))
+
+ if result.lower() == 'good':
+ life_expectancy_day_min = (TIME_WEEK * 6) + TIME_DAYS
+ life_expectancy_day_max = None
+ elif result.lower() == 'warning':
+ life_expectancy_day_min = (TIME_WEEK * 2)
+ life_expectancy_day_max = (TIME_WEEK * 6)
+ elif result.lower() == 'bad':
+ life_expectancy_day_min = 0
+ life_expectancy_day_max = (TIME_WEEK * 2) - TIME_DAYS
+ else:
+ predicted = None
+ life_expectancy_day_min = None
+ life_expectancy_day_max = None
+
+ if predicted and devInfo['devid'] and life_expectancy_day_min:
+ from_date = None
+ to_date = None
+ try:
+ if life_expectancy_day_min:
+ from_date = self._convert_timestamp(predicted, life_expectancy_day_min)
+
+ if life_expectancy_day_max:
+ to_date = self._convert_timestamp(predicted, life_expectancy_day_max)
+
+ self._set_device_life_expectancy(devInfo['devid'], from_date, to_date)
+ self._logger.info(
+ 'succeed to set device {} life expectancy from: {}, to: {}'.format(
+ devInfo['devid'], from_date, to_date))
+ except Exception as e:
+ self._logger.error(
+ 'failed to set device {} life expectancy from: {}, to: {}, {}'.format(
+ devInfo['devid'], from_date, to_date, str(e)))
+ else:
+ self._reset_device_life_expectancy(devInfo['devid'])
+ return 0, 'succeed to predicted all devices', ''