diff options
Diffstat (limited to '')
-rw-r--r-- | src/pybind/mgr/insights/module.py | 327 |
1 files changed, 327 insertions, 0 deletions
diff --git a/src/pybind/mgr/insights/module.py b/src/pybind/mgr/insights/module.py new file mode 100644 index 000000000..12c88e73e --- /dev/null +++ b/src/pybind/mgr/insights/module.py @@ -0,0 +1,327 @@ +import datetime +import json +import re +import threading + +from mgr_module import CLICommand, CLIReadCommand, HandleCommandResult +from mgr_module import MgrModule, CommandResult, NotifyType +from . import health as health_util + +# hours of crash history to report +CRASH_HISTORY_HOURS = 24 +# hours of health history to report +HEALTH_HISTORY_HOURS = 24 +# how many hours of health history to keep +HEALTH_RETENTION_HOURS = 30 +# health check name for insights health +INSIGHTS_HEALTH_CHECK = "MGR_INSIGHTS_WARNING" +# version tag for persistent data format +ON_DISK_VERSION = 1 + + +class Module(MgrModule): + + NOTIFY_TYPES = [NotifyType.health] + + def __init__(self, *args, **kwargs): + super(Module, self).__init__(*args, **kwargs) + + self._shutdown = False + self._evt = threading.Event() + + # health history tracking + self._pending_health = [] + self._health_slot = None + self._store = {} + + # The following three functions, get_store, set_store, and get_store_prefix + # mask the functions defined in the parent to avoid storing large keys + # persistently to disk as that was proving problematic. Long term we may + # implement a different mechanism to make these persistent. When that day + # comes it should just be a matter of deleting these three functions. + def get_store(self, key): + return self._store.get(key) + + def set_store(self, key, value): + if value is None: + if key in self._store: + del self._store[key] + else: + self._store[key] = value + + def get_store_prefix(self, prefix): + return { k: v for k, v in self._store.items() if k.startswith(prefix) } + + + def notify(self, ttype: NotifyType, ident): + """Queue updates for processing""" + if ttype == NotifyType.health: + self.log.info("Received health check update {} pending".format( + len(self._pending_health))) + health = json.loads(self.get("health")["json"]) + self._pending_health.append(health) + self._evt.set() + + def serve(self): + self._health_reset() + while True: + self._evt.wait(health_util.PERSIST_PERIOD.total_seconds()) + self._evt.clear() + if self._shutdown: + break + + # when the current health slot expires, finalize it by flushing it to + # the store, and initializing a new empty slot. + if self._health_slot.expired(): + self.log.info("Health history slot expired {}".format( + self._health_slot)) + self._health_maybe_flush() + self._health_reset() + self._health_prune_history(HEALTH_RETENTION_HOURS) + + # fold in pending health snapshots and flush + self.log.info("Applying {} health updates to slot {}".format( + len(self._pending_health), self._health_slot)) + for health in self._pending_health: + self._health_slot.add(health) + self._pending_health = [] + self._health_maybe_flush() + + def shutdown(self): + self._shutdown = True + self._evt.set() + + def _health_reset(self): + """Initialize the current health slot + + The slot will be initialized with any state found to have already been + persisted, otherwise the slot will start empty. + """ + key = health_util.HealthHistorySlot.curr_key() + data = self.get_store(key) + if data: + init_health = json.loads(data) + self._health_slot = health_util.HealthHistorySlot(init_health) + else: + self._health_slot = health_util.HealthHistorySlot() + self.log.info("Reset curr health slot {}".format(self._health_slot)) + + def _health_maybe_flush(self): + """Store the health for the current time slot if needed""" + + self.log.info("Maybe flushing slot {} needed {}".format( + self._health_slot, self._health_slot.need_flush())) + + if self._health_slot.need_flush(): + key = self._health_slot.key() + + # build store data entry + slot = self._health_slot.health() + assert "version" not in slot + slot.update(dict(version = ON_DISK_VERSION)) + data = json.dumps(slot, cls=health_util.HealthEncoder) + + self.log.debug("Storing health key {} data {}".format( + key, json.dumps(slot, indent=2, cls=health_util.HealthEncoder))) + + self.set_store(key, data) + self._health_slot.mark_flushed() + + def _health_filter(self, f): + """Filter hourly health reports timestamp""" + matches = filter( + lambda t: f(health_util.HealthHistorySlot.key_to_time(t[0])), + self.get_store_prefix(health_util.HEALTH_HISTORY_KEY_PREFIX).items()) + return map(lambda t: t[0], matches) + + def _health_prune_history(self, hours): + """Prune old health entries""" + cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours = hours) + for key in self._health_filter(lambda ts: ts <= cutoff): + self.log.info("Removing old health slot key {}".format(key)) + self.set_store(key, None) + if not hours: + self._health_slot = health_util.HealthHistorySlot() + + def _health_report(self, hours): + """ + Report a consolidated health report for the past N hours. + """ + # roll up the past N hours of health info + collector = health_util.HealthHistorySlot() + keys = health_util.HealthHistorySlot.key_range(hours) + for key in keys: + data = self.get_store(key) + self.log.info("Reporting health key {} found {}".format( + key, bool(data))) + health = json.loads(data) if data else {} + slot = health_util.HealthHistorySlot(health) + collector.merge(slot) + + # include history that hasn't yet been flushed + collector.merge(self._health_slot) + + return dict( + current = json.loads(self.get("health")["json"]), + history = collector.health() + ) + + def _version_parse(self, version): + """ + Return the components of a Ceph version string. + + This returns nothing when the version string cannot be parsed into its + constituent components, such as when Ceph has been built with + ENABLE_GIT_VERSION=OFF. + """ + r = r"ceph version (?P<release>\d+)\.(?P<major>\d+)\.(?P<minor>\d+)" + m = re.match(r, version) + ver = {} if not m else { + "release": m.group("release"), + "major": m.group("major"), + "minor": m.group("minor") + } + return {k: int(v) for k, v in ver.items()} + + def _crash_history(self, hours): + """ + Load crash history for the past N hours from the crash module. + """ + params = dict( + prefix="crash json_report", + hours=hours + ) + + result = dict( + summary={}, + hours=params["hours"], + ) + + health_check_details = [] + + try: + _, _, crashes = self.remote("crash", "handle_command", "", params) + result["summary"] = json.loads(crashes) + except Exception as e: + errmsg = "failed to invoke crash module" + self.log.warning("{}: {}".format(errmsg, str(e))) + health_check_details.append(errmsg) + else: + self.log.debug("Crash module invocation succeeded {}".format( + json.dumps(result["summary"], indent=2))) + + return result, health_check_details + + def _apply_osd_stats(self, osd_map): + # map from osd id to its index in the map structure + osd_id_to_idx = {} + for idx in range(len(osd_map["osds"])): + osd_id_to_idx[osd_map["osds"][idx]["osd"]] = idx + + # include stats, including space utilization performance counters. + # adapted from dashboard api controller + for s in self.get('osd_stats')['osd_stats']: + try: + idx = osd_id_to_idx[s["osd"]] + osd_map["osds"][idx].update({'osd_stats': s}) + except KeyError as e: + self.log.warning("inconsistent api state: {}".format(str(e))) + + for osd in osd_map["osds"]: + osd['stats'] = {} + for s in ['osd.numpg', 'osd.stat_bytes', 'osd.stat_bytes_used']: + osd['stats'][s.split('.')[1]] = self.get_latest('osd', str(osd["osd"]), s) + + + def _config_dump(self): + """Report cluster configuration + + This report is the standard `config dump` report. It does not include + configuration defaults; these can be inferred from the version number. + """ + result = CommandResult("") + args = dict(prefix = "config dump", format = "json") + self.send_command(result, "mon", "", json.dumps(args), "") + ret, outb, outs = result.wait() + if ret == 0: + return json.loads(outb), [] + else: + self.log.warning("send_command 'config dump' failed. \ + ret={}, outs=\"{}\"".format(ret, outs)) + return [], ["Failed to read monitor config dump"] + + @CLIReadCommand('insights') + def do_report(self): + ''' + Retrieve insights report + ''' + health_check_details = [] + report = {} + + report.update({ + "version": dict(full = self.version, + **self._version_parse(self.version)) + }) + + # crash history + crashes, health_details = self._crash_history(CRASH_HISTORY_HOURS) + report["crashes"] = crashes + health_check_details.extend(health_details) + + # health history + report["health"] = self._health_report(HEALTH_HISTORY_HOURS) + + # cluster configuration + config, health_details = self._config_dump() + report["config"] = config + health_check_details.extend(health_details) + + osd_map = self.get("osd_map") + del osd_map['pg_temp'] + self._apply_osd_stats(osd_map) + report["osd_dump"] = osd_map + + report["df"] = self.get("df") + report["osd_tree"] = self.get("osd_map_tree") + report["fs_map"] = self.get("fs_map") + report["crush_map"] = self.get("osd_map_crush") + report["mon_map"] = self.get("mon_map") + report["service_map"] = self.get("service_map") + report["manager_map"] = self.get("mgr_map") + report["mon_status"] = json.loads(self.get("mon_status")["json"]) + report["pg_summary"] = self.get("pg_summary") + report["osd_metadata"] = self.get("osd_metadata") + + report.update({ + "errors": health_check_details + }) + + if health_check_details: + self.set_health_checks({ + INSIGHTS_HEALTH_CHECK: { + "severity": "warning", + "summary": "Generated incomplete Insights report", + "detail": health_check_details + } + }) + + result = json.dumps(report, indent=2, cls=health_util.HealthEncoder) + return HandleCommandResult(stdout=result) + + @CLICommand('insights prune-health') + def do_prune_health(self, hours: int): + ''' + Remove health history older than <hours> hours + ''' + self._health_prune_history(hours) + return HandleCommandResult() + + def testing_set_now_time_offset(self, hours): + """ + Control what "now" time it is by applying an offset. This is called from + the selftest module to manage testing scenarios related to tracking + health history. + """ + hours = int(hours) + health_util.NOW_OFFSET = datetime.timedelta(hours=hours) + self.log.warning("Setting now time offset {}".format(health_util.NOW_OFFSET)) |