8 files changed, 862 insertions, 0 deletions
diff --git a/src/pybind/mgr/insights/CMakeLists.txt b/src/pybind/mgr/insights/CMakeLists.txt
new file mode 100644
index 00000000..00722a99
--- /dev/null
+++ b/src/pybind/mgr/insights/CMakeLists.txt
@@ -0,0 +1,7 @@
+set(MGR_INSIGHTS_VIRTUALENV ${CEPH_BUILD_VIRTUALENV}/mgr-insights-virtualenv)
+
+add_custom_target(mgr-insights-test-venv
+  COMMAND ${CMAKE_SOURCE_DIR}/src/tools/setup-virtualenv.sh --python=${MGR_PYTHON_EXECUTABLE} ${MGR_INSIGHTS_VIRTUALENV}
+  WORKING_DIRECTORY ${CMAKE_SOURCE_DIR}/src/pybind/mgr/insights
+  COMMENT "insights tests virtualenv is being created")
+add_dependencies(tests mgr-insights-test-venv)
diff --git a/src/pybind/mgr/insights/__init__.py b/src/pybind/mgr/insights/__init__.py
new file mode 100644
index 00000000..ea61a12f
--- /dev/null
+++ b/src/pybind/mgr/insights/__init__.py
@@ -0,0 +1,9 @@
+from __future__ import absolute_import
+import os
+
+if 'UNITTEST' not in os.environ:
+    from .module import Module
+else:
+    import sys
+    import mock
+    sys.modules['ceph_module'] = mock.Mock()
diff --git a/src/pybind/mgr/insights/health.py b/src/pybind/mgr/insights/health.py
new file mode 100644
index 00000000..5235ca84
--- /dev/null
+++ b/src/pybind/mgr/insights/health.py
@@ -0,0 +1,191 @@
+import json
+import six
+from collections import defaultdict
+import datetime
+
+# freq to write cached state to disk
+PERSIST_PERIOD = datetime.timedelta(seconds = 10)
+# on disk key prefix
+HEALTH_HISTORY_KEY_PREFIX = "health_history/"
+# apply an offset to "now": used for testing
+NOW_OFFSET = None
+
+class HealthEncoder(json.JSONEncoder):
+    def default(self, obj):
+        # sets have no JSON form: emit them as lists, defer everything else
+        is_set = isinstance(obj, set)
+        return list(obj) if is_set else json.JSONEncoder.default(self, obj)
+
+class HealthCheckAccumulator(object):
+    """
+    Deduplicated storage of health checks.
+    """
+    def __init__(self, init_checks = None):
+        # check : severity : { summary, detail }
+        # summary and detail are deduplicated
+        self._checks = defaultdict(lambda:
+            defaultdict(lambda: {
+                "summary": set(),
+                "detail": set()
+            }))
+
+        if init_checks:
+            self._update(init_checks)
+
+    def __str__(self):
+        return "check count {}".format(len(self._checks))
+
+    def add(self, checks):
+        """
+        Add health checks to the current state.
+
+        Checks with severity HEALTH_OK are skipped; every other check
+        contributes its summary message and all of its detail messages.
+
+        Returns:
+            bool: True if the state changed, False otherwise.
+        """
+        changed = False
+        for check, info in six.iteritems(checks):
+            severity = info["severity"]
+            if severity == "HEALTH_OK":
+                continue
+
+            summaries = [info["summary"]["message"]]
+            # build a real list: map() is a one-shot iterator on Python 3
+            details = [d["message"] for d in info["detail"]]
+            if self._add_check(check, severity, summaries, details):
+                changed = True
+
+        return changed
+
+    def checks(self):
+        return self._checks
+
+    def merge(self, other):
+        assert isinstance(other, HealthCheckAccumulator)
+        self._update(other._checks)
+
+    def _update(self, checks):
+        """Fold in checks shaped like self._checks; change flag is ignored"""
+        for check, by_severity in checks.items():
+            for severity, entry in by_severity.items():
+                # copy into sets so that lists and tuples are deduplicated
+                summaries, details = set(entry["summary"]), set(entry["detail"])
+                self._add_check(check, severity, summaries, details)
+
+    def _add_check(self, check, severity, summaries, details):
+        """Record messages for (check, severity); True if any was new."""
+        changed = False
+
+        # NOTE: the entry is looked up per message on purpose, so that no
+        # entry is created when both summaries and details are empty.
+        for field, messages in (("summary", summaries), ("detail", details)):
+            for message in messages:
+                known = self._checks[check][severity][field]
+                if message not in known:
+                    known.add(message)
+                    changed = True
+
+        return changed
+
+class HealthHistorySlot(object):
+    """
+    Manage the life cycle of a health history time slot.
+
+    A time slot is a fixed slice of wall clock time (e.g. every hour, from :00
+    to :59), and all health updates that occur during this time are deduplicated
+    together. A slot is initially in a clean state, and becomes dirty when a new
+    health check is observed. The state of a slot should be persisted when
+    need_flush returns true. Once the state has been flushed, reset the dirty
+    bit by calling mark_flushed.
+    """
+    def __init__(self, init_health = dict()):
+        self._checks = HealthCheckAccumulator(init_health.get("checks"))
+        self._slot = self._curr_slot()
+        self._next_flush = None
+
+    def __str__(self):
+        return "key {} next flush {} checks {}".format(
+            self.key(), self._next_flush, self._checks)
+
+    def health(self):
+        return dict(checks = self._checks.checks())
+
+    def key(self):
+        """Identifier in the persist store"""
+        return self._key(self._slot)
+
+    def expired(self):
+        """True if this slot is no longer the current slot, False otherwise"""
+        return self._slot != self._curr_slot()
+
+    def need_flush(self):
+        """True if this slot needs to be flushed, False otherwise"""
+        now = HealthHistorySlot._now()
+        if self._next_flush is not None:
+            if self._next_flush <= now or self.expired():
+                return True
+        return False
+
+    def mark_flushed(self):
+        """Reset the dirty bit. Caller persists state"""
+        assert self._next_flush
+        self._next_flush = None
+
+    def add(self, health):
+        """
+        Add health to the underlying health accumulator. When the slot
+        transitions from clean to dirty a target flush time is computed.
+        """
+        changed = self._checks.add(health["checks"])
+        if changed and not self._next_flush:
+            self._next_flush = HealthHistorySlot._now() + PERSIST_PERIOD
+        return changed
+
+    def merge(self, other):
+        assert isinstance(other, HealthHistorySlot)
+        self._checks.merge(other._checks)
+
+    @staticmethod
+    def key_range(hours):
+        """Return the time slot keys for the past N hours"""
+        def inner(curr, hours):
+            slot = curr - datetime.timedelta(hours = hours)
+            return HealthHistorySlot._key(slot)
+        curr = HealthHistorySlot._curr_slot()
+        return map(lambda i: inner(curr, i), range(hours))
+
+    @staticmethod
+    def curr_key():
+        """Key for the current UTC time slot"""
+        return HealthHistorySlot._key(HealthHistorySlot._curr_slot())
+
+    @staticmethod
+    def key_to_time(key):
+        """Return key converted into datetime"""
+        timestr = key[len(HEALTH_HISTORY_KEY_PREFIX):]
+        return datetime.datetime.strptime(timestr, "%Y-%m-%d_%H")
+
+    @staticmethod
+    def _key(dt):
+        """Key format. Example: health_history/2018-11-05_00"""
+        return HEALTH_HISTORY_KEY_PREFIX + dt.strftime("%Y-%m-%d_%H")
+
+    @staticmethod
+    def _now():
+        """Control now time for easier testing"""
+        now = datetime.datetime.utcnow()
+        if NOW_OFFSET is not None:
+            now = now + NOW_OFFSET
+        return now
+
+    @staticmethod
+    def _curr_slot():
+        """Slot for the current UTC time"""
+        dt = HealthHistorySlot._now()
+        return datetime.datetime(
+            year  = dt.year,
+            month = dt.month,
+            day   = dt.day,
+            hour  = dt.hour)
diff --git a/src/pybind/mgr/insights/module.py b/src/pybind/mgr/insights/module.py
new file mode 100644
index 00000000..052295a1
--- /dev/null
+++ b/src/pybind/mgr/insights/module.py
@@ -0,0 +1,322 @@
+import datetime
+import json
+import re
+import threading
+import six
+from mgr_module import MgrModule, CommandResult
+from . import health as health_util
+
+# hours of crash history to report
+CRASH_HISTORY_HOURS = 24
+# hours of health history to report
+HEALTH_HISTORY_HOURS = 24
+# how many hours of health history to keep
+HEALTH_RETENTION_HOURS = 30
+# health check name for insights health
+INSIGHTS_HEALTH_CHECK = "MGR_INSIGHTS_WARNING"
+# version tag for persistent data format
+ON_DISK_VERSION = 1
+
+class Module(MgrModule):
+    COMMANDS = [
+        {
+            "cmd": "insights",
+            "desc": "Retrieve insights report",
+            "perm": "r",
+            "poll": "false",
+        },
+        {
+            'cmd': 'insights prune-health name=hours,type=CephString',
+            'desc': 'Remove health history older than <hours> hours',
+            'perm': 'rw',
+            "poll": "false",
+        },
+    ]
+
+    def __init__(self, *args, **kwargs):
+        super(Module, self).__init__(*args, **kwargs)
+
+        self._shutdown = False
+        self._evt = threading.Event()
+
+        # health history tracking
+        self._pending_health = []
+        self._health_slot = None
+
+    def notify(self, ttype, ident):
+        """Queue updates for processing"""
+        if ttype == "health":
+            self.log.info("Received health check update {} pending".format(
+                len(self._pending_health)))
+            health = json.loads(self.get("health")["json"])
+            self._pending_health.append(health)
+            self._evt.set()
+
+    def serve(self):
+        """Main loop: fold queued health updates into the current slot."""
+        self._health_reset()
+        while True:
+            self._evt.wait(health_util.PERSIST_PERIOD.total_seconds())
+            self._evt.clear()
+            if self._shutdown:
+                break
+
+            # slot expired: flush it, start a fresh slot and prune old history
+            if self._health_slot.expired():
+                self.log.info("Health history slot expired {}".format(
+                    self._health_slot))
+                self._health_maybe_flush()
+                self._health_reset()
+                self._health_prune_history(HEALTH_RETENTION_HOURS)
+
+            # detach the queue first: updates notify() appends later are kept
+            pending, self._pending_health = self._pending_health, []
+            self.log.info("Applying {} health updates to slot {}".format(
+                len(pending), self._health_slot))
+            for health in pending:
+                self._health_slot.add(health)
+            self._health_maybe_flush()
+
+    def shutdown(self):
+        self._shutdown = True
+        self._evt.set()
+
+    def _health_reset(self):
+        """Initialize the current health slot
+
+        The slot will be initialized with any state found to have already been
+        persisted, otherwise the slot will start empty.
+        """
+        key = health_util.HealthHistorySlot.curr_key()
+        data = self.get_store(key)
+        if data:
+            init_health = json.loads(data)
+            self._health_slot = health_util.HealthHistorySlot(init_health)
+        else:
+            self._health_slot = health_util.HealthHistorySlot()
+        self.log.info("Reset curr health slot {}".format(self._health_slot))
+
+    def _health_maybe_flush(self):
+        """Store the health for the current time slot if needed"""
+
+        self.log.info("Maybe flushing slot {} needed {}".format(
+            self._health_slot, self._health_slot.need_flush()))
+
+        if self._health_slot.need_flush():
+            key = self._health_slot.key()
+
+            # build store data entry
+            slot = self._health_slot.health()
+            assert "version" not in slot
+            slot.update(dict(version = ON_DISK_VERSION))
+            data = json.dumps(slot, cls=health_util.HealthEncoder)
+
+            self.log.debug("Storing health key {} data {}".format(
+                key, json.dumps(slot, indent=2, cls=health_util.HealthEncoder)))
+
+            self.set_store(key, data)
+            self._health_slot.mark_flushed()
+
+    def _health_filter(self, f):
+        """Filter hourly health reports timestamp"""
+        matches = filter(
+            lambda t: f(health_util.HealthHistorySlot.key_to_time(t[0])),
+            six.iteritems(self.get_store_prefix(health_util.HEALTH_HISTORY_KEY_PREFIX)))
+        return map(lambda t: t[0], matches)
+
+    def _health_prune_history(self, hours):
+        """Prune old health entries"""
+        cutoff = datetime.datetime.utcnow() - datetime.timedelta(hours = hours)
+        for key in self._health_filter(lambda ts: ts <= cutoff):
+            self.log.info("Removing old health slot key {}".format(key))
+            self.set_store(key, None)
+        if not hours:
+            self._health_slot = health_util.HealthHistorySlot()
+
+    def _health_report(self, hours):
+        """
+        Report a consolidated health report for the past N hours.
+        """
+        # roll up the past N hours of health info
+        collector = health_util.HealthHistorySlot()
+        keys = health_util.HealthHistorySlot.key_range(hours)
+        for key in keys:
+            data = self.get_store(key)
+            self.log.info("Reporting health key {} found {}".format(
+                key, bool(data)))
+            health = json.loads(data) if data else {}
+            slot = health_util.HealthHistorySlot(health)
+            collector.merge(slot)
+
+        # include history that hasn't yet been flushed
+        collector.merge(self._health_slot)
+
+        return dict(
+           current = json.loads(self.get("health")["json"]),
+           history = collector.health()
+        )
+
+    def _version_parse(self, version):
+        """
+        Split a Ceph version string into integer components.
+
+        Returns a dict with "release", "major" and "minor" keys, or an empty
+        dict when the string cannot be parsed, such as when Ceph has been
+        built with ENABLE_GIT_VERSION=OFF.
+        """
+        pattern = r"ceph version (?P<release>\d+)\.(?P<major>\d+)\.(?P<minor>\d+)"
+        found = re.match(pattern, version)
+        if found is None:
+            return {}
+        # the named groups are exactly the keys of the result
+        return dict(
+            (name, int(text)) for name, text in found.groupdict().items()
+        )
+
+    def _crash_history(self, hours):
+        """
+        Load crash history for the past N hours from the crash module.
+        """
+        params = dict(
+            prefix = "crash json_report",
+            hours = hours
+        )
+
+        result = dict(
+            summary = {},
+            hours = params["hours"],
+        )
+
+        health_check_details = []
+
+        try:
+            _, _, crashes = self.remote("crash", "handle_command", "", params)
+            result["summary"] = json.loads(crashes)
+        except Exception as e:
+            errmsg = "failed to invoke crash module"
+            self.log.warning("{}: {}".format(errmsg, str(e)))
+            health_check_details.append(errmsg)
+        else:
+            self.log.debug("Crash module invocation succeeded {}".format(
+                json.dumps(result["summary"], indent=2)))
+
+        return result, health_check_details
+
+    def _apply_osd_stats(self, osd_map):
+        """Annotate each entry of osd_map["osds"] in place with its stats."""
+        osds = osd_map["osds"]
+        # map from osd id to its position in the osds list
+        position_of = dict((e["osd"], i) for i, e in enumerate(osds))
+
+        # attach the matching 'osd_stats' record to each osd entry.
+        # adapted from dashboard api controller
+        for stat in self.get('osd_stats')['osd_stats']:
+            try:
+                osds[position_of[stat["osd"]]].update({'osd_stats': stat})
+            except KeyError as e:
+                self.log.warning("inconsistent api state: {}".format(str(e)))
+
+        # add selected counters (read through get_latest) under 'stats'
+        for entry in osds:
+            entry['stats'] = {}
+            for counter in ['osd.numpg', 'osd.stat_bytes', 'osd.stat_bytes_used']:
+                entry['stats'][counter.split('.')[1]] = self.get_latest('osd', str(entry["osd"]), counter)
+
+
+    def _config_dump(self):
+        """Report cluster configuration
+
+        This report is the standard `config dump` report. It does not include
+        configuration defaults; these can be inferred from the version number.
+        Returns a (config, errors) pair; config is [] when the command fails.
+        """
+        result = CommandResult("")
+        args = dict(prefix = "config dump", format = "json")
+        self.send_command(result, "mon", "", json.dumps(args), "")
+        ret, outb, outs = result.wait()
+        if ret == 0:
+            return json.loads(outb), []
+        self.log.warning("send_command 'config dump' failed. "
+                         "ret={}, outs=\"{}\"".format(ret, outs))
+        return [], ["Failed to read monitor config dump"]
+
+    def do_report(self, inbuf, command):
+        health_check_details = []
+        report = {}
+
+        report.update({
+            "version": dict(full = self.version,
+                **self._version_parse(self.version))
+        })
+
+        # crash history
+        crashes, health_details = self._crash_history(CRASH_HISTORY_HOURS)
+        report["crashes"] = crashes
+        health_check_details.extend(health_details)
+
+        # health history
+        report["health"] = self._health_report(HEALTH_HISTORY_HOURS)
+
+        # cluster configuration
+        config, health_details = self._config_dump()
+        report["config"] = config
+        health_check_details.extend(health_details)
+
+        osd_map = self.get("osd_map")
+        del osd_map['pg_temp']
+        self._apply_osd_stats(osd_map)
+        report["osd_dump"] = osd_map
+
+        report["df"] = self.get("df")
+        report["osd_tree"] = self.get("osd_map_tree")
+        report["fs_map"] = self.get("fs_map")
+        report["crush_map"] = self.get("osd_map_crush")
+        report["mon_map"] = self.get("mon_map")
+        report["service_map"] = self.get("service_map")
+        report["manager_map"] = self.get("mgr_map")
+        report["mon_status"] = json.loads(self.get("mon_status")["json"])
+        report["pg_summary"] = self.get("pg_summary")
+        report["osd_metadata"] = self.get("osd_metadata")
+
+        report.update({
+            "errors": health_check_details
+        })
+
+        if health_check_details:
+            self.set_health_checks({
+                INSIGHTS_HEALTH_CHECK: {
+                    "severity": "warning",
+                    "summary": "Generated incomplete Insights report",
+                    "detail": health_check_details
+                }
+            })
+
+        return 0, json.dumps(report, indent=2, cls=health_util.HealthEncoder), ""
+
+    def do_prune_health(self, inbuf, command):
+        """Prune health history older than command['hours'] hours."""
+        import errno  # local import: the module's import block lacks errno
+        try:
+            hours = int(command['hours'])
+        except ValueError:
+            return errno.EINVAL, '', 'hours argument must be integer'
+        self._health_prune_history(hours)
+        return 0, "", ""
+
+    def testing_set_now_time_offset(self, hours):
+        """
+        Control what "now" time it is by applying an offset. This is called from
+        the selftest module to manage testing scenarios related to tracking
+        health history.
+        """
+        hours = int(hours)  # long() does not exist on Python 3
+        health_util.NOW_OFFSET = datetime.timedelta(hours = hours)
+        self.log.warning("Setting now time offset {}".format(health_util.NOW_OFFSET))
+
+    def handle_command(self, inbuf, command):
+        """Dispatch on command["prefix"]; unknown prefixes raise."""
+        if command["prefix"] == "insights":
+            return self.do_report(inbuf, command)
+        if command["prefix"] == "insights prune-health":
+            return self.do_prune_health(inbuf, command)
+        raise NotImplementedError(command["prefix"])
diff --git a/src/pybind/mgr/insights/run-tox.sh b/src/pybind/mgr/insights/run-tox.sh
new file mode 100644
index 00000000..7d621450
--- /dev/null
+++ b/src/pybind/mgr/insights/run-tox.sh
@@ -0,0 +1,43 @@
+#!/usr/bin/env bash
+
+function dump_envvars {
+  echo "WITH_PYTHON2: ->$WITH_PYTHON2<-"
+  echo "WITH_PYTHON3: ->$WITH_PYTHON3<-"
+  echo "TOX_PATH: ->$TOX_PATH<-"
+  echo "ENV_LIST: ->$ENV_LIST<-"
+}
+
+# run from ./ or from ../
+: ${CEPH_BUILD_DIR:=$PWD/.tox}
+: ${MGR_INSIGHTS_VIRTUALENV:=$CEPH_BUILD_DIR/mgr-insights-virtualenv}
+: ${WITH_PYTHON2:=ON}
+: ${WITH_PYTHON3:=3}
+test -d insights && cd insights
+
+if [ -e tox.ini ]; then
+    TOX_PATH=$(readlink -f tox.ini)
+else
+    TOX_PATH=$(readlink -f $(dirname $0)/tox.ini)
+fi
+
+# tox.ini will take care of this.
+unset PYTHONPATH
+export CEPH_BUILD_DIR=$CEPH_BUILD_DIR
+
+source ${MGR_INSIGHTS_VIRTUALENV}/bin/activate
+
+if [ "$WITH_PYTHON2" = "ON" ]; then
+  ENV_LIST+="py27,"
+fi
+# WITH_PYTHON3 might be set to "ON" or to the python3 RPM version number
+# prevailing on the system - e.g. "3", "36"
+if [[ "$WITH_PYTHON3" =~ (^3|^ON) ]]; then
+  ENV_LIST+="py3,"
+fi
+# use bash string manipulation to strip off any trailing comma
+ENV_LIST=${ENV_LIST%,}
+
+tox -c "${TOX_PATH}" -e "${ENV_LIST}" "$@"
+TOX_STATUS="$?"
+test "$TOX_STATUS" -ne "0" && dump_envvars
+exit $TOX_STATUS
diff --git a/src/pybind/mgr/insights/tests/__init__.py b/src/pybind/mgr/insights/tests/__init__.py
new file mode 100644
index 00000000..e69de29b
--- /dev/null
+++ b/src/pybind/mgr/insights/tests/__init__.py
diff --git a/src/pybind/mgr/insights/tests/test_health.py b/src/pybind/mgr/insights/tests/test_health.py
new file mode 100644
index 00000000..9b34786d
--- /dev/null
+++ b/src/pybind/mgr/insights/tests/test_health.py
@@ -0,0 +1,273 @@
+import unittest
+import mock
+from ..health import *
+
+class HealthChecksTest(unittest.TestCase):
+    def test_check_accum_empty(self):
+        # health checks accum initially empty reports empty
+        h = HealthCheckAccumulator()
+        self.assertEqual(h.checks(), {})
+
+        h = HealthCheckAccumulator({})
+        self.assertEqual(h.checks(), {})
+
+    def _get_init_checks(self):
+        return HealthCheckAccumulator({
+            "C0": {
+                "S0": {
+                    "summary": ["s0", "s1"],
+                    "detail": ("d0", "d1")
+                }
+            }
+        })
+
+    def test_check_init(self):
+        # initialization with lists and tuples is OK
+        h = self._get_init_checks()
+        self.assertEqual(h.checks(), {
+            "C0": {
+                "S0": {
+                    "summary": set(["s0", "s1"]),
+                    "detail": set(["d0", "d1"])
+                }
+            }
+        })
+
+    def _get_merged_checks(self):
+        h = self._get_init_checks()
+        h.merge(HealthCheckAccumulator({
+            "C0": {
+                "S0": {
+                    "summary": ["s0", "s1", "s2"],
+                    "detail": ("d2",)
+                },
+                "S1": {
+                    "summary": ["s0", "s1", "s2"],
+                    "detail": ()
+                }
+            },
+            "C1": {
+                "S0": {
+                    "summary": [],
+                    "detail": ("d0", "d1", "d2")
+                }
+            }
+        }))
+        return h
+
+    def test_check_merge(self):
+        # merging combines and de-duplicates
+        h = self._get_merged_checks()
+        self.assertEqual(h.checks(), {
+            "C0": {
+                "S0": {
+                    "summary": set(["s0", "s1", "s2"]),
+                    "detail": set(["d0", "d1", "d2"])
+                },
+                "S1": {
+                    "summary": set(["s0", "s1", "s2"]),
+                    "detail": set([])
+                }
+            },
+            "C1": {
+                "S0": {
+                    "summary": set([]),
+                    "detail": set(["d0", "d1", "d2"])
+                }
+            }
+        })
+
+    def test_check_add_no_change(self):
+        # returns false when nothing changes
+        h = self._get_merged_checks()
+
+        self.assertFalse(h.add({}))
+
+        self.assertFalse(h.add({
+            "C0": {
+                "severity": "S0",
+                "summary": { "message": "s0" },
+                "detail": []
+            }
+        }))
+
+        self.assertFalse(h.add({
+            "C0": {
+                "severity": "S0",
+                "summary": { "message": "s1" },
+                "detail": [{ "message": "d1" }]
+            }
+        }))
+
+        self.assertFalse(h.add({
+            "C0": {
+                "severity": "S0",
+                "summary": { "message": "s0" },
+                "detail": [{ "message": "d1" }, { "message": "d2" }]
+            }
+        }))
+
+    def test_check_add_changed(self):
+        # new checks report change
+        h = self._get_merged_checks()
+
+        self.assertTrue(h.add({
+            "C0": {
+                "severity": "S0",
+                "summary": { "message": "s3" },
+                "detail": []
+            }
+        }))
+
+        self.assertTrue(h.add({
+            "C0": {
+                "severity": "S0",
+                "summary": { "message": "s1" },
+                "detail": [{ "message": "d4" }]
+            }
+        }))
+
+        self.assertTrue(h.add({
+            "C0": {
+                "severity": "S2",
+                "summary": { "message": "s0" },
+                "detail": [{ "message": "d0" }]
+            }
+        }))
+
+        self.assertTrue(h.add({
+            "C2": {
+                "severity": "S0",
+                "summary": { "message": "s0" },
+                "detail": [{ "message": "d0" }, { "message": "d1" }]
+            }
+        }))
+
+        self.assertEqual(h.checks(), {
+            "C0": {
+                "S0": {
+                    "summary": set(["s0", "s1", "s2", "s3"]),
+                    "detail": set(["d0", "d1", "d2", "d4"])
+                },
+                "S1": {
+                    "summary": set(["s0", "s1", "s2"]),
+                    "detail": set([])
+                },
+                "S2": {
+                    "summary": set(["s0"]),
+                    "detail": set(["d0"])
+                }
+            },
+            "C1": {
+                "S0": {
+                    "summary": set([]),
+                    "detail": set(["d0", "d1", "d2"])
+                }
+            },
+            "C2": {
+                "S0": {
+                    "summary": set(["s0"]),
+                    "detail": set(["d0", "d1"])
+                }
+            }
+        })
+
+class HealthHistoryTest(unittest.TestCase):
+    def _now(self):
+        # return some time truncated at 30 minutes past the hour. this lets us
+        # fiddle with time offsets without worrying about accidentally landing
+        # on exactly the top of the hour which is the edge of a time slot for
+        # tracking health history.
+        dt = datetime.datetime.utcnow()
+        return datetime.datetime(
+            year   = dt.year,
+            month  = dt.month,
+            day    = dt.day,
+            hour   = dt.hour,
+            minute = 30)
+
+    def test_empty_slot(self):
+        now = self._now()
+
+        HealthHistorySlot._now = mock.Mock(return_value=now)
+        h = HealthHistorySlot()
+
+        # reports no historical checks
+        self.assertEqual(h.health(), { "checks": {} })
+
+        # an empty slot doesn't need to be flushed
+        self.assertFalse(h.need_flush())
+
+    def test_expires(self):
+        now = self._now()
+
+        HealthHistorySlot._now = mock.Mock(return_value=now)
+        h = HealthHistorySlot()
+        self.assertFalse(h.expired())
+
+        # an hour from now it would be expired
+        future = now + datetime.timedelta(hours = 1)
+        HealthHistorySlot._now = mock.Mock(return_value=future)
+        self.assertTrue(h.expired())
+
+    def test_need_flush(self):
+        now = self._now()
+
+        HealthHistorySlot._now = mock.Mock(return_value=now)
+        h = HealthHistorySlot()
+        self.assertFalse(h.need_flush())
+
+        self.assertTrue(h.add(dict(checks = {
+            "C0": {
+                "severity": "S0",
+                "summary": { "message": "s0" },
+                "detail": [{ "message": "d0" }]
+            }
+        })))
+        # no flush needed, yet...
+        self.assertFalse(h.need_flush())
+
+        # after persist period time elapses, a flush is needed
+        future = now + PERSIST_PERIOD
+        HealthHistorySlot._now = mock.Mock(return_value=future)
+        self.assertTrue(h.need_flush())
+
+        # mark flush resets
+        h.mark_flushed()
+        self.assertFalse(h.need_flush())
+
+    def test_need_flush_edge(self):
+        # test needs flush is true because it has expired, not because it has
+        # been dirty for the persistence period
+        dt = datetime.datetime.utcnow()
+        now = datetime.datetime(
+            year   = dt.year,
+            month  = dt.month,
+            day    = dt.day,
+            hour   = dt.hour,
+            minute = 59,
+            second = 59)
+        HealthHistorySlot._now = mock.Mock(return_value=now)
+        h = HealthHistorySlot()
+        self.assertFalse(h.expired())
+        self.assertFalse(h.need_flush())
+
+        # now it is dirty, but it doesn't need a flush
+        self.assertTrue(h.add(dict(checks = {
+            "C0": {
+                "severity": "S0",
+                "summary": { "message": "s0" },
+                "detail": [{ "message": "d0" }]
+            }
+        })))
+        self.assertFalse(h.expired())
+        self.assertFalse(h.need_flush())
+
+        # advance time past the hour so it expires, but not past the persistence
+        # period deadline for the last event that set the dirty bit
+        self.assertTrue(PERSIST_PERIOD.total_seconds() > 5)
+        future = now + datetime.timedelta(seconds = 5)
+        HealthHistorySlot._now = mock.Mock(return_value=future)
+
+        self.assertTrue(h.expired())
+        self.assertTrue(h.need_flush())
diff --git a/src/pybind/mgr/insights/tox.ini b/src/pybind/mgr/insights/tox.ini
new file mode 100644
index 00000000..c02393af
--- /dev/null
+++ b/src/pybind/mgr/insights/tox.ini
@@ -0,0 +1,17 @@
+[tox]
+envlist = py27,py3
+skipsdist = true
+toxworkdir = {env:CEPH_BUILD_DIR}/insights
+minversion = 2.8.1
+
+[testenv]
+deps =
+    pytest
+    mock
+    six>=1.14.0
+setenv=
+    UNITTEST = true
+    py27: PYTHONPATH = {env:CEPH_LIB}/cython_modules/lib.2
+    py3:  PYTHONPATH = {env:CEPH_LIB}/cython_modules/lib.3
+commands=
+    {envbindir}/py.test tests/