summaryrefslogtreecommitdiffstats
path: root/qa/tasks/daemonwatchdog.py
diff options
context:
space:
mode:
Diffstat (limited to 'qa/tasks/daemonwatchdog.py')
-rw-r--r--qa/tasks/daemonwatchdog.py128
1 files changed, 128 insertions, 0 deletions
diff --git a/qa/tasks/daemonwatchdog.py b/qa/tasks/daemonwatchdog.py
new file mode 100644
index 000000000..c8fa9f3c2
--- /dev/null
+++ b/qa/tasks/daemonwatchdog.py
@@ -0,0 +1,128 @@
+import logging
+import signal
+import time
+
+from gevent import sleep
+from gevent.greenlet import Greenlet
+from gevent.event import Event
+
+log = logging.getLogger(__name__)
+
+class DaemonWatchdog(Greenlet):
+ """
+ DaemonWatchdog::
+
+ Watch Ceph daemons for failures. If an extended failure is detected (i.e.
+ not intentional), then the watchdog will unmount file systems and send
+ SIGTERM to all daemons. The duration of an extended failure is configurable
+ with watchdog_daemon_timeout.
+
+ ceph:
+ watchdog:
+ daemon_restart [default: no]: restart daemon if "normal" exit (status==0).
+
+ daemon_timeout [default: 300]: number of seconds a daemon
+ is allowed to be failed before the
+ watchdog will bark.
+ """
+
+ def __init__(self, ctx, config, thrashers):
+ super(DaemonWatchdog, self).__init__()
+ self.config = ctx.config.get('watchdog', {})
+ self.ctx = ctx
+ self.e = None
+ self.logger = log.getChild('daemon_watchdog')
+ self.cluster = config.get('cluster', 'ceph')
+ self.name = 'watchdog'
+ self.stopping = Event()
+ self.thrashers = thrashers
+
+ def _run(self):
+ try:
+ self.watch()
+ except Exception as e:
+ # See _run exception comment for MDSThrasher
+ self.e = e
+ self.logger.exception("exception:")
+ # allow successful completion so gevent doesn't see an exception...
+
+ def log(self, x):
+ """Write data to logger"""
+ self.logger.info(x)
+
+ def stop(self):
+ self.stopping.set()
+
+ def bark(self):
+ self.log("BARK! unmounting mounts and killing all daemons")
+ if hasattr(self.ctx, 'mounts'):
+ for mount in self.ctx.mounts.values():
+ try:
+ mount.umount_wait(force=True)
+ except:
+ self.logger.exception("ignoring exception:")
+ daemons = []
+ daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('osd', cluster=self.cluster)))
+ daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.cluster)))
+ daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.cluster)))
+ daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('rgw', cluster=self.cluster)))
+ daemons.extend(filter(lambda daemon: daemon.running() and not daemon.proc.finished, self.ctx.daemons.iter_daemons_of_role('mgr', cluster=self.cluster)))
+
+ for daemon in daemons:
+ try:
+ daemon.signal(signal.SIGTERM)
+ except:
+ self.logger.exception("ignoring exception:")
+
+ def watch(self):
+ self.log("watchdog starting")
+ daemon_timeout = int(self.config.get('daemon_timeout', 300))
+ daemon_restart = self.config.get('daemon_restart', False)
+ daemon_failure_time = {}
+ while not self.stopping.is_set():
+ bark = False
+ now = time.time()
+
+ osds = self.ctx.daemons.iter_daemons_of_role('osd', cluster=self.cluster)
+ mons = self.ctx.daemons.iter_daemons_of_role('mon', cluster=self.cluster)
+ mdss = self.ctx.daemons.iter_daemons_of_role('mds', cluster=self.cluster)
+ rgws = self.ctx.daemons.iter_daemons_of_role('rgw', cluster=self.cluster)
+ mgrs = self.ctx.daemons.iter_daemons_of_role('mgr', cluster=self.cluster)
+
+ daemon_failures = []
+ daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, osds))
+ daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mons))
+ daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mdss))
+ daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, rgws))
+ daemon_failures.extend(filter(lambda daemon: daemon.running() and daemon.proc.finished, mgrs))
+
+ for daemon in daemon_failures:
+ name = daemon.role + '.' + daemon.id_
+ dt = daemon_failure_time.setdefault(name, (daemon, now))
+ assert dt[0] is daemon
+ delta = now-dt[1]
+ self.log("daemon {name} is failed for ~{t:.0f}s".format(name=name, t=delta))
+ if delta > daemon_timeout:
+ bark = True
+ if daemon_restart == 'normal' and daemon.proc.exitstatus == 0:
+ self.log(f"attempting to restart daemon {name}")
+ daemon.restart()
+
+ # If a daemon is no longer failed, remove it from tracking:
+ for name in list(daemon_failure_time.keys()):
+ if name not in [d.role + '.' + d.id_ for d in daemon_failures]:
+ self.log("daemon {name} has been restored".format(name=name))
+ del daemon_failure_time[name]
+
+ for thrasher in self.thrashers:
+ if thrasher.exception is not None:
+ self.log("{name} failed".format(name=thrasher.name))
+ bark = True
+
+ if bark:
+ self.bark()
+ return
+
+ sleep(5)
+
+ self.log("watchdog finished")