diff options
Diffstat (limited to 'qa/tasks/die_on_err.py')
-rw-r--r-- | qa/tasks/die_on_err.py | 70 |
1 files changed, 70 insertions, 0 deletions
"""
Raise exceptions on osd coredumps or test err directories
"""
import contextlib
import logging
import time
from teuthology.orchestra import run

from tasks import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)


def _command_succeeds(remote, args):
    """
    Run ``args`` on ``remote`` and report whether it exited with status 0.

    The command is waited for and a non-zero exit is not treated as a
    failure of the run itself (``check_status=False``).
    """
    proc = remote.run(
        args=args,
        wait=True,
        check_status=False,
    )
    return proc.exitstatus == 0


@contextlib.contextmanager
def task(ctx, config):
    """
    Die if {testdir}/err exists or if an OSD dumps core

    Steps, in order:

    1. Poll ``manager.get_osd_status()['up']`` every 10 seconds until it
       lists at least as many OSDs as the cluster has ``osd`` roles.
    2. Loop forever over ``osd.0`` .. ``osd.<num_osds - 1>``; on the remote
       holding each role, check whether ``{testdir}/err`` exists and whether
       the last line of ``/var/log/ceph/osd.<id>.log`` contains ``end dump``.
       Sleep 5 seconds between sweeps.

    Note that this function contains no ``yield``: once the loop in step 2
    starts, the only way out is an exception.

    :param ctx: teuthology run context; ``ctx.cluster`` is used to find the
                remotes for the first monitor and for each OSD role.
    :param config: task configuration; ``None`` is replaced by ``{}``.
    :raises Exception: when an err path exists on an OSD's remote, or when
                       an OSD log's last line matches ``end dump``.
    """
    config = {} if config is None else config

    mon_role = teuthology.get_first_mon(ctx, config)
    (mon_remote,) = ctx.cluster.only(mon_role).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s' % num_osds)

    manager = ceph_manager.CephManager(
        mon_remote,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
    )

    # Do not start watching until every OSD is reported up.
    while True:
        up_osds = manager.get_osd_status()['up']
        if len(up_osds) >= num_osds:
            break
        time.sleep(10)

    # The marker path does not change between sweeps, so build it once.
    err_marker = '{tdir}/err'.format(tdir=teuthology.get_testdir(ctx))

    while True:
        for osd_id in range(num_osds):
            (osd_remote,) = ctx.cluster.only('osd.%d' % osd_id).remotes.keys()

            # `test -e` exits 0 when the path exists.
            if _command_succeeds(osd_remote, ['test', '-e', err_marker]):
                log.info("osd %d has an error" % osd_id)
                raise Exception("osd %d error" % osd_id)

            log_path = '/var/log/ceph/osd.%d.log' % (osd_id)

            # `grep -q` exits 0 when the last log line contains 'end dump'.
            # NOTE(review): run.Raw('|') is presumably handed to the remote
            # shell unquoted so it acts as a pipe; confirm against
            # teuthology.orchestra.run.
            dump_probe = [
                'tail', '-1', log_path,
                run.Raw('|'),
                'grep', '-q', 'end dump',
            ]
            if _command_succeeds(osd_remote, dump_probe):
                log.info("osd %d dumped core" % osd_id)
                raise Exception("osd %d dumped core" % osd_id)

        time.sleep(5)