# qa/tasks/die_on_err.py

"""
Raise exceptions on osd coredumps or test err directories
"""
import contextlib
import logging
import time
from teuthology.orchestra import run

from tasks import ceph_manager
from teuthology import misc as teuthology

log = logging.getLogger(__name__)

def _err_file_exists(remote, testdir):
    """
    Return True if {testdir}/err exists on the given remote.

    The check is done by running ``test -e`` remotely with
    check_status=False, so a missing path is reported through the exit
    status rather than as an exception from run().
    """
    proc = remote.run(
        args=['test', '-e', '{tdir}/err'.format(tdir=testdir)],
        wait=True,
        check_status=False,
    )
    return proc.exitstatus == 0


def _osd_log_ends_with_dump(remote, osd_id):
    """
    Return True if the last line of /var/log/ceph/osd.<osd_id>.log on the
    given remote contains 'end dump'.

    The task treats that marker as the sign that the OSD dumped core.
    Only the final line of the log is inspected (``tail -1``).
    """
    log_path = '/var/log/ceph/osd.%d.log' % osd_id
    proc = remote.run(
        args=[
            'tail', '-1', log_path,
            run.Raw('|'),
            'grep', '-q', 'end dump',
        ],
        wait=True,
        check_status=False,
    )
    return proc.exitstatus == 0


@contextlib.contextmanager
def task(ctx, config):
    """
    Die if {testdir}/err exists or if an OSD dumps core

    First polls the manager every 10 seconds until the number of OSDs it
    reports as 'up' reaches the number of osd roles in the cluster.  Then,
    every 5 seconds, each osd.<i> remote (i in 0..num_osds-1) is probed:

    * if {testdir}/err exists, raise Exception("osd <i> error")
    * if the last line of /var/log/ceph/osd.<i>.log contains 'end dump',
      raise Exception("osd <i> dumped core")

    Although decorated as a context manager, the body never yields: it
    loops until one of the exceptions above (or an error from a remote
    call) is raised.

    :param ctx: Context; its cluster is used to locate the first monitor
                and the osd remotes
    :param config: Configuration; None is treated as an empty dict
    :raises Exception: when an err file or a core dump marker is found
    """
    if config is None:
        config = {}

    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    num_osds = teuthology.num_instances_of_type(ctx.cluster, 'osd')
    log.info('num_osds is %s', num_osds)

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    # Do not start watching until every OSD has been reported up.
    while len(manager.get_osd_status()['up']) < num_osds:
        time.sleep(10)

    testdir = teuthology.get_testdir(ctx)

    while True:
        for i in range(num_osds):
            # Looked up on every pass, exactly one remote is expected
            # to carry each osd.<i> role.
            (osd_remote,) = ctx.cluster.only('osd.%d' % i).remotes.keys()

            if _err_file_exists(osd_remote, testdir):
                log.info("osd %d has an error", i)
                raise Exception("osd %d error" % i)

            if _osd_log_ends_with_dump(osd_remote, i):
                log.info("osd %d dumped core", i)
                raise Exception("osd %d dumped core" % i)

        time.sleep(5)