summaryrefslogtreecommitdiffstats
path: root/qa/tasks/mon_recovery.py
blob: fa7aa1a8da2f4539ece7469ebc182bc08c78d175 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
"""
Monitor recovery
"""
import logging
from tasks import ceph_manager
from teuthology import misc as teuthology


log = logging.getLogger(__name__)

def task(ctx, config):
    """
    Test monitor recovery.
    """
    if config is None:
        config = {}
    assert isinstance(config, dict), \
        'task only accepts a dict for configuration'
    first_mon = teuthology.get_first_mon(ctx, config)
    (mon,) = ctx.cluster.only(first_mon).remotes.keys()

    manager = ceph_manager.CephManager(
        mon,
        ctx=ctx,
        logger=log.getChild('ceph_manager'),
        )

    mons = [f.split('.')[1] for f in teuthology.get_mon_names(ctx)]
    log.info("mon ids = %s" % mons)

    manager.wait_for_mon_quorum_size(len(mons))

    log.info('verifying all monitors are in the quorum')
    for m in mons:
        s = manager.get_mon_status(m)
        assert s['state'] == 'leader' or s['state'] == 'peon'
        assert len(s['quorum']) == len(mons)

    log.info('restarting each monitor in turn')
    for m in mons:
        # stop a monitor
        manager.kill_mon(m)
        manager.wait_for_mon_quorum_size(len(mons) - 1)

        # restart
        manager.revive_mon(m)
        manager.wait_for_mon_quorum_size(len(mons))

    # in forward and reverse order,
    rmons = mons
    rmons.reverse()
    for mons in mons, rmons:
        log.info('stopping all monitors')
        for m in mons:
            manager.kill_mon(m)

        log.info('forming a minimal quorum for %s, then adding monitors' % mons)
        qnum = (len(mons) // 2) + 1
        num = 0
        for m in mons:
            manager.revive_mon(m)
            num += 1
            if num >= qnum:
                manager.wait_for_mon_quorum_size(num)

    # on both leader and non-leader ranks...
    for rank in [0, 1]:
        # take one out
        log.info('removing mon %s' % mons[rank])
        manager.kill_mon(mons[rank])
        manager.wait_for_mon_quorum_size(len(mons) - 1)

        log.info('causing some monitor log activity')
        m = 30
        for n in range(1, m):
            manager.raw_cluster_cmd('log', '%d of %d' % (n, m))

        log.info('adding mon %s back in' % mons[rank])
        manager.revive_mon(mons[rank])
        manager.wait_for_mon_quorum_size(len(mons))