qa/tasks/mgr/test_failover.py


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182

import logging
import json

from .mgr_test_case import MgrTestCase


log = logging.getLogger(__name__)


class TestFailover(MgrTestCase):
    MGRS_REQUIRED = 2

    def setUp(self):
        super(TestFailover, self).setUp()
        self.setup_mgrs()

    def test_timeout(self):
        """
        That when an active mgr stops responding, a standby is promoted
        after mon_mgr_beacon_grace.
        """

        # Query which mgr is active
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

        # Stop that daemon
        self.mgr_cluster.mgr_stop(original_active)

        # Assert that the other mgr becomes active
        self.wait_until_true(
            lambda: self.mgr_cluster.get_active_id() in original_standbys,
            timeout=60
        )

        self.mgr_cluster.mgr_restart(original_active)
        self.wait_until_true(
            lambda: original_active in self.mgr_cluster.get_standby_ids(),
            timeout=10
        )

    def test_timeout_nostandby(self):
        """
        That when an active mgr stop responding, and no standby is
        available, the active mgr is removed from the map anyway.
        """
        # Query which mgr is active
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

        for s in original_standbys:
            self.mgr_cluster.mgr_stop(s)
            self.mgr_cluster.mgr_fail(s)

        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)

        grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace"))
        log.info("Should time out in about {0} seconds".format(grace))

        self.mgr_cluster.mgr_stop(original_active)

        # Now wait for the mon to notice the mgr is gone and remove it
        # from the map.
        self.wait_until_equal(
            lambda: self.mgr_cluster.get_active_id(),
            "",
            timeout=grace * 2
        )

        self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
        self.assertEqual(self.mgr_cluster.get_active_id(), "")

    def test_explicit_fail(self):
        """
        That when a user explicitly fails a daemon, a standby immediately
        replaces it.
        :return:
        """
        # Query which mgr is active
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

        self.mgr_cluster.mgr_fail(original_active)

        # A standby should take over
        self.wait_until_true(
            lambda: self.mgr_cluster.get_active_id() in original_standbys,
            timeout=60
        )

        # The one we failed should come back as a standby (he isn't
        # really dead)
        self.wait_until_true(
            lambda: original_active in self.mgr_cluster.get_standby_ids(),
            timeout=10
        )

        # Both daemons should have fully populated metadata
        # (regression test for http://tracker.ceph.com/issues/21260)
        meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
            "mgr", "metadata"))
        id_to_meta = dict([(i['name'], i) for i in meta])
        for i in [original_active] + original_standbys:
            self.assertIn(i, id_to_meta)
            self.assertIn('ceph_version', id_to_meta[i])

        # We should be able to fail back over again: the exercises
        # our re-initialization of the python runtime within
        # a single process lifetime.

        # Get rid of any bystander standbys so that the original_active
        # will be selected as next active.
        new_active = self.mgr_cluster.get_active_id()
        for daemon in original_standbys:
            if daemon != new_active:
                self.mgr_cluster.mgr_stop(daemon)
                self.mgr_cluster.mgr_fail(daemon)

        self.assertListEqual(self.mgr_cluster.get_standby_ids(),
                             [original_active])

        self.mgr_cluster.mgr_stop(new_active)
        self.mgr_cluster.mgr_fail(new_active)

        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
        self.assertEqual(self.mgr_cluster.get_standby_ids(), [])

    def test_standby_timeout(self):
        """
        That when a standby daemon stops sending beacons, it is
        removed from the list of standbys
        :return:
        """
        original_active = self.mgr_cluster.get_active_id()
        original_standbys = self.mgr_cluster.get_standby_ids()

        victim = original_standbys[0]
        self.mgr_cluster.mgr_stop(victim)

        expect_standbys = set(original_standbys) - {victim}

        self.wait_until_true(
            lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
            timeout=60
        )
        self.assertEqual(self.mgr_cluster.get_active_id(), original_active)

class TestLibCephSQLiteFailover(MgrTestCase):
    MGRS_REQUIRED = 1

    def setUp(self):
        super(TestLibCephSQLiteFailover, self).setUp()
        self.setup_mgrs()

    def get_libcephsqlite(self):
        mgr_map = self.mgr_cluster.get_mgr_map()
        addresses = self.mgr_cluster.get_registered_clients('libcephsqlite', mgr_map=mgr_map)
        self.assertEqual(len(addresses), 1)
        return addresses[0]

    def test_maybe_reonnect(self):
        """
        That the devicehealth module can recover after losing its libcephsqlite lock.
        """

        # make sure the database is populated and loaded by the module
        self.mgr_cluster.mon_manager.ceph("device scrape-health-metrics")

        oldaddr = self.get_libcephsqlite()
        self.mgr_cluster.mon_manager.ceph(f"osd blocklist add {oldaddr['addr']}/{oldaddr['nonce']}")

        def test():
            self.mgr_cluster.mon_manager.ceph("device scrape-health-metrics")
            newaddr = self.get_libcephsqlite()
            return oldaddr != newaddr

        self.wait_until_true(
            test,
            timeout=30
        )