summaryrefslogtreecommitdiffstats
path: root/qa/tasks/mgr/test_failover.py
diff options
context:
space:
mode:
Diffstat (limited to '')
-rw-r--r--qa/tasks/mgr/test_failover.py182
1 files changed, 182 insertions, 0 deletions
diff --git a/qa/tasks/mgr/test_failover.py b/qa/tasks/mgr/test_failover.py
new file mode 100644
index 000000000..bfff11262
--- /dev/null
+++ b/qa/tasks/mgr/test_failover.py
@@ -0,0 +1,182 @@
+
+import logging
+import json
+
+from .mgr_test_case import MgrTestCase
+
+
+log = logging.getLogger(__name__)
+
+
+class TestFailover(MgrTestCase):
+ MGRS_REQUIRED = 2
+
+ def setUp(self):
+ super(TestFailover, self).setUp()
+ self.setup_mgrs()
+
+ def test_timeout(self):
+ """
+ That when an active mgr stops responding, a standby is promoted
+ after mon_mgr_beacon_grace.
+ """
+
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ # Stop that daemon
+ self.mgr_cluster.mgr_stop(original_active)
+
+ # Assert that the other mgr becomes active
+ self.wait_until_true(
+ lambda: self.mgr_cluster.get_active_id() in original_standbys,
+ timeout=60
+ )
+
+ self.mgr_cluster.mgr_restart(original_active)
+ self.wait_until_true(
+ lambda: original_active in self.mgr_cluster.get_standby_ids(),
+ timeout=10
+ )
+
+ def test_timeout_nostandby(self):
+ """
+ That when an active mgr stop responding, and no standby is
+ available, the active mgr is removed from the map anyway.
+ """
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ for s in original_standbys:
+ self.mgr_cluster.mgr_stop(s)
+ self.mgr_cluster.mgr_fail(s)
+
+ self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+ self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+
+ grace = int(self.mgr_cluster.get_config("mon_mgr_beacon_grace"))
+ log.info("Should time out in about {0} seconds".format(grace))
+
+ self.mgr_cluster.mgr_stop(original_active)
+
+ # Now wait for the mon to notice the mgr is gone and remove it
+ # from the map.
+ self.wait_until_equal(
+ lambda: self.mgr_cluster.get_active_id(),
+ "",
+ timeout=grace * 2
+ )
+
+ self.assertListEqual(self.mgr_cluster.get_standby_ids(), [])
+ self.assertEqual(self.mgr_cluster.get_active_id(), "")
+
+ def test_explicit_fail(self):
+ """
+ That when a user explicitly fails a daemon, a standby immediately
+ replaces it.
+ :return:
+ """
+ # Query which mgr is active
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ self.mgr_cluster.mgr_fail(original_active)
+
+ # A standby should take over
+ self.wait_until_true(
+ lambda: self.mgr_cluster.get_active_id() in original_standbys,
+ timeout=60
+ )
+
+ # The one we failed should come back as a standby (he isn't
+ # really dead)
+ self.wait_until_true(
+ lambda: original_active in self.mgr_cluster.get_standby_ids(),
+ timeout=10
+ )
+
+ # Both daemons should have fully populated metadata
+ # (regression test for http://tracker.ceph.com/issues/21260)
+ meta = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd(
+ "mgr", "metadata"))
+ id_to_meta = dict([(i['name'], i) for i in meta])
+ for i in [original_active] + original_standbys:
+ self.assertIn(i, id_to_meta)
+ self.assertIn('ceph_version', id_to_meta[i])
+
+ # We should be able to fail back over again: the exercises
+ # our re-initialization of the python runtime within
+ # a single process lifetime.
+
+ # Get rid of any bystander standbys so that the original_active
+ # will be selected as next active.
+ new_active = self.mgr_cluster.get_active_id()
+ for daemon in original_standbys:
+ if daemon != new_active:
+ self.mgr_cluster.mgr_stop(daemon)
+ self.mgr_cluster.mgr_fail(daemon)
+
+ self.assertListEqual(self.mgr_cluster.get_standby_ids(),
+ [original_active])
+
+ self.mgr_cluster.mgr_stop(new_active)
+ self.mgr_cluster.mgr_fail(new_active)
+
+ self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+ self.assertEqual(self.mgr_cluster.get_standby_ids(), [])
+
+ def test_standby_timeout(self):
+ """
+ That when a standby daemon stops sending beacons, it is
+ removed from the list of standbys
+ :return:
+ """
+ original_active = self.mgr_cluster.get_active_id()
+ original_standbys = self.mgr_cluster.get_standby_ids()
+
+ victim = original_standbys[0]
+ self.mgr_cluster.mgr_stop(victim)
+
+ expect_standbys = set(original_standbys) - {victim}
+
+ self.wait_until_true(
+ lambda: set(self.mgr_cluster.get_standby_ids()) == expect_standbys,
+ timeout=60
+ )
+ self.assertEqual(self.mgr_cluster.get_active_id(), original_active)
+
+class TestLibCephSQLiteFailover(MgrTestCase):
+ MGRS_REQUIRED = 1
+
+ def setUp(self):
+ super(TestLibCephSQLiteFailover, self).setUp()
+ self.setup_mgrs()
+
+ def get_libcephsqlite(self):
+ mgr_map = self.mgr_cluster.get_mgr_map()
+ addresses = self.mgr_cluster.get_registered_clients('libcephsqlite', mgr_map=mgr_map)
+ self.assertEqual(len(addresses), 1)
+ return addresses[0]
+
+ def test_maybe_reonnect(self):
+ """
+ That the devicehealth module can recover after losing its libcephsqlite lock.
+ """
+
+ # make sure the database is populated and loaded by the module
+ self.mgr_cluster.mon_manager.ceph("device scrape-health-metrics")
+
+ oldaddr = self.get_libcephsqlite()
+ self.mgr_cluster.mon_manager.ceph(f"osd blocklist add {oldaddr['addr']}/{oldaddr['nonce']}")
+
+ def test():
+ self.mgr_cluster.mon_manager.ceph("device scrape-health-metrics")
+ newaddr = self.get_libcephsqlite()
+ return oldaddr != newaddr
+
+ self.wait_until_true(
+ test,
+ timeout=30
+ )