summaryrefslogtreecommitdiffstats
path: root/qa/tasks/dump_stuck.py
diff options
context:
space:
mode:
authorDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
committerDaniel Baumann <daniel.baumann@progress-linux.org>2024-04-21 11:54:28 +0000
commite6918187568dbd01842d8d1d2c808ce16a894239 (patch)
tree64f88b554b444a49f656b6c656111a145cbbaa28 /qa/tasks/dump_stuck.py
parentInitial commit. (diff)
downloadceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz
ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip
Adding upstream version 18.2.2.upstream/18.2.2
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'qa/tasks/dump_stuck.py')
-rw-r--r--qa/tasks/dump_stuck.py161
1 files changed, 161 insertions, 0 deletions
diff --git a/qa/tasks/dump_stuck.py b/qa/tasks/dump_stuck.py
new file mode 100644
index 000000000..4971f1916
--- /dev/null
+++ b/qa/tasks/dump_stuck.py
@@ -0,0 +1,161 @@
+"""
+Dump_stuck command
+"""
+import logging
+import time
+
+from tasks import ceph_manager
+from teuthology import misc as teuthology
+
+
+log = logging.getLogger(__name__)
+
+def check_stuck(manager, num_inactive, num_unclean, num_stale, timeout=10):
+ """
+ Do checks. Make sure get_stuck_pgs return the right amount of information, then
+ extract health information from the raw_cluster_cmd and compare the results with
+ values passed in. This passes if all asserts pass.
+
+ :param num_manager: Ceph manager
+ :param num_inactive: number of inaactive pages that are stuck
+ :param num_unclean: number of unclean pages that are stuck
+ :param num_stale: number of stale pages that are stuck
+ :param timeout: timeout value for get_stuck_pgs calls
+ """
+ inactive = manager.get_stuck_pgs('inactive', timeout)
+ unclean = manager.get_stuck_pgs('unclean', timeout)
+ stale = manager.get_stuck_pgs('stale', timeout)
+ log.info('inactive %s / %d, unclean %s / %d, stale %s / %d',
+ len(inactive), num_inactive,
+ len(unclean), num_unclean,
+ len(stale), num_stale)
+ assert len(inactive) == num_inactive
+ assert len(unclean) == num_unclean
+ assert len(stale) == num_stale
+
+def task(ctx, config):
+ """
+ Test the dump_stuck command.
+
+ :param ctx: Context
+ :param config: Configuration
+ """
+ assert config is None, \
+ 'dump_stuck requires no configuration'
+ assert teuthology.num_instances_of_type(ctx.cluster, 'osd') == 2, \
+ 'dump_stuck requires exactly 2 osds'
+
+ timeout = 60
+ first_mon = teuthology.get_first_mon(ctx, config)
+ (mon,) = ctx.cluster.only(first_mon).remotes.keys()
+
+ manager = ceph_manager.CephManager(
+ mon,
+ ctx=ctx,
+ logger=log.getChild('ceph_manager'),
+ )
+
+ manager.flush_pg_stats([0, 1])
+ manager.wait_for_clean(timeout)
+
+ manager.raw_cluster_cmd('tell', 'mon.a', 'injectargs', '--',
+# '--mon-osd-report-timeout 90',
+ '--mon-pg-stuck-threshold 10')
+
+ # all active+clean
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=0,
+ num_stale=0,
+ )
+ num_pgs = manager.get_num_pgs()
+
+ manager.mark_out_osd(0)
+ time.sleep(timeout)
+ manager.flush_pg_stats([1])
+ manager.wait_for_recovery(timeout)
+
+ # all active+clean+remapped
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=0,
+ num_stale=0,
+ )
+
+ manager.mark_in_osd(0)
+ manager.flush_pg_stats([0, 1])
+ manager.wait_for_clean(timeout)
+
+ # all active+clean
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=0,
+ num_stale=0,
+ )
+
+ log.info('stopping first osd')
+ manager.kill_osd(0)
+ manager.mark_down_osd(0)
+ manager.wait_for_active(timeout)
+
+ log.info('waiting for all to be unclean')
+ starttime = time.time()
+ done = False
+ while not done:
+ try:
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=num_pgs,
+ num_stale=0,
+ )
+ done = True
+ except AssertionError:
+ # wait up to 15 minutes to become stale
+ if time.time() - starttime > 900:
+ raise
+
+
+ log.info('stopping second osd')
+ manager.kill_osd(1)
+ manager.mark_down_osd(1)
+
+ log.info('waiting for all to be stale')
+ starttime = time.time()
+ done = False
+ while not done:
+ try:
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=num_pgs,
+ num_stale=num_pgs,
+ )
+ done = True
+ except AssertionError:
+ # wait up to 15 minutes to become stale
+ if time.time() - starttime > 900:
+ raise
+
+ log.info('reviving')
+ for id_ in teuthology.all_roles_of_type(ctx.cluster, 'osd'):
+ manager.revive_osd(id_)
+ manager.mark_in_osd(id_)
+ while True:
+ try:
+ manager.flush_pg_stats([0, 1])
+ break
+ except Exception:
+ log.exception('osds must not be started yet, waiting...')
+ time.sleep(1)
+ manager.wait_for_clean(timeout)
+
+ check_stuck(
+ manager,
+ num_inactive=0,
+ num_unclean=0,
+ num_stale=0,
+ )