summary | refs | log | tree | commit | diff | stats
path: root/qa/tasks/mds_creation_failure.py
diff options
context:
space:
mode:
author: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
committer: Daniel Baumann <daniel.baumann@progress-linux.org> 2024-04-07 18:45:59 +0000
commit: 19fcec84d8d7d21e796c7624e521b60d28ee21ed (patch)
tree: 42d26aa27d1e3f7c0b8bd3fd14e7d7082f5008dc /qa/tasks/mds_creation_failure.py
parent: Initial commit. (diff)
download: ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.tar.xz
ceph-19fcec84d8d7d21e796c7624e521b60d28ee21ed.zip
Adding upstream version 16.2.11+ds. [upstream/16.2.11+ds] [upstream]
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'qa/tasks/mds_creation_failure.py')
-rw-r--r-- qa/tasks/mds_creation_failure.py 69
1 files changed, 69 insertions, 0 deletions
diff --git a/qa/tasks/mds_creation_failure.py b/qa/tasks/mds_creation_failure.py
new file mode 100644
index 000000000..58314086c
--- /dev/null
+++ b/qa/tasks/mds_creation_failure.py
@@ -0,0 +1,69 @@
+# FIXME: this file has many undefined vars which are accessed!
+# flake8: noqa
+import logging
+import contextlib
+import time
+from tasks import ceph_manager
+from teuthology import misc
+from teuthology.orchestra.run import CommandFailedError, Raw
+
+log = logging.getLogger(__name__)
+
+
+@contextlib.contextmanager
+def task(ctx, config):
+ """
+ Go through filesystem creation with a synthetic failure in an MDS
+ in its 'up:creating' state, to exercise the retry behaviour.
+
+ Steps performed before yielding to nested tasks:
+
+ 1. Look up the single 'mds' role and its remote, and build a
+ CephManager bound to that remote.
+ 2. Stop and fail the MDS, then remove and re-create the filesystem
+ named "default".
+ 3. Restart the MDS with ``--mds_kill_create_at=1`` and wait for it
+ to exit; a CommandFailedError with exit status 1 is the expected
+ outcome.
+ 4. Remove ``*.core`` files from the archive's coredump directory on
+ the MDS remote.
+ 5. Assert the MDS is still reported as 'up:creating', restart it
+ without the kill flag, and wait for 'up:active'.
+
+ :param ctx: teuthology context; ``ctx.cluster`` is read directly and
+ ``ctx`` is also passed to CephManager and misc.get_archive_dir.
+ :param config: task configuration; not read anywhere in this function.
+ :raises RuntimeError: if the number of 'mds' roles is not exactly one.
+ :raises CommandFailedError: re-raised when the MDS exits with a status
+ other than 1.
+ :raises AssertionError: if the MDS state after the synthetic crash is
+ not 'up:creating'.
+
+ NOTE(review): ``self`` and ``mds`` are used below but are never bound
+ in this function (see the FIXME at the top of the file), so as written
+ the first ``self.fs`` access raises NameError. Presumably ``self.fs``
+ was meant to be a filesystem helper object and ``mds`` the daemon
+ handle for ``mds_id`` -- confirm and define them before relying on
+ this task.
+ """
+ # Grab handles to the teuthology objects of interest
+ mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds'))
+ if len(mdslist) != 1:
+ # Require exactly one MDS, the code path for creation failure when
+ # a standby is available is different
+ raise RuntimeError("This task requires exactly one MDS")
+
+ mds_id = mdslist[0]
+ # The one-element tuple unpacking fails with ValueError unless exactly
+ # one remote carries the 'mds.<id>' role.
+ (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.keys()
+ manager = ceph_manager.CephManager(
+ mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'),
+ )
+
+ # Stop MDS
+ # NOTE(review): `self` is undefined in this scope; these three calls
+ # cannot run as written.
+ self.fs.set_max_mds(0)
+ self.fs.mds_stop(mds_id)
+ self.fs.mds_fail(mds_id)
+
+ # Reset the filesystem so that next start will go into CREATING
+ # ("metadata" and "data" are presumably the pool names -- TODO confirm)
+ manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it")
+ manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data")
+
+ # Start the MDS with mds_kill_create_at set, it will crash during creation
+ # NOTE(review): `mds` is undefined in this scope as well.
+ mds.restart_with_args(["--mds_kill_create_at=1"])
+ try:
+ mds.wait_for_exit()
+ except CommandFailedError as e:
+ # Exit status 1 is treated as the intended synthetic kill; any other
+ # status is logged as an error. The nesting of the `raise` below is not
+ # recoverable from this flattened view; presumably it belongs to the
+ # `else` branch so that only unexpected statuses propagate.
+ # If wait_for_exit() raises nothing, execution simply continues.
+ if e.exitstatus == 1:
+ log.info("MDS creation killed as expected")
+ else:
+ log.error("Unexpected status code %s" % e.exitstatus)
+ raise
+
+ # Since I have intentionally caused a crash, I will clean up the resulting core
+ # file to avoid task.internal.coredump seeing it as a failure.
+ log.info("Removing core file from synthetic MDS failure")
+ # The glob is wrapped in Raw, presumably so it reaches the remote shell
+ # unquoted and is expanded there -- TODO confirm. Note this removes every
+ # *.core file in that directory, not only the one from this crash.
+ mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))])
+
+ # It should have left the MDS map state still in CREATING
+ status = self.fs.status().get_mds(mds_id)
+ assert status['state'] == 'up:creating'
+
+ # Start the MDS again without the kill flag set, it should proceed with creation successfully
+ mds.restart()
+
+ # Wait for state ACTIVE
+ # (timeout=120 is presumably in seconds -- TODO confirm against the helper)
+ self.fs.wait_for_state("up:active", timeout=120, mds_id=mds_id)
+
+ # The system should be back up in a happy healthy state, go ahead and run any further tasks
+ # inside this context.
+ # Nothing follows the yield, so this context manager performs no teardown.
+ yield