From 19fcec84d8d7d21e796c7624e521b60d28ee21ed Mon Sep 17 00:00:00 2001 From: Daniel Baumann Date: Sun, 7 Apr 2024 20:45:59 +0200 Subject: Adding upstream version 16.2.11+ds. Signed-off-by: Daniel Baumann --- qa/tasks/mds_creation_failure.py | 69 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 qa/tasks/mds_creation_failure.py (limited to 'qa/tasks/mds_creation_failure.py') diff --git a/qa/tasks/mds_creation_failure.py b/qa/tasks/mds_creation_failure.py new file mode 100644 index 000000000..58314086c --- /dev/null +++ b/qa/tasks/mds_creation_failure.py @@ -0,0 +1,69 @@ +# FIXME: this file has many undefined vars which are accessed! +# flake8: noqa +import logging +import contextlib +import time +from tasks import ceph_manager +from teuthology import misc +from teuthology.orchestra.run import CommandFailedError, Raw + +log = logging.getLogger(__name__) + + +@contextlib.contextmanager +def task(ctx, config): + """ + Go through filesystem creation with a synthetic failure in an MDS + in its 'up:creating' state, to exercise the retry behaviour. + """ + # Grab handles to the teuthology objects of interest + mdslist = list(misc.all_roles_of_type(ctx.cluster, 'mds')) + if len(mdslist) != 1: + # Require exactly one MDS, the code path for creation failure when + # a standby is available is different + raise RuntimeError("This task requires exactly one MDS") + + mds_id = mdslist[0] + (mds_remote,) = ctx.cluster.only('mds.{_id}'.format(_id=mds_id)).remotes.keys() + manager = ceph_manager.CephManager( + mds_remote, ctx=ctx, logger=log.getChild('ceph_manager'), + ) + + # Stop MDS + self.fs.set_max_mds(0) + self.fs.mds_stop(mds_id) + self.fs.mds_fail(mds_id) + + # Reset the filesystem so that next start will go into CREATING + manager.raw_cluster_cmd('fs', 'rm', "default", "--yes-i-really-mean-it") + manager.raw_cluster_cmd('fs', 'new', "default", "metadata", "data") + + # Start the MDS with mds_kill_create_at set, it will crash during creation + mds.restart_with_args(["--mds_kill_create_at=1"]) + try: + mds.wait_for_exit() + except CommandFailedError as e: + if e.exitstatus == 1: + log.info("MDS creation killed as expected") + else: + log.error("Unexpected status code %s" % e.exitstatus) + raise + + # Since I have intentionally caused a crash, I will clean up the resulting core + # file to avoid task.internal.coredump seeing it as a failure. + log.info("Removing core file from synthetic MDS failure") + mds_remote.run(args=['rm', '-f', Raw("{archive}/coredump/*.core".format(archive=misc.get_archive_dir(ctx)))]) + + # It should have left the MDS map state still in CREATING + status = self.fs.status().get_mds(mds_id) + assert status['state'] == 'up:creating' + + # Start the MDS again without the kill flag set, it should proceed with creation successfully + mds.restart() + + # Wait for state ACTIVE + self.fs.wait_for_state("up:active", timeout=120, mds_id=mds_id) + + # The system should be back up in a happy healthy state, go ahead and run any further tasks + # inside this context. + yield -- cgit v1.2.3