author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
---|---|---|
committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /qa/tasks/cephfs/test_recovery_pool.py | |
parent | Initial commit. (diff) | |
download | ceph-upstream/18.2.2.tar.xz, ceph-upstream/18.2.2.zip |
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'qa/tasks/cephfs/test_recovery_pool.py')
-rw-r--r-- | qa/tasks/cephfs/test_recovery_pool.py | 179 |
1 files changed, 179 insertions, 0 deletions
diff --git a/qa/tasks/cephfs/test_recovery_pool.py b/qa/tasks/cephfs/test_recovery_pool.py
new file mode 100644
index 000000000..8c4e1967d
--- /dev/null
+++ b/qa/tasks/cephfs/test_recovery_pool.py
@@ -0,0 +1,179 @@
+"""
+Test our tools for recovering metadata from the data pool into an alternate pool
+"""
+
+import logging
+import traceback
+from collections import namedtuple
+
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class OverlayWorkload(object):
+    def __init__(self):
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them. Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self, fs):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from. By
+        default just wipe everything in the metadata pool
+        """
+
+        pool = fs.get_metadata_pool_name()
+        fs.rados(["purge", pool, '--yes-i-really-really-mean-it'])
+
+    def flush(self, fs):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        fs.rank_asok(["flush", "journal"])
+
+
+class SimpleOverlayWorkload(OverlayWorkload):
+    """
+    Single file, single directory, check that it gets recovered and so does its size
+    """
+    def write(self, mount):
+        mount.run_shell(["mkdir", "subdir"])
+        mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = mount.stat("subdir/sixmegs")
+
+    def validate(self, recovery_mount):
+        recovery_mount.run_shell(["ls", "subdir"])
+        st = recovery_mount.stat("subdir/sixmegs")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+class TestRecoveryPool(CephFSTestCase):
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 1
+    REQUIRE_RECOVERY_FILESYSTEM = True
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+
+        # First, inject some files
+
+        workload.write(self.mount_a)
+
+        # Unmount the client and flush the journal: the tool should also cope with
+        # situations where there is dirty metadata, but we'll test that separately
+        self.mount_a.umount_wait()
+        workload.flush(self.fs)
+        self.fs.fail()
+
+        # After recovery, we need the MDS to not be strict about stats (in production these options
+        # are off by default, but in QA we need to explicitly disable them)
+        # Note: these have to be written to ceph.conf to override existing ceph.conf values.
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+        self.fs.mds_restart()
+
+        # Apply any data damage the workload wants
+        workload.damage(self.fs)
+
+        # Create the alternate pool if requested
+        recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False)
+        recovery_fs.set_data_pool_name(self.fs.get_data_pool_name())
+        recovery_fs.create(recover=True, metadata_overlay=True)
+
+        recovery_pool = recovery_fs.get_metadata_pool_name()
+        recovery_fs.mon_manager.raw_cluster_cmd('-s')
+
+        # Reset the MDS map in case multiple ranks were in play: recovery procedure
+        # only understands how to rebuild metadata under rank 0
+        #self.fs.reset()
+        #self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+        #self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+        #self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+        # Run the recovery procedure
+        recovery_fs.data_scan(['init', '--force-init',
+                               '--filesystem', recovery_fs.name,
+                               '--alternate-pool', recovery_pool])
+        recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "session"])
+        recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "snap"])
+        recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "inode"])
+        if False:
+            with self.assertRaises(CommandFailedError):
+                # Normal reset should fail when no objects are present, we'll use --force instead
+                self.fs.journal_tool(["journal", "reset"], 0)
+
+        recovery_fs.data_scan(['scan_extents', '--alternate-pool',
+                               recovery_pool, '--filesystem', self.fs.name,
+                               self.fs.get_data_pool_name()])
+        recovery_fs.data_scan(['scan_inodes', '--alternate-pool',
+                               recovery_pool, '--filesystem', self.fs.name,
+                               '--force-corrupt', '--force-init',
+                               self.fs.get_data_pool_name()])
+        recovery_fs.data_scan(['scan_links', '--filesystem', recovery_fs.name])
+        recovery_fs.journal_tool(['event', 'recover_dentries', 'list',
+                                  '--alternate-pool', recovery_pool], 0)
+        recovery_fs.journal_tool(["journal", "reset", "--force"], 0)
+
+        # Start the MDS
+        recovery_fs.set_joinable()
+        status = recovery_fs.wait_for_daemons()
+
+        self.config_set('mds', 'debug_mds', '20')
+        for rank in recovery_fs.get_ranks(status=status):
+            recovery_fs.rank_tell(['scrub', 'start', '/', 'force,recursive,repair'], rank=rank['rank'], status=status)
+        log.info(str(recovery_fs.status()))
+
+        # Mount a client
+        self.mount_a.mount_wait(cephfs_name=recovery_fs.name)
+
+        # See that the files are present and correct
+        errors = workload.validate(self.mount_a)
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_rebuild_simple(self):
+        self._rebuild_metadata(SimpleOverlayWorkload())
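For context, the data_scan, table_tool and journal_tool helpers called in _rebuild_metadata() wrap the cephfs-data-scan, cephfs-table-tool and cephfs-journal-tool command-line utilities. As a rough sketch only (not a verbatim transcript of what the test executes; <recovery_meta_pool>, <orig_fs> and <orig_data_pool> are placeholder names, while recovery_fs matches the filesystem name created by the test), the alternate-pool recovery sequence corresponds to something like:

```sh
# Sketch of the alternate-pool recovery sequence exercised by the test.
# Placeholder names: <recovery_meta_pool> = metadata pool of the overlay filesystem,
# <orig_fs>/<orig_data_pool> = the damaged filesystem and its data pool.
cephfs-data-scan init --force-init --filesystem recovery_fs --alternate-pool <recovery_meta_pool>
cephfs-table-tool recovery_fs:0 reset session
cephfs-table-tool recovery_fs:0 reset snap
cephfs-table-tool recovery_fs:0 reset inode
cephfs-data-scan scan_extents --alternate-pool <recovery_meta_pool> --filesystem <orig_fs> <orig_data_pool>
cephfs-data-scan scan_inodes --alternate-pool <recovery_meta_pool> --filesystem <orig_fs> --force-corrupt --force-init <orig_data_pool>
cephfs-data-scan scan_links --filesystem recovery_fs
cephfs-journal-tool --rank=recovery_fs:0 event recover_dentries list --alternate-pool <recovery_meta_pool>
cephfs-journal-tool --rank=recovery_fs:0 journal reset --force
```

After these steps the test makes recovery_fs joinable, waits for its MDS daemons, runs a recursive repair scrub ("scrub start / force,recursive,repair"), and finally mounts a client to validate the recovered files.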