1 files changed, 203 insertions, 0 deletions
diff --git a/qa/tasks/cephfs/test_recovery_pool.py b/qa/tasks/cephfs/test_recovery_pool.py
new file mode 100644
index 000000000..9926b3670
--- /dev/null
+++ b/qa/tasks/cephfs/test_recovery_pool.py
@@ -0,0 +1,203 @@
+"""
+Test our tools for recovering metadata from the data pool into an alternate pool
+"""
+
+import logging
+import traceback
+from collections import namedtuple
+
+from teuthology.orchestra.run import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class OverlayWorkload(object):
+    def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
+        self._orig_fs = orig_fs
+        self._recovery_fs = recovery_fs
+        self._orig_mount = orig_mount
+        self._recovery_mount = recovery_mount
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them.  Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.  By
+        default just wipe everything in the metadata pool
+        """
+
+        pool = self._orig_fs.get_metadata_pool_name()
+        self._orig_fs.rados(["purge", pool, '--yes-i-really-really-mean-it'])
+
+    def flush(self):
+        """
+        Called after client unmount, after write: flush whatever you want
+        """
+        self._orig_fs.mds_asok(["flush", "journal"])
+        self._recovery_fs.mds_asok(["flush", "journal"])
+
+
+class SimpleOverlayWorkload(OverlayWorkload):
+    """
+    Single file, single directory, check that it gets recovered and so does its size
+    """
+    def write(self):
+        self._orig_mount.run_shell(["mkdir", "subdir"])
+        self._orig_mount.write_n_mb("subdir/sixmegs", 6)
+        self._initial_state = self._orig_mount.stat("subdir/sixmegs")
+
+    def validate(self):
+        self._recovery_mount.run_shell(["ls", "subdir"])
+        st = self._recovery_mount.stat("subdir/sixmegs")
+        self.assert_equal(st['st_size'], self._initial_state['st_size'])
+        return self._errors
+
+class TestRecoveryPool(CephFSTestCase):
+    MDSS_REQUIRED = 2
+    CLIENTS_REQUIRED = 2
+    REQUIRE_RECOVERY_FILESYSTEM = True
+
+    def is_marked_damaged(self, rank):
+        mds_map = self.fs.get_mds_map()
+        return rank in mds_map['damaged']
+
+    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
+        """
+        That when all objects in metadata pool are removed, we can rebuild a metadata pool
+        based on the contents of a data pool, and a client can see and read our files.
+        """
+
+        # First, inject some files
+
+        workload.write()
+
+        # Unmount the client and flush the journal: the tool should also cope with
+        # situations where there is dirty metadata, but we'll test that separately
+        self.mount_a.umount_wait()
+        self.mount_b.umount_wait()
+        workload.flush()
+
+        # Create the alternate pool if requested
+        recovery_fs = self.recovery_fs.name
+        recovery_pool = self.recovery_fs.get_metadata_pool_name()
+        self.recovery_fs.data_scan(['init', '--force-init',
+                                    '--filesystem', recovery_fs,
+                                    '--alternate-pool', recovery_pool])
+        self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
+        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])
+
+        # Stop the MDS
+        self.fs.mds_stop() # otherwise MDS will join once the fs is reset
+        self.fs.fail()
+
+        # After recovery, we need the MDS to not be strict about stats (in production these options
+        # are off by default, but in QA we need to explicitly disable them)
+        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
+        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)
+
+        # Apply any data damage the workload wants
+        workload.damage()
+
+        # Reset the MDS map in case multiple ranks were in play: recovery procedure
+        # only understands how to rebuild metadata under rank 0
+        self.fs.reset()
+
+        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
+        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])
+
+        # Run the recovery procedure
+        if False:
+            with self.assertRaises(CommandFailedError):
+                # Normal reset should fail when no objects are present, we'll use --force instead
+                self.fs.journal_tool(["journal", "reset"], 0)
+
+        self.fs.data_scan(['scan_extents', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           self.fs.get_data_pool_name()])
+        self.fs.data_scan(['scan_inodes', '--alternate-pool',
+                           recovery_pool, '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['event', 'recover_dentries', 'list',
+                              '--alternate-pool', recovery_pool], 0)
+
+        self.fs.data_scan(['init', '--force-init', '--filesystem',
+                           self.fs.name])
+        self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
+                           '--force-corrupt', '--force-init',
+                           self.fs.get_data_pool_name()])
+        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)
+
+        self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0)
+        self.fs.journal_tool(['journal', 'reset', '--force'], 0)
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
+                                            recovery_fs + ":0")
+
+        # Mark the MDS repaired
+        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')
+
+        # Start the MDS
+        self.fs.mds_restart()
+        self.fs.set_joinable()
+        self.recovery_fs.mds_restart()
+        self.fs.wait_for_daemons()
+        self.recovery_fs.wait_for_daemons()
+        status = self.recovery_fs.status()
+        for rank in self.recovery_fs.get_ranks(status=status):
+            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'],
+                                                'injectargs', '--debug-mds=20')
+            self.fs.rank_tell(['scrub', 'start', '/', 'recursive,repair'], rank=rank['rank'], status=status)
+        log.info(str(self.mds_cluster.status()))
+
+        # Mount a client
+        self.mount_a.mount_wait()
+        self.mount_b.mount_wait(cephfs_name=recovery_fs)
+
+        # See that the files are present and correct
+        errors = workload.validate()
+        if errors:
+            log.error("Validation errors found: {0}".format(len(errors)))
+            for e in errors:
+                log.error(e.exception)
+                log.error(e.backtrace)
+            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
+                errors[0].exception, errors[0].backtrace
+            ))
+
+    def test_rebuild_simple(self):
+        self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
+                                                     self.mount_a, self.mount_b))