"""
Test our tools for recovering metadata from the data pool into an alternate pool
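
The recovery procedure drives cephfs-data-scan (init, scan_extents,
scan_inodes), cephfs-journal-tool and cephfs-table-tool via the data_scan(),
journal_tool() and table_tool() helpers, rebuilding metadata into a second
("recovery") filesystem's metadata pool after the original metadata pool has
been wiped.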
"""

import logging
import traceback
from collections import namedtuple

from teuthology.orchestra.run import CommandFailedError
from tasks.cephfs.cephfs_test_case import CephFSTestCase

log = logging.getLogger(__name__)


ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])


class OverlayWorkload(object):
    def __init__(self, orig_fs, recovery_fs, orig_mount, recovery_mount):
        self._orig_fs = orig_fs
        self._recovery_fs = recovery_fs
        self._orig_mount = orig_mount
        self._recovery_mount = recovery_mount
        self._initial_state = None

        # Accumulate backtraces for every failed validation, and return them.  Backtraces
        # are rather verbose, but we only see them when something breaks, and they
        # let us see which check failed without having to decorate each check with
        # a string
        self._errors = []

    def assert_equal(self, a, b):
        try:
            if a != b:
                raise AssertionError("{0} != {1}".format(a, b))
        except AssertionError as e:
            self._errors.append(
                ValidationError(e, traceback.format_exc(3))
            )

    def write(self):
        """
        Write the workload files to the mount
        """
        raise NotImplementedError()

    def validate(self):
        """
        Read from the mount and validate that the workload files are present (i.e. have
        survived or been reconstructed from the test scenario)
        """
        raise NotImplementedError()

    def damage(self):
        """
        Damage the filesystem pools in ways that will be interesting to recover from.
        By default, just wipe everything in the metadata pool.
        """

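        # Purging the metadata pool destroys all dirfrags, the MDS journal and
        # the session/snap/inode tables; the data pool is left intact, and the
        # backtrace xattrs on its objects are what the recovery tools use to
        # reconstruct the directory tree.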
        pool = self._orig_fs.get_metadata_pool_name()
        self._orig_fs.rados(["purge", pool, '--yes-i-really-really-mean-it'])

    def flush(self):
        """
        Called after the clients unmount and after write(): flush whatever state
        should be persisted before damage() runs.  By default, flush the MDS
        journal of both filesystems.
        """
        self._orig_fs.mds_asok(["flush", "journal"])
        self._recovery_fs.mds_asok(["flush", "journal"])


class SimpleOverlayWorkload(OverlayWorkload):
    """
    A single file in a single directory: check that the file is recovered and
    that its size is preserved.
    """
    def write(self):
        self._orig_mount.run_shell(["mkdir", "subdir"])
        self._orig_mount.write_n_mb("subdir/sixmegs", 6)
        self._initial_state = self._orig_mount.stat("subdir/sixmegs")

    def validate(self):
        self._recovery_mount.run_shell(["ls", "subdir"])
        st = self._recovery_mount.stat("subdir/sixmegs")
        self.assert_equal(st['st_size'], self._initial_state['st_size'])
        return self._errors

class TestRecoveryPool(CephFSTestCase):
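    # The test framework provisions two MDS daemons, two client mounts and a
    # second ("recovery") filesystem for this test, per the attributes below.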
    MDSS_REQUIRED = 2
    CLIENTS_REQUIRED = 2
    REQUIRE_RECOVERY_FILESYSTEM = True

    def is_marked_damaged(self, rank):
        mds_map = self.fs.get_mds_map()
        return rank in mds_map['damaged']

    def _rebuild_metadata(self, workload, other_pool=None, workers=1):
        """
        Test that when all objects in the metadata pool are removed, we can rebuild
        metadata into an alternate pool based on the contents of the data pool, and
        a client can see and read our files.
        """

        # First, inject some files

        workload.write()

        # Unmount the clients and flush the journals: the tool should also cope
        # with situations where there is dirty metadata, but we'll test that
        # separately
        self.mount_a.umount_wait()
        self.mount_b.umount_wait()
        workload.flush()

        # Prepare the alternate (recovery) metadata pool
        recovery_fs = self.recovery_fs.name
        recovery_pool = self.recovery_fs.get_metadata_pool_name()
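        # "data_scan init" writes fresh root and MDS directory inode objects;
        # with --alternate-pool they are written into the recovery pool.  The
        # table_tool resets below install empty session/snap/inode tables for
        # the recovery filesystem's rank 0.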
        self.recovery_fs.data_scan(['init', '--force-init',
                                    '--filesystem', recovery_fs,
                                    '--alternate-pool', recovery_pool])
        self.recovery_fs.mon_manager.raw_cluster_cmd('-s')
        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "session"])
        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "snap"])
        self.recovery_fs.table_tool([recovery_fs + ":0", "reset", "inode"])

        # Stop the MDS
        self.fs.mds_stop() # otherwise MDS will join once the fs is reset
        self.fs.fail()

        # After recovery, we need the MDS not to be strict about stats (these
        # options are off by default in production, but the QA suite enables
        # them, so we explicitly disable them here)
        self.fs.set_ceph_conf('mds', 'mds verify scatter', False)
        self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False)

        # Apply any data damage the workload wants
        workload.damage()

        # Reset the MDS map in case multiple ranks were in play: recovery procedure
        # only understands how to rebuild metadata under rank 0
        self.fs.reset()

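        # Install fresh, empty session/snap/inode tables for the original
        # filesystem's rank 0 (the old ones were destroyed along with the
        # metadata pool)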
        self.fs.table_tool([self.fs.name + ":0", "reset", "session"])
        self.fs.table_tool([self.fs.name + ":0", "reset", "snap"])
        self.fs.table_tool([self.fs.name + ":0", "reset", "inode"])

        # Run the recovery procedure
        if False:
            with self.assertRaises(CommandFailedError):
                # Normal reset should fail when no objects are present, we'll use --force instead
                self.fs.journal_tool(["journal", "reset"], 0)

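        # First pass: scan_extents walks the data pool and accumulates each
        # file's size and mtime into xattrs on the file's first object.
        # Second pass: scan_inodes reads those xattrs and the backtraces and
        # injects recovered inodes and dentries into the alternate metadata
        # pool.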
        self.fs.data_scan(['scan_extents', '--alternate-pool',
                           recovery_pool, '--filesystem', self.fs.name,
                           self.fs.get_data_pool_name()])
        self.fs.data_scan(['scan_inodes', '--alternate-pool',
                           recovery_pool, '--filesystem', self.fs.name,
                           '--force-corrupt', '--force-init',
                           self.fs.get_data_pool_name()])
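        # recover_dentries scans the (possibly empty) journal for events and
        # writes any dentries it finds into the alternate metadata pool.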
        self.fs.journal_tool(['event', 'recover_dentries', 'list',
                              '--alternate-pool', recovery_pool], 0)

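        # The original filesystem's metadata is rebuilt in place as well, so
        # that it comes back healthy and can be mounted again by mount_a.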
        self.fs.data_scan(['init', '--force-init', '--filesystem',
                           self.fs.name])
        self.fs.data_scan(['scan_inodes', '--filesystem', self.fs.name,
                           '--force-corrupt', '--force-init',
                           self.fs.get_data_pool_name()])
        self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0)

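        # Write fresh, empty journals for both filesystems; --force is needed
        # because there is no valid journal to read back first.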
        self.recovery_fs.journal_tool(['journal', 'reset', '--force'], 0)
        self.fs.journal_tool(['journal', 'reset', '--force'], 0)
        # Mark rank 0 of both filesystems repaired, clearing any damaged flag
        # so that MDS daemons are allowed to take the rank again
        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired',
                                            recovery_fs + ":0")
        self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0')

        # Start the MDS
        self.fs.mds_restart()
        self.fs.set_joinable()
        self.recovery_fs.mds_restart()
        self.fs.wait_for_daemons()
        self.recovery_fs.wait_for_daemons()
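        # Bump MDS debugging and start a recursive repair scrub on every active
        # rank of the recovery filesystem, so that recovered metadata (e.g.
        # directory statistics and backtraces) is checked and repaired.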
        status = self.recovery_fs.status()
        for rank in self.recovery_fs.get_ranks(status=status):
            self.fs.mon_manager.raw_cluster_cmd('tell', "mds." + rank['name'],
                                                'injectargs', '--debug-mds=20')
            self.fs.rank_tell(['scrub', 'start', '/', 'recursive,repair'], rank=rank['rank'], status=status)
        log.info(str(self.mds_cluster.status()))

        # Mount the clients: mount_a on the original filesystem, mount_b on the
        # recovery filesystem
        self.mount_a.mount_wait()
        self.mount_b.mount_wait(cephfs_name=recovery_fs)

        # See that the files are present and correct
        errors = workload.validate()
        if errors:
            log.error("Validation errors found: {0}".format(len(errors)))
            for e in errors:
                log.error(e.exception)
                log.error(e.backtrace)
            raise AssertionError("Validation failed, first error: {0}\n{1}".format(
                errors[0].exception, errors[0].backtrace
            ))

    def test_rebuild_simple(self):
        self._rebuild_metadata(SimpleOverlayWorkload(self.fs, self.recovery_fs,
                                                     self.mount_a, self.mount_b))