| author | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
|---|---|---|
| committer | Daniel Baumann <daniel.baumann@progress-linux.org> | 2024-04-21 11:54:28 +0000 |
| commit | e6918187568dbd01842d8d1d2c808ce16a894239 (patch) | |
| tree | 64f88b554b444a49f656b6c656111a145cbbaa28 /qa/tasks/cephfs | |
| parent | Initial commit. (diff) | |
| download | ceph-e6918187568dbd01842d8d1d2c808ce16a894239.tar.xz, ceph-e6918187568dbd01842d8d1d2c808ce16a894239.zip | |
Adding upstream version 18.2.2. (upstream/18.2.2)
Signed-off-by: Daniel Baumann <daniel.baumann@progress-linux.org>
Diffstat (limited to 'qa/tasks/cephfs')
52 files changed, 29976 insertions, 0 deletions
diff --git a/qa/tasks/cephfs/__init__.py b/qa/tasks/cephfs/__init__.py new file mode 100644 index 000000000..e69de29bb --- /dev/null +++ b/qa/tasks/cephfs/__init__.py diff --git a/qa/tasks/cephfs/caps_helper.py b/qa/tasks/cephfs/caps_helper.py new file mode 100644 index 000000000..ac9bc4401 --- /dev/null +++ b/qa/tasks/cephfs/caps_helper.py @@ -0,0 +1,195 @@ +""" +Helper methods to test that MON and MDS caps are enforced properly. +""" +from os.path import join as os_path_join +from logging import getLogger + +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +from teuthology.orchestra.run import Raw + + +log = getLogger(__name__) + + +class CapTester(CephFSTestCase): + """ + Test that MON and MDS caps are enforced. + + MDS caps are tested by exercising read-write permissions and MON caps are + tested using output of command "ceph fs ls". Besides, it provides + write_test_files() which creates test files at the given path on CephFS + mounts passed to it. + + USAGE: Call write_test_files() method at the beginning of the test and + once the caps that needs to be tested are assigned to the client and + CephFS be remount for caps to effective, call run_cap_tests(), + run_mon_cap_tests() or run_mds_cap_tests() as per the need. + """ + + def write_test_files(self, mounts, testpath=''): + """ + Exercising 'r' and 'w' access levels on a file on CephFS mount is + pretty routine across all tests for caps. Adding to method to write + that file will reduce clutter in these tests. + + This methods writes a fixed data in a file with a fixed name located + at the path passed in testpath for the given list of mounts. If + testpath is empty, the file is created at the root of the CephFS. + """ + dirname, filename = 'testdir', 'testfile' + self.test_set = [] + # XXX: The reason behind testpath[1:] below is that the testpath is + # supposed to contain a path inside CephFS (which might be passed as + # an absolute path). os.path.join() deletes all previous path + # components when it encounters a path component starting with '/'. + # Deleting the first '/' from the string in testpath ensures that + # previous path components are not deleted by os.path.join(). + if testpath: + testpath = testpath[1:] if testpath[0] == '/' else testpath + # XXX: passing just '/' screw up os.path.join() ahead. + if testpath == '/': + testpath = '' + + for mount_x in mounts: + log.info(f'creating test file on FS {mount_x.cephfs_name} ' + f'mounted at {mount_x.mountpoint}...') + dirpath = os_path_join(mount_x.hostfs_mntpt, testpath, dirname) + mount_x.run_shell(f'mkdir {dirpath}') + filepath = os_path_join(dirpath, filename) + # XXX: the reason behind adding filepathm, cephfs_name and both + # mntpts is to avoid a test bug where we mount cephfs1 but what + # ends up being mounted cephfs2. since filepath and filedata are + # identical, how would tests figure otherwise that they are + # accessing the right filename but on wrong CephFS. 
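The comment above explains why the test file's contents embed its own path and the filesystem identity: identical file names on two filesystems must still yield distinguishable data. A minimal standalone sketch of that idea, using a plain namedtuple as a stand-in for the real teuthology mount object (not the actual classes in this diff):

```python
from collections import namedtuple

# Stand-in for the real mount object; only the attributes used by the
# filedata format in write_test_files() are modelled here.
Mount = namedtuple('Mount', 'cephfs_name cephfs_mntpt hostfs_mntpt')

def make_filedata(mount, filepath):
    # Same layout as the filedata string written by write_test_files().
    return (f'filepath = {filepath}\n'
            f'cephfs_name = {mount.cephfs_name}\n'
            f'cephfs_mntpt = {mount.cephfs_mntpt}\n'
            f'hostfs_mntpt = {mount.hostfs_mntpt}')

m1 = Mount('cephfs1', '/', '/mnt/cephfs1')
m2 = Mount('cephfs2', '/', '/mnt/cephfs2')
d1 = make_filedata(m1, '/mnt/cephfs1/testdir/testfile')
d2 = make_filedata(m2, '/mnt/cephfs2/testdir/testfile')
assert d1 != d2  # same file name, but the contents identify the filesystem
```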
+ filedata = (f'filepath = {filepath}\n' + f'cephfs_name = {mount_x.cephfs_name}\n' + f'cephfs_mntpt = {mount_x.cephfs_mntpt}\n' + f'hostfs_mntpt = {mount_x.hostfs_mntpt}') + mount_x.write_file(filepath, filedata) + self.test_set.append((mount_x, filepath, filedata)) + log.info('test file created at {path} with data "{data}.') + + def run_cap_tests(self, perm, mntpt=None): + # TODO + #self.run_mon_cap_tests() + self.run_mds_cap_tests(perm, mntpt=mntpt) + + def _get_fsnames_from_moncap(self, moncap): + fsnames = [] + while moncap.find('fsname=') != -1: + fsname_first_char = moncap.index('fsname=') + len('fsname=') + + if ',' in moncap: + last = moncap.index(',') + fsname = moncap[fsname_first_char : last] + moncap = moncap.replace(moncap[0 : last+1], '') + else: + fsname = moncap[fsname_first_char : ] + moncap = moncap.replace(moncap[0 : ], '') + + fsnames.append(fsname) + + return fsnames + + def run_mon_cap_tests(self, def_fs, client_id): + """ + Check that MON cap is enforced for a client by searching for a Ceph + FS name in output of cmd "fs ls" executed with that client's caps. + + def_fs stands for default FS on Ceph cluster. + """ + get_cluster_cmd_op = def_fs.mon_manager.raw_cluster_cmd + + keyring = get_cluster_cmd_op(args=f'auth get client.{client_id}') + + moncap = None + for line in keyring.split('\n'): + if 'caps mon' in line: + moncap = line[line.find(' = "') + 4 : -1] + break + else: + raise RuntimeError('run_mon_cap_tests(): mon cap not found in ' + 'keyring. keyring -\n' + keyring) + + keyring_path = def_fs.admin_remote.mktemp(data=keyring) + + fsls = get_cluster_cmd_op( + args=f'fs ls --id {client_id} -k {keyring_path}') + log.info(f'output of fs ls cmd run by client.{client_id} -\n{fsls}') + + if 'fsname=' not in moncap: + log.info('no FS name is mentioned in moncap, client has ' + 'permission to list all files. moncap -\n{moncap}') + log.info('testing for presence of all FS names in output of ' + '"fs ls" command run by client.') + + fsls_admin = get_cluster_cmd_op(args='fs ls') + log.info('output of fs ls cmd run by admin -\n{fsls_admin}') + + self.assertEqual(fsls, fsls_admin) + return + + log.info('FS names are mentioned in moncap. moncap -\n{moncap}') + log.info('testing for presence of these FS names in output of ' + '"fs ls" command run by client.') + for fsname in self._get_fsnames_from_moncap(moncap): + self.assertIn('name: ' + fsname, fsls) + + def run_mds_cap_tests(self, perm, mntpt=None): + """ + Run test for read perm and, for write perm, run positive test if it + is present and run negative test if not. + """ + # XXX: mntpt is path inside cephfs that serves as root for current + # mount. Therefore, this path must me deleted from self.filepaths. 
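For reference, a self-contained sketch of the fsname extraction that the `_get_fsnames_from_moncap()` helper above performs on a MON cap string; the cap strings below are illustrative, not taken from a real cluster:

```python
def fsnames_from_moncap(moncap):
    # Pull every fsname=<name> clause out of a MON cap string, assuming one
    # clause per comma-separated grant, as in the helper above.
    fsnames = []
    while 'fsname=' in moncap:
        start = moncap.index('fsname=') + len('fsname=')
        if ',' in moncap:
            end = moncap.index(',')
            fsnames.append(moncap[start:end])
            moncap = moncap[end + 1:]
        else:
            fsnames.append(moncap[start:])
            moncap = ''
    return fsnames

assert fsnames_from_moncap('allow rw fsname=a, allow rw fsname=b') == ['a', 'b']
assert fsnames_from_moncap('allow r') == []  # no fsname clause: all FSs visible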
+ # Example - + # orignal path: /mnt/cephfs_x/dir1/dir2/testdir + # cephfs dir serving as root for current mnt: /dir1/dir2 + # therefore, final path: /mnt/cephfs_x//testdir + if mntpt: + self.test_set = [(x, y.replace(mntpt, ''), z) for x, y, z in \ + self.test_set] + + self.conduct_pos_test_for_read_caps() + + if perm == 'rw': + self.conduct_pos_test_for_write_caps() + elif perm == 'r': + self.conduct_neg_test_for_write_caps() + else: + raise RuntimeError(f'perm = {perm}\nIt should be "r" or "rw".') + + def conduct_pos_test_for_read_caps(self): + for mount, path, data in self.test_set: + log.info(f'test read perm: read file {path} and expect data ' + f'"{data}"') + contents = mount.read_file(path) + self.assertEqual(data, contents) + log.info(f'read perm was tested successfully: "{data}" was ' + f'successfully read from path {path}') + + def conduct_pos_test_for_write_caps(self): + for mount, path, data in self.test_set: + log.info(f'test write perm: try writing data "{data}" to ' + f'file {path}.') + mount.write_file(path=path, data=data) + contents = mount.read_file(path=path) + self.assertEqual(data, contents) + log.info(f'write perm was tested was successfully: data ' + f'"{data}" was successfully written to file "{path}".') + + def conduct_neg_test_for_write_caps(self, sudo_write=False): + possible_errmsgs = ('permission denied', 'operation not permitted') + cmdargs = ['echo', 'some random data', Raw('|')] + cmdargs += ['sudo', 'tee'] if sudo_write else ['tee'] + + # don't use data, cmd args to write are set already above. + for mount, path, data in self.test_set: + log.info('test absence of write perm: expect failure ' + f'writing data to file {path}.') + cmdargs.append(path) + mount.negtestcmd(args=cmdargs, retval=1, errmsgs=possible_errmsgs) + cmdargs.pop(-1) + log.info('absence of write perm was tested successfully: ' + f'failed to be write data to file {path}.') diff --git a/qa/tasks/cephfs/cephfs_test_case.py b/qa/tasks/cephfs/cephfs_test_case.py new file mode 100644 index 000000000..d2688929c --- /dev/null +++ b/qa/tasks/cephfs/cephfs_test_case.py @@ -0,0 +1,442 @@ +import json +import logging +import os +import re + +from shlex import split as shlex_split + +from tasks.ceph_test_case import CephTestCase + +from teuthology import contextutil +from teuthology.orchestra import run +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + +def classhook(m): + def dec(cls): + getattr(cls, m)() + return cls + return dec + +def for_teuthology(f): + """ + Decorator that adds an "is_for_teuthology" attribute to the wrapped function + """ + f.is_for_teuthology = True + return f + + +def needs_trimming(f): + """ + Mark fn as requiring a client capable of trimming its cache (i.e. 
for ceph-fuse + this means it needs to be able to run as root, currently) + """ + f.needs_trimming = True + return f + + +class MountDetails(): + + def __init__(self, mntobj): + self.client_id = mntobj.client_id + self.client_keyring_path = mntobj.client_keyring_path + self.client_remote = mntobj.client_remote + self.cephfs_name = mntobj.cephfs_name + self.cephfs_mntpt = mntobj.cephfs_mntpt + self.hostfs_mntpt = mntobj.hostfs_mntpt + + def restore(self, mntobj): + mntobj.client_id = self.client_id + mntobj.client_keyring_path = self.client_keyring_path + mntobj.client_remote = self.client_remote + mntobj.cephfs_name = self.cephfs_name + mntobj.cephfs_mntpt = self.cephfs_mntpt + mntobj.hostfs_mntpt = self.hostfs_mntpt + + +class CephFSTestCase(CephTestCase): + """ + Test case for Ceph FS, requires caller to populate Filesystem and Mounts, + into the fs, mount_a, mount_b class attributes (setting mount_b is optional) + + Handles resetting the cluster under test between tests. + """ + + # FIXME weird explicit naming + mount_a = None + mount_b = None + recovery_mount = None + + # Declarative test requirements: subclasses should override these to indicate + # their special needs. If not met, tests will be skipped. + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + REQUIRE_ONE_CLIENT_REMOTE = False + + # Whether to create the default filesystem during setUp + REQUIRE_FILESYSTEM = True + + # create a backup filesystem if required. + # required REQUIRE_FILESYSTEM enabled + REQUIRE_BACKUP_FILESYSTEM = False + + LOAD_SETTINGS = [] # type: ignore + + def _save_mount_details(self): + """ + XXX: Tests may change details of mount objects, so let's stash them so + that these details are restored later to ensure smooth setUps and + tearDowns for upcoming tests. + """ + self._orig_mount_details = [MountDetails(m) for m in self.mounts] + log.info(self._orig_mount_details) + + def _remove_blocklist(self): + # In case anything is in the OSD blocklist list, clear it out. This is to avoid + # the OSD map changing in the background (due to blocklist expiry) while tests run. + try: + self.mds_cluster.mon_manager.run_cluster_cmd(args="osd blocklist clear") + except CommandFailedError: + # Fallback for older Ceph cluster + try: + blocklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd", + "dump", "--format=json-pretty"))['blocklist'] + log.info(f"Removing {len(blocklist)} blocklist entries") + for addr, blocklisted_at in blocklist.items(): + self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blocklist", "rm", addr) + except KeyError: + # Fallback for more older Ceph clusters, who will use 'blacklist' instead. 
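As a purely hypothetical illustration of the declarative requirements described above (the class name, test body and paths are invented; it assumes the ceph qa tree and teuthology are on the import path), a subclass only declares what it needs and setUp() skips the test when the cluster cannot satisfy it:

```python
from os.path import join

from tasks.cephfs.cephfs_test_case import CephFSTestCase


class TestTwoClientVisibility(CephFSTestCase):
    CLIENTS_REQUIRED = 2   # setUp() skips the test if fewer clients are available
    MDSS_REQUIRED = 1
    REQUIRE_FILESYSTEM = True

    def test_write_seen_by_other_client(self):
        # mount_a/mount_b are populated by setUp() from self.mounts.
        path_a = join(self.mount_a.hostfs_mntpt, 'testfile')
        path_b = join(self.mount_b.hostfs_mntpt, 'testfile')
        self.mount_a.write_file(path_a, 'hello')
        self.assertEqual(self.mount_b.read_file(path_b), 'hello')
```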
+ blacklist = json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd("osd", + "dump", "--format=json-pretty"))['blacklist'] + log.info(f"Removing {len(blacklist)} blacklist entries") + for addr, blocklisted_at in blacklist.items(): + self.mds_cluster.mon_manager.raw_cluster_cmd("osd", "blacklist", "rm", addr) + + def setUp(self): + super(CephFSTestCase, self).setUp() + + self.config_set('mon', 'mon_allow_pool_delete', True) + + if len(self.mds_cluster.mds_ids) < self.MDSS_REQUIRED: + self.skipTest("Only have {0} MDSs, require {1}".format( + len(self.mds_cluster.mds_ids), self.MDSS_REQUIRED + )) + + if len(self.mounts) < self.CLIENTS_REQUIRED: + self.skipTest("Only have {0} clients, require {1}".format( + len(self.mounts), self.CLIENTS_REQUIRED + )) + + if self.REQUIRE_ONE_CLIENT_REMOTE: + if self.mounts[0].client_remote.hostname in self.mds_cluster.get_mds_hostnames(): + self.skipTest("Require first client to be on separate server from MDSs") + + # Create friendly mount_a, mount_b attrs + for i in range(0, self.CLIENTS_REQUIRED): + setattr(self, "mount_{0}".format(chr(ord('a') + i)), self.mounts[i]) + + self.mds_cluster.clear_firewall() + + # Unmount all clients, we are about to blow away the filesystem + for mount in self.mounts: + if mount.is_mounted(): + mount.umount_wait(force=True) + self._save_mount_details() + + # To avoid any issues with e.g. unlink bugs, we destroy and recreate + # the filesystem rather than just doing a rm -rf of files + self.mds_cluster.delete_all_filesystems() + self.mds_cluster.mds_restart() # to reset any run-time configs, etc. + self.fs = None # is now invalid! + self.backup_fs = None + self.recovery_fs = None + + self._remove_blocklist() + + client_mount_ids = [m.client_id for m in self.mounts] + # In case there were any extra auth identities around from a previous + # test, delete them + for entry in self.auth_list(): + ent_type, ent_id = entry['entity'].split(".") + if ent_type == "client" and ent_id not in client_mount_ids and not (ent_id == "admin" or ent_id[:6] == 'mirror'): + self.mds_cluster.mon_manager.raw_cluster_cmd("auth", "del", entry['entity']) + + if self.REQUIRE_FILESYSTEM: + self.fs = self.mds_cluster.newfs(create=True) + + # In case some test messed with auth caps, reset them + for client_id in client_mount_ids: + cmd = ['auth', 'caps', f'client.{client_id}', 'mon','allow r', + 'osd', f'allow rw tag cephfs data={self.fs.name}', + 'mds', 'allow'] + + if self.run_cluster_cmd_result(cmd) == 0: + break + + cmd[1] = 'add' + if self.run_cluster_cmd_result(cmd) != 0: + raise RuntimeError(f'Failed to create new client {cmd[2]}') + + # wait for ranks to become active + self.fs.wait_for_daemons() + + # Mount the requested number of clients + for i in range(0, self.CLIENTS_REQUIRED): + self.mounts[i].mount_wait() + + if self.REQUIRE_BACKUP_FILESYSTEM: + if not self.REQUIRE_FILESYSTEM: + self.skipTest("backup filesystem requires a primary filesystem as well") + self.fs.mon_manager.raw_cluster_cmd('fs', 'flag', 'set', + 'enable_multiple', 'true', + '--yes-i-really-mean-it') + self.backup_fs = self.mds_cluster.newfs(name="backup_fs") + self.backup_fs.wait_for_daemons() + + # Load an config settings of interest + for setting in self.LOAD_SETTINGS: + setattr(self, setting, float(self.fs.mds_asok( + ['config', 'get', setting], list(self.mds_cluster.mds_ids)[0] + )[setting])) + + self.configs_set = set() + + def tearDown(self): + self.mds_cluster.clear_firewall() + for m in self.mounts: + m.teardown() + + # To prevent failover messages during Unwind of ceph 
task + self.mds_cluster.delete_all_filesystems() + + for m, md in zip(self.mounts, self._orig_mount_details): + md.restore(m) + + for subsys, key in self.configs_set: + self.mds_cluster.clear_ceph_conf(subsys, key) + + return super(CephFSTestCase, self).tearDown() + + def set_conf(self, subsys, key, value): + self.configs_set.add((subsys, key)) + self.mds_cluster.set_ceph_conf(subsys, key, value) + + def auth_list(self): + """ + Convenience wrapper on "ceph auth ls" + """ + return json.loads(self.mds_cluster.mon_manager.raw_cluster_cmd( + "auth", "ls", "--format=json-pretty" + ))['auth_dump'] + + def assert_session_count(self, expected, ls_data=None, mds_id=None): + if ls_data is None: + ls_data = self.fs.mds_asok(['session', 'ls'], mds_id=mds_id) + + alive_count = len([s for s in ls_data if s['state'] != 'killing']) + + self.assertEqual(expected, alive_count, "Expected {0} sessions, found {1}".format( + expected, alive_count + )) + + def assert_session_state(self, client_id, expected_state): + self.assertEqual( + self._session_by_id( + self.fs.mds_asok(['session', 'ls'])).get(client_id, {'state': None})['state'], + expected_state) + + def get_session_data(self, client_id): + return self._session_by_id(client_id) + + def _session_list(self): + ls_data = self.fs.mds_asok(['session', 'ls']) + ls_data = [s for s in ls_data if s['state'] not in ['stale', 'closed']] + return ls_data + + def get_session(self, client_id, session_ls=None): + if session_ls is None: + session_ls = self.fs.mds_asok(['session', 'ls']) + + return self._session_by_id(session_ls)[client_id] + + def _session_by_id(self, session_ls): + return dict([(s['id'], s) for s in session_ls]) + + def perf_dump(self, rank=None, status=None): + return self.fs.rank_asok(['perf', 'dump'], rank=rank, status=status) + + def wait_until_evicted(self, client_id, timeout=30): + def is_client_evicted(): + ls = self._session_list() + for s in ls: + if s['id'] == client_id: + return False + return True + self.wait_until_true(is_client_evicted, timeout) + + def wait_for_daemon_start(self, daemon_ids=None): + """ + Wait until all the daemons appear in the FSMap, either assigned + MDS ranks or in the list of standbys + """ + def get_daemon_names(): + return [info['name'] for info in self.mds_cluster.status().get_all()] + + if daemon_ids is None: + daemon_ids = self.mds_cluster.mds_ids + + try: + self.wait_until_true( + lambda: set(daemon_ids) & set(get_daemon_names()) == set(daemon_ids), + timeout=30 + ) + except RuntimeError: + log.warning("Timeout waiting for daemons {0}, while we have {1}".format( + daemon_ids, get_daemon_names() + )) + raise + + def delete_mds_coredump(self, daemon_id): + # delete coredump file, otherwise teuthology.internal.coredump will + # catch it later and treat it as a failure. 
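A standalone sketch of the predicate behind wait_until_evicted() above: a client counts as evicted once its id no longer appears in the MDS "session ls" output. The session list literal is illustrative only:

```python
def is_client_evicted(session_ls, client_id):
    # Mirrors the inner check in wait_until_evicted(): no session carries this id.
    return all(s['id'] != client_id for s in session_ls)

sessions = [{'id': 4199, 'state': 'open'}, {'id': 4202, 'state': 'stale'}]
assert not is_client_evicted(sessions, 4199)
assert is_client_evicted(sessions, 4301)
```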
+ core_pattern = self.mds_cluster.mds_daemons[daemon_id].remote.sh( + "sudo sysctl -n kernel.core_pattern") + core_dir = os.path.dirname(core_pattern.strip()) + if core_dir: # Non-default core_pattern with a directory in it + # We have seen a core_pattern that looks like it's from teuthology's coredump + # task, so proceed to clear out the core file + if core_dir[0] == '|': + log.info("Piped core dumps to program {0}, skip cleaning".format(core_dir[1:])) + return; + + log.info("Clearing core from directory: {0}".format(core_dir)) + + # Verify that we see the expected single coredump + ls_output = self.mds_cluster.mds_daemons[daemon_id].remote.sh([ + "cd", core_dir, run.Raw('&&'), + "sudo", "ls", run.Raw('|'), "sudo", "xargs", "file" + ]) + cores = [l.partition(":")[0] + for l in ls_output.strip().split("\n") + if re.match(r'.*ceph-mds.* -i +{0}'.format(daemon_id), l)] + + log.info("Enumerated cores: {0}".format(cores)) + self.assertEqual(len(cores), 1) + + log.info("Found core file {0}, deleting it".format(cores[0])) + + self.mds_cluster.mds_daemons[daemon_id].remote.run(args=[ + "cd", core_dir, run.Raw('&&'), "sudo", "rm", "-f", cores[0] + ]) + else: + log.info("No core_pattern directory set, nothing to clear (internal.coredump not enabled?)") + + def _get_subtrees(self, status=None, rank=None, path=None): + if path is None: + path = "/" + try: + with contextutil.safe_while(sleep=1, tries=3) as proceed: + while proceed(): + try: + if rank == "all": + subtrees = [] + for r in self.fs.get_ranks(status=status): + s = self.fs.rank_asok(["get", "subtrees"], status=status, rank=r['rank']) + s = filter(lambda s: s['auth_first'] == r['rank'] and s['auth_second'] == -2, s) + subtrees += s + else: + subtrees = self.fs.rank_asok(["get", "subtrees"], status=status, rank=rank) + subtrees = filter(lambda s: s['dir']['path'].startswith(path), subtrees) + return list(subtrees) + except CommandFailedError as e: + # Sometimes we get transient errors + if e.exitstatus == 22: + pass + else: + raise + except contextutil.MaxWhileTries as e: + raise RuntimeError(f"could not get subtree state from rank {rank}") from e + + def _wait_subtrees(self, test, status=None, rank=None, timeout=30, sleep=2, action=None, path=None): + test = sorted(test) + try: + with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed: + while proceed(): + subtrees = self._get_subtrees(status=status, rank=rank, path=path) + filtered = sorted([(s['dir']['path'], s['auth_first']) for s in subtrees]) + log.info("%s =?= %s", filtered, test) + if filtered == test: + # Confirm export_pin in output is correct: + for s in subtrees: + if s['export_pin_target'] >= 0: + self.assertTrue(s['export_pin_target'] == s['auth_first']) + return subtrees + if action is not None: + action() + except contextutil.MaxWhileTries as e: + raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e + + def _wait_until_scrub_complete(self, path="/", recursive=True, timeout=100): + out_json = self.fs.run_scrub(["start", path] + ["recursive"] if recursive else []) + if not self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"], + sleep=10, timeout=timeout): + log.info("timed out waiting for scrub to complete") + + def _wait_distributed_subtrees(self, count, status=None, rank=None, path=None): + try: + with contextutil.safe_while(sleep=5, tries=20) as proceed: + while proceed(): + subtrees = self._get_subtrees(status=status, rank=rank, path=path) + subtrees = list(filter(lambda s: s['distributed_ephemeral_pin'] == True 
and + s['auth_first'] == s['export_pin_target'], + subtrees)) + log.info(f"len={len(subtrees)} {subtrees}") + if len(subtrees) >= count: + return subtrees + except contextutil.MaxWhileTries as e: + raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e + + def _wait_random_subtrees(self, count, status=None, rank=None, path=None): + try: + with contextutil.safe_while(sleep=5, tries=20) as proceed: + while proceed(): + subtrees = self._get_subtrees(status=status, rank=rank, path=path) + subtrees = list(filter(lambda s: s['random_ephemeral_pin'] == True and + s['auth_first'] == s['export_pin_target'], + subtrees)) + log.info(f"len={len(subtrees)} {subtrees}") + if len(subtrees) >= count: + return subtrees + except contextutil.MaxWhileTries as e: + raise RuntimeError("rank {0} failed to reach desired subtree state".format(rank)) from e + + def run_cluster_cmd(self, cmd): + if isinstance(cmd, str): + cmd = shlex_split(cmd) + return self.fs.mon_manager.raw_cluster_cmd(*cmd) + + def run_cluster_cmd_result(self, cmd): + if isinstance(cmd, str): + cmd = shlex_split(cmd) + return self.fs.mon_manager.raw_cluster_cmd_result(*cmd) + + def create_client(self, client_id, moncap=None, osdcap=None, mdscap=None): + if not (moncap or osdcap or mdscap): + if self.fs: + return self.fs.authorize(client_id, ('/', 'rw')) + else: + raise RuntimeError('no caps were passed and the default FS ' + 'is not created yet to allow client auth ' + 'for it.') + + cmd = ['auth', 'add', f'client.{client_id}'] + if moncap: + cmd += ['mon', moncap] + if osdcap: + cmd += ['osd', osdcap] + if mdscap: + cmd += ['mds', mdscap] + + self.run_cluster_cmd(cmd) + return self.run_cluster_cmd(f'auth get {self.client_name}') diff --git a/qa/tasks/cephfs/filesystem.py b/qa/tasks/cephfs/filesystem.py new file mode 100644 index 000000000..777ba8249 --- /dev/null +++ b/qa/tasks/cephfs/filesystem.py @@ -0,0 +1,1712 @@ + +import json +import logging +from gevent import Greenlet +import os +import time +import datetime +import re +import errno +import random + +from io import BytesIO, StringIO +from errno import EBUSY + +from teuthology.exceptions import CommandFailedError +from teuthology import misc +from teuthology.nuke import clear_firewall +from teuthology.parallel import parallel +from teuthology import contextutil +from tasks.ceph_manager import write_conf +from tasks import ceph_manager + + +log = logging.getLogger(__name__) + + +DAEMON_WAIT_TIMEOUT = 120 +ROOT_INO = 1 + +class FileLayout(object): + def __init__(self, pool=None, pool_namespace=None, stripe_unit=None, stripe_count=None, object_size=None): + self.pool = pool + self.pool_namespace = pool_namespace + self.stripe_unit = stripe_unit + self.stripe_count = stripe_count + self.object_size = object_size + + @classmethod + def load_from_ceph(layout_str): + # TODO + pass + + def items(self): + if self.pool is not None: + yield ("pool", self.pool) + if self.pool_namespace: + yield ("pool_namespace", self.pool_namespace) + if self.stripe_unit is not None: + yield ("stripe_unit", self.stripe_unit) + if self.stripe_count is not None: + yield ("stripe_count", self.stripe_count) + if self.object_size is not None: + yield ("object_size", self.stripe_size) + +class ObjectNotFound(Exception): + def __init__(self, object_name): + self._object_name = object_name + + def __str__(self): + return "Object not found: '{0}'".format(self._object_name) + +class FSMissing(Exception): + def __init__(self, ident): + self.ident = ident + + def __str__(self): + return f"File 
system {self.ident} does not exist in the map" + +class FSStatus(object): + """ + Operations on a snapshot of the FSMap. + """ + def __init__(self, mon_manager, epoch=None): + self.mon = mon_manager + cmd = ["fs", "dump", "--format=json"] + if epoch is not None: + cmd.append(str(epoch)) + self.map = json.loads(self.mon.raw_cluster_cmd(*cmd)) + + def __str__(self): + return json.dumps(self.map, indent = 2, sort_keys = True) + + # Expose the fsmap for manual inspection. + def __getitem__(self, key): + """ + Get a field from the fsmap. + """ + return self.map[key] + + def get_filesystems(self): + """ + Iterator for all filesystems. + """ + for fs in self.map['filesystems']: + yield fs + + def get_all(self): + """ + Iterator for all the mds_info components in the FSMap. + """ + for info in self.map['standbys']: + yield info + for fs in self.map['filesystems']: + for info in fs['mdsmap']['info'].values(): + yield info + + def get_standbys(self): + """ + Iterator for all standbys. + """ + for info in self.map['standbys']: + yield info + + def get_fsmap(self, fscid): + """ + Get the fsmap for the given FSCID. + """ + for fs in self.map['filesystems']: + if fscid is None or fs['id'] == fscid: + return fs + raise FSMissing(fscid) + + def get_fsmap_byname(self, name): + """ + Get the fsmap for the given file system name. + """ + for fs in self.map['filesystems']: + if name is None or fs['mdsmap']['fs_name'] == name: + return fs + raise FSMissing(name) + + def get_replays(self, fscid): + """ + Get the standby:replay MDS for the given FSCID. + """ + fs = self.get_fsmap(fscid) + for info in fs['mdsmap']['info'].values(): + if info['state'] == 'up:standby-replay': + yield info + + def get_ranks(self, fscid): + """ + Get the ranks for the given FSCID. + """ + fs = self.get_fsmap(fscid) + for info in fs['mdsmap']['info'].values(): + if info['rank'] >= 0 and info['state'] != 'up:standby-replay': + yield info + + def get_damaged(self, fscid): + """ + Get the damaged ranks for the given FSCID. + """ + fs = self.get_fsmap(fscid) + return fs['mdsmap']['damaged'] + + def get_rank(self, fscid, rank): + """ + Get the rank for the given FSCID. + """ + for info in self.get_ranks(fscid): + if info['rank'] == rank: + return info + raise RuntimeError("FSCID {0} has no rank {1}".format(fscid, rank)) + + def get_mds(self, name): + """ + Get the info for the given MDS name. + """ + for info in self.get_all(): + if info['name'] == name: + return info + return None + + def get_mds_addr(self, name): + """ + Return the instance addr as a string, like "10.214.133.138:6807\/10825" + """ + info = self.get_mds(name) + if info: + return info['addr'] + else: + log.warning(json.dumps(list(self.get_all()), indent=2)) # dump for debugging + raise RuntimeError("MDS id '{0}' not found in map".format(name)) + + def get_mds_addrs(self, name): + """ + Return the instance addr as a string, like "[10.214.133.138:6807 10.214.133.138:6808]" + """ + info = self.get_mds(name) + if info: + return [e['addr'] for e in info['addrs']['addrvec']] + else: + log.warn(json.dumps(list(self.get_all()), indent=2)) # dump for debugging + raise RuntimeError("MDS id '{0}' not found in map".format(name)) + + def get_mds_gid(self, gid): + """ + Get the info for the given MDS gid. + """ + for info in self.get_all(): + if info['gid'] == gid: + return info + return None + + def hadfailover(self, status): + """ + Compares two statuses for mds failovers. + Returns True if there is a failover. 
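A standalone sketch of what FSStatus.get_ranks() selects from an fsmap: the mds_info entries with a non-negative rank that are not standby-replay daemons. The fsmap fragment below is hand-written for illustration, not real "fs dump" output:

```python
fsmap = {
    'id': 1,
    'mdsmap': {'info': {
        'gid_4242': {'name': 'a', 'rank': 0, 'state': 'up:active'},
        'gid_4343': {'name': 'b', 'rank': 0, 'state': 'up:standby-replay'},
    }},
}

def get_ranks(fs):
    # Same filter as FSStatus.get_ranks(): ranked, but not standby-replay.
    for info in fs['mdsmap']['info'].values():
        if info['rank'] >= 0 and info['state'] != 'up:standby-replay':
            yield info

assert [i['name'] for i in get_ranks(fsmap)] == ['a']
```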
+ """ + for fs in status.map['filesystems']: + for info in fs['mdsmap']['info'].values(): + oldinfo = self.get_mds_gid(info['gid']) + if oldinfo is None or oldinfo['incarnation'] != info['incarnation']: + return True + #all matching + return False + +class CephCluster(object): + @property + def admin_remote(self): + first_mon = misc.get_first_mon(self._ctx, None) + (result,) = self._ctx.cluster.only(first_mon).remotes.keys() + return result + + def __init__(self, ctx) -> None: + self._ctx = ctx + self.mon_manager = ceph_manager.CephManager(self.admin_remote, ctx=ctx, logger=log.getChild('ceph_manager')) + + def get_config(self, key, service_type=None): + """ + Get config from mon by default, or a specific service if caller asks for it + """ + if service_type is None: + service_type = 'mon' + + service_id = sorted(misc.all_roles_of_type(self._ctx.cluster, service_type))[0] + return self.json_asok(['config', 'get', key], service_type, service_id)[key] + + def set_ceph_conf(self, subsys, key, value): + if subsys not in self._ctx.ceph['ceph'].conf: + self._ctx.ceph['ceph'].conf[subsys] = {} + self._ctx.ceph['ceph'].conf[subsys][key] = value + write_conf(self._ctx) # XXX because we don't have the ceph task's config object, if they + # used a different config path this won't work. + + def clear_ceph_conf(self, subsys, key): + del self._ctx.ceph['ceph'].conf[subsys][key] + write_conf(self._ctx) + + def json_asok(self, command, service_type, service_id, timeout=None): + if timeout is None: + timeout = 300 + command.insert(0, '--format=json') + proc = self.mon_manager.admin_socket(service_type, service_id, command, timeout=timeout) + response_data = proc.stdout.getvalue().strip() + if len(response_data) > 0: + + def get_nonnumeric_values(value): + c = {"NaN": float("nan"), "Infinity": float("inf"), + "-Infinity": -float("inf")} + return c[value] + + j = json.loads(response_data.replace('inf', 'Infinity'), + parse_constant=get_nonnumeric_values) + pretty = json.dumps(j, sort_keys=True, indent=2) + log.debug(f"_json_asok output\n{pretty}") + return j + else: + log.debug("_json_asok output empty") + return None + + def is_addr_blocklisted(self, addr): + blocklist = json.loads(self.mon_manager.raw_cluster_cmd( + "osd", "dump", "--format=json"))['blocklist'] + if addr in blocklist: + return True + log.warn(f'The address {addr} is not blocklisted') + return False + + +class MDSCluster(CephCluster): + """ + Collective operations on all the MDS daemons in the Ceph cluster. These + daemons may be in use by various Filesystems. + + For the benefit of pre-multi-filesystem tests, this class is also + a parent of Filesystem. The correct way to use MDSCluster going forward is + as a separate instance outside of your (multiple) Filesystem instances. + """ + + def __init__(self, ctx): + super(MDSCluster, self).__init__(ctx) + + @property + def mds_ids(self): + # do this dynamically because the list of ids may change periodically with cephadm + return list(misc.all_roles_of_type(self._ctx.cluster, 'mds')) + + @property + def mds_daemons(self): + return dict([(mds_id, self._ctx.daemons.get_daemon('mds', mds_id)) for mds_id in self.mds_ids]) + + def _one_or_all(self, mds_id, cb, in_parallel=True): + """ + Call a callback for a single named MDS, or for all. + + Note that the parallelism here isn't for performance, it's to avoid being overly kind + to the cluster by waiting a graceful ssh-latency of time between doing things, and to + avoid being overly kind by executing them in a particular order. 
However, some actions + don't cope with being done in parallel, so it's optional (`in_parallel`) + + :param mds_id: MDS daemon name, or None + :param cb: Callback taking single argument of MDS daemon name + :param in_parallel: whether to invoke callbacks concurrently (else one after the other) + """ + + if mds_id is None: + if in_parallel: + with parallel() as p: + for mds_id in self.mds_ids: + p.spawn(cb, mds_id) + else: + for mds_id in self.mds_ids: + cb(mds_id) + else: + cb(mds_id) + + def get_config(self, key, service_type=None): + """ + get_config specialization of service_type="mds" + """ + if service_type != "mds": + return super(MDSCluster, self).get_config(key, service_type) + + # Some tests stop MDS daemons, don't send commands to a dead one: + running_daemons = [i for i, mds in self.mds_daemons.items() if mds.running()] + service_id = random.sample(running_daemons, 1)[0] + return self.json_asok(['config', 'get', key], service_type, service_id)[key] + + def mds_stop(self, mds_id=None): + """ + Stop the MDS daemon process(se). If it held a rank, that rank + will eventually go laggy. + """ + self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].stop()) + + def mds_fail(self, mds_id=None): + """ + Inform MDSMonitor of the death of the daemon process(es). If it held + a rank, that rank will be relinquished. + """ + self._one_or_all(mds_id, lambda id_: self.mon_manager.raw_cluster_cmd("mds", "fail", id_)) + + def mds_restart(self, mds_id=None): + self._one_or_all(mds_id, lambda id_: self.mds_daemons[id_].restart()) + + def mds_fail_restart(self, mds_id=None): + """ + Variation on restart that includes marking MDSs as failed, so that doing this + operation followed by waiting for healthy daemon states guarantees that they + have gone down and come up, rather than potentially seeing the healthy states + that existed before the restart. + """ + def _fail_restart(id_): + self.mds_daemons[id_].stop() + self.mon_manager.raw_cluster_cmd("mds", "fail", id_) + self.mds_daemons[id_].restart() + + self._one_or_all(mds_id, _fail_restart) + + def mds_signal(self, mds_id, sig, silent=False): + """ + signal a MDS daemon + """ + self.mds_daemons[mds_id].signal(sig, silent); + + def mds_is_running(self, mds_id): + return self.mds_daemons[mds_id].running() + + def newfs(self, name='cephfs', create=True): + return Filesystem(self._ctx, name=name, create=create) + + def status(self, epoch=None): + return FSStatus(self.mon_manager, epoch) + + def get_standby_daemons(self): + return set([s['name'] for s in self.status().get_standbys()]) + + def get_mds_hostnames(self): + result = set() + for mds_id in self.mds_ids: + mds_remote = self.mon_manager.find_remote('mds', mds_id) + result.add(mds_remote.hostname) + + return list(result) + + def set_clients_block(self, blocked, mds_id=None): + """ + Block (using iptables) client communications to this MDS. Be careful: if + other services are running on this MDS, or other MDSs try to talk to this + MDS, their communications may also be blocked as collatoral damage. 
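A minimal sketch of the dispatch idea behind _one_or_all() above: apply a callback to one named daemon, or to every daemon when no name is given. The real helper can additionally run the callbacks concurrently via teuthology's parallel() context; this sketch keeps them sequential:

```python
def one_or_all(daemon_ids, cb, daemon_id=None):
    # Run cb for the named daemon, or for all known daemons when none is given.
    targets = daemon_ids if daemon_id is None else [daemon_id]
    for d in targets:
        cb(d)

stopped = []
one_or_all(['a', 'b', 'c'], stopped.append)        # all daemons
one_or_all(['a', 'b', 'c'], stopped.append, 'b')   # just one
assert stopped == ['a', 'b', 'c', 'b']
```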
+ + :param mds_id: Optional ID of MDS to block, default to all + :return: + """ + da_flag = "-A" if blocked else "-D" + + def set_block(_mds_id): + remote = self.mon_manager.find_remote('mds', _mds_id) + status = self.status() + + addr = status.get_mds_addr(_mds_id) + ip_str, port_str, inst_str = re.match("(.+):(.+)/(.+)", addr).groups() + + remote.run( + args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m", + "comment", "--comment", "teuthology"]) + remote.run( + args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m", + "comment", "--comment", "teuthology"]) + + self._one_or_all(mds_id, set_block, in_parallel=False) + + def set_inter_mds_block(self, blocked, mds_rank_1, mds_rank_2): + """ + Block (using iptables) communications from a provided MDS to other MDSs. + Block all ports that an MDS uses for communication. + + :param blocked: True to block the MDS, False otherwise + :param mds_rank_1: MDS rank + :param mds_rank_2: MDS rank + :return: + """ + da_flag = "-A" if blocked else "-D" + + def set_block(mds_ids): + status = self.status() + + mds = mds_ids[0] + remote = self.mon_manager.find_remote('mds', mds) + addrs = status.get_mds_addrs(mds) + for addr in addrs: + ip_str, port_str = re.match("(.+):(.+)", addr).groups() + remote.run( + args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m", + "comment", "--comment", "teuthology"], omit_sudo=False) + + + mds = mds_ids[1] + remote = self.mon_manager.find_remote('mds', mds) + addrs = status.get_mds_addrs(mds) + for addr in addrs: + ip_str, port_str = re.match("(.+):(.+)", addr).groups() + remote.run( + args=["sudo", "iptables", da_flag, "OUTPUT", "-p", "tcp", "--sport", port_str, "-j", "REJECT", "-m", + "comment", "--comment", "teuthology"], omit_sudo=False) + remote.run( + args=["sudo", "iptables", da_flag, "INPUT", "-p", "tcp", "--dport", port_str, "-j", "REJECT", "-m", + "comment", "--comment", "teuthology"], omit_sudo=False) + + self._one_or_all((mds_rank_1, mds_rank_2), set_block, in_parallel=False) + + def clear_firewall(self): + clear_firewall(self._ctx) + + def get_mds_info(self, mds_id): + return FSStatus(self.mon_manager).get_mds(mds_id) + + def is_pool_full(self, pool_name): + pools = json.loads(self.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['pools'] + for pool in pools: + if pool['pool_name'] == pool_name: + return 'full' in pool['flags_names'].split(",") + + raise RuntimeError("Pool not found '{0}'".format(pool_name)) + + def delete_all_filesystems(self): + """ + Remove all filesystems that exist, and any pools in use by them. + """ + for fs in self.status().get_filesystems(): + Filesystem(ctx=self._ctx, fscid=fs['id']).destroy() + + @property + def beacon_timeout(self): + """ + Generate an acceptable timeout for the mons to drive some MDSMap change + because of missed beacons from some MDS. This involves looking up the + grace period in use by the mons and adding an acceptable buffer. + """ + + grace = float(self.get_config("mds_beacon_grace", service_type="mon")) + return grace*2+15 + + +class Filesystem(MDSCluster): + + """ + Generator for all Filesystems in the cluster. + """ + @classmethod + def get_all_fs(cls, ctx): + mdsc = MDSCluster(ctx) + status = mdsc.status() + for fs in status.get_filesystems(): + yield cls(ctx, fscid=fs['id']) + + """ + This object is for driving a CephFS filesystem. 
The MDS daemons driven by + MDSCluster may be shared with other Filesystems. + """ + def __init__(self, ctx, fs_config={}, fscid=None, name=None, create=False): + super(Filesystem, self).__init__(ctx) + + self.name = name + self.id = None + self.metadata_pool_name = None + self.data_pool_name = None + self.data_pools = None + self.fs_config = fs_config + self.ec_profile = fs_config.get('ec_profile') + + client_list = list(misc.all_roles_of_type(self._ctx.cluster, 'client')) + self.client_id = client_list[0] + self.client_remote = list(misc.get_clients(ctx=ctx, roles=["client.{0}".format(self.client_id)]))[0][1] + + if name is not None: + if fscid is not None: + raise RuntimeError("cannot specify fscid when creating fs") + if create and not self.legacy_configured(): + self.create() + else: + if fscid is not None: + self.id = fscid + self.getinfo(refresh = True) + + # Stash a reference to the first created filesystem on ctx, so + # that if someone drops to the interactive shell they can easily + # poke our methods. + if not hasattr(self._ctx, "filesystem"): + self._ctx.filesystem = self + + def dead(self): + try: + return not bool(self.get_mds_map()) + except FSMissing: + return True + + def get_task_status(self, status_key): + return self.mon_manager.get_service_task_status("mds", status_key) + + def getinfo(self, refresh = False): + status = self.status() + if self.id is not None: + fsmap = status.get_fsmap(self.id) + elif self.name is not None: + fsmap = status.get_fsmap_byname(self.name) + else: + fss = [fs for fs in status.get_filesystems()] + if len(fss) == 1: + fsmap = fss[0] + elif len(fss) == 0: + raise RuntimeError("no file system available") + else: + raise RuntimeError("more than one file system available") + self.id = fsmap['id'] + self.name = fsmap['mdsmap']['fs_name'] + self.get_pool_names(status = status, refresh = refresh) + return status + + def reach_max_mds(self): + status = self.wait_for_daemons() + mds_map = self.get_mds_map(status=status) + assert(mds_map['in'] == list(range(0, mds_map['max_mds']))) + + def reset(self): + self.mon_manager.raw_cluster_cmd("fs", "reset", str(self.name), '--yes-i-really-mean-it') + + def fail(self): + self.mon_manager.raw_cluster_cmd("fs", "fail", str(self.name)) + + def set_flag(self, var, *args): + a = map(lambda x: str(x).lower(), args) + self.mon_manager.raw_cluster_cmd("fs", "flag", "set", var, *a) + + def set_allow_multifs(self, yes=True): + self.set_flag("enable_multiple", yes) + + def set_var(self, var, *args): + a = map(lambda x: str(x).lower(), args) + self.mon_manager.raw_cluster_cmd("fs", "set", self.name, var, *a) + + def set_down(self, down=True): + self.set_var("down", str(down).lower()) + + def set_joinable(self, joinable=True): + self.set_var("joinable", joinable) + + def set_max_mds(self, max_mds): + self.set_var("max_mds", "%d" % max_mds) + + def set_session_timeout(self, timeout): + self.set_var("session_timeout", "%d" % timeout) + + def set_allow_standby_replay(self, yes): + self.set_var("allow_standby_replay", yes) + + def set_allow_new_snaps(self, yes): + self.set_var("allow_new_snaps", yes, '--yes-i-really-mean-it') + + def set_bal_rank_mask(self, bal_rank_mask): + self.set_var("bal_rank_mask", bal_rank_mask) + + def set_refuse_client_session(self, yes): + self.set_var("refuse_client_session", yes) + + def compat(self, *args): + a = map(lambda x: str(x).lower(), args) + self.mon_manager.raw_cluster_cmd("fs", "compat", self.name, *a) + + def add_compat(self, *args): + self.compat("add_compat", *args) + + def 
add_incompat(self, *args): + self.compat("add_incompat", *args) + + def rm_compat(self, *args): + self.compat("rm_compat", *args) + + def rm_incompat(self, *args): + self.compat("rm_incompat", *args) + + def required_client_features(self, *args, **kwargs): + c = ["fs", "required_client_features", self.name, *args] + return self.mon_manager.run_cluster_cmd(args=c, **kwargs) + + # Since v15.1.0 the pg autoscale mode has been enabled as default, + # will let the pg autoscale mode to calculate the pg_num as needed. + # We set the pg_num_min to 64 to make sure that pg autoscale mode + # won't set the pg_num to low to fix Tracker#45434. + pg_num = 64 + pg_num_min = 64 + target_size_ratio = 0.9 + target_size_ratio_ec = 0.9 + + def create(self, recover=False, metadata_overlay=False): + if self.name is None: + self.name = "cephfs" + if self.metadata_pool_name is None: + self.metadata_pool_name = "{0}_metadata".format(self.name) + if self.data_pool_name is None: + data_pool_name = "{0}_data".format(self.name) + else: + data_pool_name = self.data_pool_name + + # will use the ec pool to store the data and a small amount of + # metadata still goes to the primary data pool for all files. + if not metadata_overlay and self.ec_profile and 'disabled' not in self.ec_profile: + self.target_size_ratio = 0.05 + + log.debug("Creating filesystem '{0}'".format(self.name)) + + try: + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + self.metadata_pool_name, + '--pg_num_min', str(self.pg_num_min)) + + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + data_pool_name, str(self.pg_num), + '--pg_num_min', str(self.pg_num_min), + '--target_size_ratio', + str(self.target_size_ratio)) + except CommandFailedError as e: + if e.exitstatus == 22: # nautilus couldn't specify --pg_num_min option + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + self.metadata_pool_name, + str(self.pg_num_min)) + + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + data_pool_name, str(self.pg_num), + str(self.pg_num_min)) + else: + raise + + args = ["fs", "new", self.name, self.metadata_pool_name, data_pool_name] + if recover: + args.append('--recover') + if metadata_overlay: + args.append('--allow-dangerous-metadata-overlay') + self.mon_manager.raw_cluster_cmd(*args) + + if not recover: + if self.ec_profile and 'disabled' not in self.ec_profile: + ec_data_pool_name = data_pool_name + "_ec" + log.debug("EC profile is %s", self.ec_profile) + cmd = ['osd', 'erasure-code-profile', 'set', ec_data_pool_name] + cmd.extend(self.ec_profile) + self.mon_manager.raw_cluster_cmd(*cmd) + try: + self.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'create', ec_data_pool_name, + 'erasure', ec_data_pool_name, + '--pg_num_min', str(self.pg_num_min), + '--target_size_ratio', str(self.target_size_ratio_ec)) + except CommandFailedError as e: + if e.exitstatus == 22: # nautilus couldn't specify --pg_num_min option + self.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'create', ec_data_pool_name, + str(self.pg_num_min), 'erasure', ec_data_pool_name) + else: + raise + self.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'set', + ec_data_pool_name, 'allow_ec_overwrites', 'true') + self.add_data_pool(ec_data_pool_name, create=False) + self.check_pool_application(ec_data_pool_name) + + self.run_client_payload(f"setfattr -n ceph.dir.layout.pool -v {ec_data_pool_name} . 
&& getfattr -n ceph.dir.layout .") + + self.check_pool_application(self.metadata_pool_name) + self.check_pool_application(data_pool_name) + + # Turn off spurious standby count warnings from modifying max_mds in tests. + try: + self.mon_manager.raw_cluster_cmd('fs', 'set', self.name, 'standby_count_wanted', '0') + except CommandFailedError as e: + if e.exitstatus == 22: + # standby_count_wanted not available prior to luminous (upgrade tests would fail otherwise) + pass + else: + raise + + if self.fs_config is not None: + log.debug(f"fs_config: {self.fs_config}") + max_mds = self.fs_config.get('max_mds', 1) + if max_mds > 1: + self.set_max_mds(max_mds) + + standby_replay = self.fs_config.get('standby_replay', False) + self.set_allow_standby_replay(standby_replay) + + # If absent will use the default value (60 seconds) + session_timeout = self.fs_config.get('session_timeout', 60) + if session_timeout != 60: + self.set_session_timeout(session_timeout) + + if self.fs_config.get('subvols', None) is not None: + log.debug(f"Creating {self.fs_config.get('subvols')} subvols " + f"for filesystem '{self.name}'") + if not hasattr(self._ctx, "created_subvols"): + self._ctx.created_subvols = dict() + + subvols = self.fs_config.get('subvols') + assert(isinstance(subvols, dict)) + assert(isinstance(subvols['create'], int)) + assert(subvols['create'] > 0) + + for sv in range(0, subvols['create']): + sv_name = f'sv_{sv}' + self.mon_manager.raw_cluster_cmd( + 'fs', 'subvolume', 'create', self.name, sv_name, + self.fs_config.get('subvol_options', '')) + + if self.name not in self._ctx.created_subvols: + self._ctx.created_subvols[self.name] = [] + + subvol_path = self.mon_manager.raw_cluster_cmd( + 'fs', 'subvolume', 'getpath', self.name, sv_name) + subvol_path = subvol_path.strip() + self._ctx.created_subvols[self.name].append(subvol_path) + else: + log.debug(f"Not Creating any subvols for filesystem '{self.name}'") + + + self.getinfo(refresh = True) + + # wait pgs to be clean + self.mon_manager.wait_for_clean() + + def run_client_payload(self, cmd): + # avoid circular dep by importing here: + from tasks.cephfs.fuse_mount import FuseMount + + # Wait for at MDS daemons to be ready before mounting the + # ceph-fuse client in run_client_payload() + self.wait_for_daemons() + + d = misc.get_testdir(self._ctx) + m = FuseMount(self._ctx, d, "admin", self.client_remote, cephfs_name=self.name) + m.mount_wait() + m.run_shell_payload(cmd) + m.umount_wait(require_clean=True) + + def _remove_pool(self, name, **kwargs): + c = f'osd pool rm {name} {name} --yes-i-really-really-mean-it' + return self.mon_manager.ceph(c, **kwargs) + + def rm(self, **kwargs): + c = f'fs rm {self.name} --yes-i-really-mean-it' + return self.mon_manager.ceph(c, **kwargs) + + def remove_pools(self, data_pools): + self._remove_pool(self.get_metadata_pool_name()) + for poolname in data_pools: + try: + self._remove_pool(poolname) + except CommandFailedError as e: + # EBUSY, this data pool is used by two metadata pools, let the + # 2nd pass delete it + if e.exitstatus == EBUSY: + pass + else: + raise + + def destroy(self, reset_obj_attrs=True): + log.info(f'Destroying file system {self.name} and related pools') + + if self.dead(): + log.debug('already dead...') + return + + data_pools = self.get_data_pool_names(refresh=True) + + # make sure no MDSs are attached to given FS. 
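Several places in create() and add_data_pool() above use the same compatibility pattern: try the newer command form first and, if the monitor rejects it with EINVAL (exit status 22, e.g. a release without --pg_num_min), retry with the older argument form. A self-contained sketch of that pattern, with run_cmd and the exception class as stand-ins for mon_manager.raw_cluster_cmd and teuthology's CommandFailedError:

```python
class CommandFailedError(Exception):
    """Stand-in for teuthology.exceptions.CommandFailedError."""
    def __init__(self, exitstatus):
        self.exitstatus = exitstatus

def create_pool(run_cmd, name, pg_num_min=64):
    try:
        run_cmd('osd', 'pool', 'create', name, '--pg_num_min', str(pg_num_min))
    except CommandFailedError as e:
        if e.exitstatus == 22:  # EINVAL: release too old for --pg_num_min
            run_cmd('osd', 'pool', 'create', name, str(pg_num_min))
        else:
            raise

calls = []
def fake_run_cmd(*args):
    # Simulate an old release that rejects the new flag.
    if '--pg_num_min' in args:
        raise CommandFailedError(22)
    calls.append(args)

create_pool(fake_run_cmd, 'cephfs_metadata')
assert calls == [('osd', 'pool', 'create', 'cephfs_metadata', '64')]
```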
+ self.fail() + self.rm() + + self.remove_pools(data_pools) + + if reset_obj_attrs: + self.id = None + self.name = None + self.metadata_pool_name = None + self.data_pool_name = None + self.data_pools = None + + def recreate(self): + self.destroy() + + self.create() + self.getinfo(refresh=True) + + def check_pool_application(self, pool_name): + osd_map = self.mon_manager.get_osd_dump_json() + for pool in osd_map['pools']: + if pool['pool_name'] == pool_name: + if "application_metadata" in pool: + if not "cephfs" in pool['application_metadata']: + raise RuntimeError("Pool {pool_name} does not name cephfs as application!".\ + format(pool_name=pool_name)) + + def __del__(self): + if getattr(self._ctx, "filesystem", None) == self: + delattr(self._ctx, "filesystem") + + def exists(self): + """ + Whether a filesystem exists in the mon's filesystem list + """ + fs_list = json.loads(self.mon_manager.raw_cluster_cmd('fs', 'ls', '--format=json-pretty')) + return self.name in [fs['name'] for fs in fs_list] + + def legacy_configured(self): + """ + Check if a legacy (i.e. pre "fs new") filesystem configuration is present. If this is + the case, the caller should avoid using Filesystem.create + """ + try: + out_text = self.mon_manager.raw_cluster_cmd('--format=json-pretty', 'osd', 'lspools') + pools = json.loads(out_text) + metadata_pool_exists = 'metadata' in [p['poolname'] for p in pools] + if metadata_pool_exists: + self.metadata_pool_name = 'metadata' + except CommandFailedError as e: + # For use in upgrade tests, Ceph cuttlefish and earlier don't support + # structured output (--format) from the CLI. + if e.exitstatus == 22: + metadata_pool_exists = True + else: + raise + + return metadata_pool_exists + + def _df(self): + return json.loads(self.mon_manager.raw_cluster_cmd("df", "--format=json-pretty")) + + # may raise FSMissing + def get_mds_map(self, status=None): + if status is None: + status = self.status() + return status.get_fsmap(self.id)['mdsmap'] + + def get_var(self, var, status=None): + return self.get_mds_map(status=status)[var] + + def set_dir_layout(self, mount, path, layout): + for name, value in layout.items(): + mount.run_shell(args=["setfattr", "-n", "ceph.dir.layout."+name, "-v", str(value), path]) + + def add_data_pool(self, name, create=True): + if create: + try: + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, + '--pg_num_min', str(self.pg_num_min)) + except CommandFailedError as e: + if e.exitstatus == 22: # nautilus couldn't specify --pg_num_min option + self.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', name, + str(self.pg_num_min)) + else: + raise + self.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', self.name, name) + self.get_pool_names(refresh = True) + for poolid, fs_name in self.data_pools.items(): + if name == fs_name: + return poolid + raise RuntimeError("could not get just created pool '{0}'".format(name)) + + def get_pool_names(self, refresh = False, status = None): + if refresh or self.metadata_pool_name is None or self.data_pools is None: + if status is None: + status = self.status() + fsmap = status.get_fsmap(self.id) + + osd_map = self.mon_manager.get_osd_dump_json() + id_to_name = {} + for p in osd_map['pools']: + id_to_name[p['pool']] = p['pool_name'] + + self.metadata_pool_name = id_to_name[fsmap['mdsmap']['metadata_pool']] + self.data_pools = {} + for data_pool in fsmap['mdsmap']['data_pools']: + self.data_pools[data_pool] = id_to_name[data_pool] + + def get_data_pool_name(self, refresh = False): + if refresh or self.data_pools is 
None: + self.get_pool_names(refresh = True) + assert(len(self.data_pools) == 1) + return next(iter(self.data_pools.values())) + + def get_data_pool_id(self, refresh = False): + """ + Don't call this if you have multiple data pools + :return: integer + """ + if refresh or self.data_pools is None: + self.get_pool_names(refresh = True) + assert(len(self.data_pools) == 1) + return next(iter(self.data_pools.keys())) + + def get_data_pool_names(self, refresh = False): + if refresh or self.data_pools is None: + self.get_pool_names(refresh = True) + return list(self.data_pools.values()) + + def get_metadata_pool_name(self): + return self.metadata_pool_name + + def set_data_pool_name(self, name): + if self.id is not None: + raise RuntimeError("can't set filesystem name if its fscid is set") + self.data_pool_name = name + + def get_pool_pg_num(self, pool_name): + pgs = json.loads(self.mon_manager.raw_cluster_cmd('osd', 'pool', 'get', + pool_name, 'pg_num', + '--format=json-pretty')) + return int(pgs['pg_num']) + + def get_namespace_id(self): + return self.id + + def get_pool_df(self, pool_name): + """ + Return a dict like: + {u'bytes_used': 0, u'max_avail': 83848701, u'objects': 0, u'kb_used': 0} + """ + for pool_df in self._df()['pools']: + if pool_df['name'] == pool_name: + return pool_df['stats'] + + raise RuntimeError("Pool name '{0}' not found".format(pool_name)) + + def get_usage(self): + return self._df()['stats']['total_used_bytes'] + + def are_daemons_healthy(self, status=None, skip_max_mds_check=False): + """ + Return true if all daemons are in one of active, standby, standby-replay, and + at least max_mds daemons are in 'active'. + + Unlike most of Filesystem, this function is tolerant of new-style `fs` + commands being missing, because we are part of the ceph installation + process during upgrade suites, so must fall back to old style commands + when we get an EINVAL on a new style command. 
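A simplified, standalone sketch of the core check in are_daemons_healthy(): every daemon must be in an acceptable state and the number of up:active daemons must match max_mds (the real method additionally confirms each active daemon's own view via an asok/tell "status" call). The mds_map fragment is illustrative:

```python
def daemons_healthy(mds_map):
    ok_states = {'up:active', 'up:standby', 'up:standby-replay'}
    states = [i['state'] for i in mds_map['info'].values()]
    if any(s not in ok_states for s in states):
        return False
    return states.count('up:active') == mds_map['max_mds']

mds_map = {'max_mds': 1,
           'info': {'gid_1': {'state': 'up:active'},
                    'gid_2': {'state': 'up:standby'}}}
assert daemons_healthy(mds_map)
```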
+ + :return: + """ + # First, check to see that processes haven't exited with an error code + for mds in self._ctx.daemons.iter_daemons_of_role('mds'): + mds.check_status() + + active_count = 0 + mds_map = self.get_mds_map(status=status) + + log.debug("are_daemons_healthy: mds map: {0}".format(mds_map)) + + for mds_id, mds_status in mds_map['info'].items(): + if mds_status['state'] not in ["up:active", "up:standby", "up:standby-replay"]: + log.warning("Unhealthy mds state {0}:{1}".format(mds_id, mds_status['state'])) + return False + elif mds_status['state'] == 'up:active': + active_count += 1 + + log.debug("are_daemons_healthy: {0}/{1}".format( + active_count, mds_map['max_mds'] + )) + + if not skip_max_mds_check: + if active_count > mds_map['max_mds']: + log.debug("are_daemons_healthy: number of actives is greater than max_mds: {0}".format(mds_map)) + return False + elif active_count == mds_map['max_mds']: + # The MDSMap says these guys are active, but let's check they really are + for mds_id, mds_status in mds_map['info'].items(): + if mds_status['state'] == 'up:active': + try: + daemon_status = self.mds_tell(["status"], mds_id=mds_status['name']) + except CommandFailedError as cfe: + if cfe.exitstatus == errno.EINVAL: + # Old version, can't do this check + continue + else: + # MDS not even running + return False + + if daemon_status['state'] != 'up:active': + # MDS hasn't taken the latest map yet + return False + + return True + else: + return False + else: + log.debug("are_daemons_healthy: skipping max_mds check") + return True + + def get_daemon_names(self, state=None, status=None): + """ + Return MDS daemon names of those daemons in the given state + :param state: + :return: + """ + mdsmap = self.get_mds_map(status) + result = [] + for mds_status in sorted(mdsmap['info'].values(), + key=lambda _: _['rank']): + if mds_status['state'] == state or state is None: + result.append(mds_status['name']) + + return result + + def get_active_names(self, status=None): + """ + Return MDS daemon names of those daemons holding ranks + in state up:active + + :return: list of strings like ['a', 'b'], sorted by rank + """ + return self.get_daemon_names("up:active", status=status) + + def get_all_mds_rank(self, status=None): + mdsmap = self.get_mds_map(status) + result = [] + for mds_status in sorted(mdsmap['info'].values(), + key=lambda _: _['rank']): + if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay': + result.append(mds_status['rank']) + + return result + + def get_rank(self, rank=None, status=None): + if status is None: + status = self.getinfo() + if rank is None: + rank = 0 + return status.get_rank(self.id, rank) + + def rank_restart(self, rank=0, status=None): + name = self.get_rank(rank=rank, status=status)['name'] + self.mds_restart(mds_id=name) + + def rank_signal(self, signal, rank=0, status=None): + name = self.get_rank(rank=rank, status=status)['name'] + self.mds_signal(name, signal) + + def rank_freeze(self, yes, rank=0): + self.mon_manager.raw_cluster_cmd("mds", "freeze", "{}:{}".format(self.id, rank), str(yes).lower()) + + def rank_repaired(self, rank): + self.mon_manager.raw_cluster_cmd("mds", "repaired", "{}:{}".format(self.id, rank)) + + def rank_fail(self, rank=0): + self.mon_manager.raw_cluster_cmd("mds", "fail", "{}:{}".format(self.id, rank)) + + def rank_is_running(self, rank=0, status=None): + name = self.get_rank(rank=rank, status=status)['name'] + return self.mds_is_running(name) + + def get_ranks(self, status=None): + if status is None: + status = 
self.getinfo() + return status.get_ranks(self.id) + + def get_damaged(self, status=None): + if status is None: + status = self.getinfo() + return status.get_damaged(self.id) + + def get_replays(self, status=None): + if status is None: + status = self.getinfo() + return status.get_replays(self.id) + + def get_replay(self, rank=0, status=None): + for replay in self.get_replays(status=status): + if replay['rank'] == rank: + return replay + return None + + def get_rank_names(self, status=None): + """ + Return MDS daemon names of those daemons holding a rank, + sorted by rank. This includes e.g. up:replay/reconnect + as well as active, but does not include standby or + standby-replay. + """ + mdsmap = self.get_mds_map(status) + result = [] + for mds_status in sorted(mdsmap['info'].values(), + key=lambda _: _['rank']): + if mds_status['rank'] != -1 and mds_status['state'] != 'up:standby-replay': + result.append(mds_status['name']) + + return result + + def wait_for_daemons(self, timeout=None, skip_max_mds_check=False, status=None): + """ + Wait until all daemons are healthy + :return: + """ + + if timeout is None: + timeout = DAEMON_WAIT_TIMEOUT + + if self.id is None: + status = self.getinfo(refresh=True) + + if status is None: + status = self.status() + + elapsed = 0 + while True: + if self.are_daemons_healthy(status=status, skip_max_mds_check=skip_max_mds_check): + return status + else: + time.sleep(1) + elapsed += 1 + + if elapsed > timeout: + log.debug("status = {0}".format(status)) + raise RuntimeError("Timed out waiting for MDS daemons to become healthy") + + status = self.status() + + def dencoder(self, obj_type, obj_blob): + args = [os.path.join(self._prefix, "ceph-dencoder"), 'type', obj_type, 'import', '-', 'decode', 'dump_json'] + p = self.mon_manager.controller.run(args=args, stdin=BytesIO(obj_blob), stdout=BytesIO()) + return p.stdout.getvalue() + + def rados(self, *args, **kwargs): + """ + Callout to rados CLI. + """ + + return self.mon_manager.do_rados(*args, **kwargs) + + def radosm(self, *args, **kwargs): + """ + Interact with the metadata pool via rados CLI. + """ + + return self.rados(*args, **kwargs, pool=self.get_metadata_pool_name()) + + def radosmo(self, *args, stdout=BytesIO(), **kwargs): + """ + Interact with the metadata pool via rados CLI. Get the stdout. + """ + + return self.radosm(*args, **kwargs, stdout=stdout).stdout.getvalue() + + def get_metadata_object(self, object_type, object_id): + """ + Retrieve an object from the metadata pool, pass it through + ceph-dencoder to dump it to JSON, and return the decoded object. + """ + + o = self.radosmo(['get', object_id, '-']) + j = self.dencoder(object_type, o) + try: + return json.loads(j) + except (TypeError, ValueError): + log.error("Failed to decode JSON: '{0}'".format(j)) + raise + + def get_journal_version(self): + """ + Read the JournalPointer and Journal::Header objects to learn the version of + encoding in use. 
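A hedged sketch of a failover round trip built from rank_fail() and wait_for_daemons() above, assuming a Filesystem instance `fs` with at least one standby MDS available:

    def fail_rank_and_wait(fs, rank=0):
        # Ask the mons to fail the rank; a standby should take over.
        fs.rank_fail(rank=rank)
        # Block until all daemons are healthy again (raises RuntimeError on
        # timeout), then return the name of whichever daemon now holds the rank.
        status = fs.wait_for_daemons(timeout=180)
        return status.get_rank(fs.id, rank)['name']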
+ """ + journal_pointer_object = '400.00000000' + journal_pointer_dump = self.get_metadata_object("JournalPointer", journal_pointer_object) + journal_ino = journal_pointer_dump['journal_pointer']['front'] + + journal_header_object = "{0:x}.00000000".format(journal_ino) + journal_header_dump = self.get_metadata_object('Journaler::Header', journal_header_object) + + version = journal_header_dump['journal_header']['stream_format'] + log.debug("Read journal version {0}".format(version)) + + return version + + def mds_asok(self, command, mds_id=None, timeout=None): + if mds_id is None: + return self.rank_asok(command, timeout=timeout) + + return self.json_asok(command, 'mds', mds_id, timeout=timeout) + + def mds_tell(self, command, mds_id=None): + if mds_id is None: + return self.rank_tell(command) + + return json.loads(self.mon_manager.raw_cluster_cmd("tell", f"mds.{mds_id}", *command)) + + def rank_asok(self, command, rank=0, status=None, timeout=None): + info = self.get_rank(rank=rank, status=status) + return self.json_asok(command, 'mds', info['name'], timeout=timeout) + + def rank_tell(self, command, rank=0, status=None): + try: + out = self.mon_manager.raw_cluster_cmd("tell", f"mds.{self.id}:{rank}", *command) + return json.loads(out) + except json.decoder.JSONDecodeError: + log.error("could not decode: {}".format(out)) + raise + + def ranks_tell(self, command, status=None): + if status is None: + status = self.status() + out = [] + for r in status.get_ranks(self.id): + result = self.rank_tell(command, rank=r['rank'], status=status) + out.append((r['rank'], result)) + return sorted(out) + + def ranks_perf(self, f, status=None): + perf = self.ranks_tell(["perf", "dump"], status=status) + out = [] + for rank, perf in perf: + out.append((rank, f(perf))) + return out + + def read_cache(self, path, depth=None, rank=None): + cmd = ["dump", "tree", path] + if depth is not None: + cmd.append(depth.__str__()) + result = self.rank_asok(cmd, rank=rank) + if result is None or len(result) == 0: + raise RuntimeError("Path not found in cache: {0}".format(path)) + + return result + + def wait_for_state(self, goal_state, reject=None, timeout=None, mds_id=None, rank=None): + """ + Block until the MDS reaches a particular state, or a failure condition + is met. + + When there are multiple MDSs, succeed when exaclty one MDS is in the + goal state, or fail when any MDS is in the reject state. 
+ + :param goal_state: Return once the MDS is in this state + :param reject: Fail if the MDS enters this state before the goal state + :param timeout: Fail if this many seconds pass before reaching goal + :return: number of seconds waited, rounded down to integer + """ + + started_at = time.time() + while True: + status = self.status() + if rank is not None: + try: + mds_info = status.get_rank(self.id, rank) + current_state = mds_info['state'] if mds_info else None + log.debug("Looked up MDS state for mds.{0}: {1}".format(rank, current_state)) + except: + mdsmap = self.get_mds_map(status=status) + if rank in mdsmap['failed']: + log.debug("Waiting for rank {0} to come back.".format(rank)) + current_state = None + else: + raise + elif mds_id is not None: + # mds_info is None if no daemon with this ID exists in the map + mds_info = status.get_mds(mds_id) + current_state = mds_info['state'] if mds_info else None + log.debug("Looked up MDS state for {0}: {1}".format(mds_id, current_state)) + else: + # In general, look for a single MDS + states = [m['state'] for m in status.get_ranks(self.id)] + if [s for s in states if s == goal_state] == [goal_state]: + current_state = goal_state + elif reject in states: + current_state = reject + else: + current_state = None + log.debug("mapped states {0} to {1}".format(states, current_state)) + + elapsed = time.time() - started_at + if current_state == goal_state: + log.debug("reached state '{0}' in {1}s".format(current_state, elapsed)) + return elapsed + elif reject is not None and current_state == reject: + raise RuntimeError("MDS in reject state {0}".format(current_state)) + elif timeout is not None and elapsed > timeout: + log.error("MDS status at timeout: {0}".format(status.get_fsmap(self.id))) + raise RuntimeError( + "Reached timeout after {0} seconds waiting for state {1}, while in state {2}".format( + elapsed, goal_state, current_state + )) + else: + time.sleep(1) + + def _read_data_xattr(self, ino_no, xattr_name, obj_type, pool): + if pool is None: + pool = self.get_data_pool_name() + + obj_name = "{0:x}.00000000".format(ino_no) + + args = ["getxattr", obj_name, xattr_name] + try: + proc = self.rados(args, pool=pool, stdout=BytesIO()) + except CommandFailedError as e: + log.error(e.__str__()) + raise ObjectNotFound(obj_name) + + obj_blob = proc.stdout.getvalue() + return json.loads(self.dencoder(obj_type, obj_blob).strip()) + + def _write_data_xattr(self, ino_no, xattr_name, data, pool=None): + """ + Write to an xattr of the 0th data object of an inode. Will + succeed whether the object and/or xattr already exist or not. 
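A short usage sketch for wait_for_state(), assuming a Filesystem instance `fs` and a hypothetical daemon name 'a' currently holding rank 0:

    fs.mds_restart(mds_id='a')
    # Block until rank 0 is active again; wait_for_state() raises
    # RuntimeError if the timeout expires first, and otherwise returns
    # the number of seconds waited.
    elapsed = fs.wait_for_state('up:active', rank=0, timeout=120)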
+ + :param ino_no: integer inode number + :param xattr_name: string name of the xattr + :param data: byte array data to write to the xattr + :param pool: name of data pool or None to use primary data pool + :return: None + """ + if pool is None: + pool = self.get_data_pool_name() + + obj_name = "{0:x}.00000000".format(ino_no) + args = ["setxattr", obj_name, xattr_name, data] + self.rados(args, pool=pool) + + def read_symlink(self, ino_no, pool=None): + return self._read_data_xattr(ino_no, "symlink", "string_wrapper", pool) + + def read_backtrace(self, ino_no, pool=None): + """ + Read the backtrace from the data pool, return a dict in the format + given by inode_backtrace_t::dump, which is something like: + + :: + + rados -p cephfs_data getxattr 10000000002.00000000 parent > out.bin + ceph-dencoder type inode_backtrace_t import out.bin decode dump_json + + { "ino": 1099511627778, + "ancestors": [ + { "dirino": 1, + "dname": "blah", + "version": 11}], + "pool": 1, + "old_pools": []} + + :param pool: name of pool to read backtrace from. If omitted, FS must have only + one data pool and that will be used. + """ + return self._read_data_xattr(ino_no, "parent", "inode_backtrace_t", pool) + + def read_layout(self, ino_no, pool=None): + """ + Read 'layout' xattr of an inode and parse the result, returning a dict like: + :: + { + "stripe_unit": 4194304, + "stripe_count": 1, + "object_size": 4194304, + "pool_id": 1, + "pool_ns": "", + } + + :param pool: name of pool to read backtrace from. If omitted, FS must have only + one data pool and that will be used. + """ + return self._read_data_xattr(ino_no, "layout", "file_layout_t", pool) + + def _enumerate_data_objects(self, ino, size): + """ + Get the list of expected data objects for a range, and the list of objects + that really exist. 
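A sketch of checking a backtrace against the layout documented in read_backtrace() above, assuming a Filesystem instance `fs` and an inode number `ino` obtained by the test; the directory name checked is hypothetical:

    backtrace = fs.read_backtrace(ino)
    # ancestors[0] is the immediate parent link, per inode_backtrace_t::dump.
    assert backtrace['ancestors'][0]['dname'] == 'testdir'
    assert backtrace['pool'] == fs.get_data_pool_id()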
+ + :return a tuple of two lists of strings (expected, actual) + """ + stripe_size = 1024 * 1024 * 4 + + size = max(stripe_size, size) + + want_objects = [ + "{0:x}.{1:08x}".format(ino, n) + for n in range(0, ((size - 1) // stripe_size) + 1) + ] + + exist_objects = self.rados(["ls"], pool=self.get_data_pool_name(), stdout=StringIO()).stdout.getvalue().split("\n") + + return want_objects, exist_objects + + def data_objects_present(self, ino, size): + """ + Check that *all* the expected data objects for an inode are present in the data pool + """ + + want_objects, exist_objects = self._enumerate_data_objects(ino, size) + missing = set(want_objects) - set(exist_objects) + + if missing: + log.debug("Objects missing (ino {0}, size {1}): {2}".format( + ino, size, missing + )) + return False + else: + log.debug("All objects for ino {0} size {1} found".format(ino, size)) + return True + + def data_objects_absent(self, ino, size): + want_objects, exist_objects = self._enumerate_data_objects(ino, size) + present = set(want_objects) & set(exist_objects) + + if present: + log.debug("Objects not absent (ino {0}, size {1}): {2}".format( + ino, size, present + )) + return False + else: + log.debug("All objects for ino {0} size {1} are absent".format(ino, size)) + return True + + def dirfrag_exists(self, ino, frag): + try: + self.radosm(["stat", "{0:x}.{1:08x}".format(ino, frag)]) + except CommandFailedError: + return False + else: + return True + + def list_dirfrag(self, dir_ino): + """ + Read the named object and return the list of omap keys + + :return a list of 0 or more strings + """ + + dirfrag_obj_name = "{0:x}.00000000".format(dir_ino) + + try: + key_list_str = self.radosmo(["listomapkeys", dirfrag_obj_name], stdout=StringIO()) + except CommandFailedError as e: + log.error(e.__str__()) + raise ObjectNotFound(dirfrag_obj_name) + + return key_list_str.strip().split("\n") if key_list_str else [] + + def get_meta_of_fs_file(self, dir_ino, obj_name, out): + """ + get metadata from parent to verify the correctness of the data format encoded by the tool, cephfs-meta-injection. + warning : The splitting of directory is not considered here. + """ + + dirfrag_obj_name = "{0:x}.00000000".format(dir_ino) + try: + self.radosm(["getomapval", dirfrag_obj_name, obj_name+"_head", out]) + except CommandFailedError as e: + log.error(e.__str__()) + raise ObjectNotFound(dir_ino) + + def erase_metadata_objects(self, prefix): + """ + For all objects in the metadata pool matching the prefix, + erase them. + + This O(N) with the number of objects in the pool, so only suitable + for use on toy test filesystems. + """ + all_objects = self.radosmo(["ls"], stdout=StringIO()).strip().split("\n") + matching_objects = [o for o in all_objects if o.startswith(prefix)] + for o in matching_objects: + self.radosm(["rm", o]) + + def erase_mds_objects(self, rank): + """ + Erase all the per-MDS objects for a particular rank. This includes + inotable, sessiontable, journal + """ + + def obj_prefix(multiplier): + """ + MDS object naming conventions like rank 1's + journal is at 201.*** + """ + return "%x." 
% (multiplier * 0x100 + rank) + + # MDS_INO_LOG_OFFSET + self.erase_metadata_objects(obj_prefix(2)) + # MDS_INO_LOG_BACKUP_OFFSET + self.erase_metadata_objects(obj_prefix(3)) + # MDS_INO_LOG_POINTER_OFFSET + self.erase_metadata_objects(obj_prefix(4)) + # MDSTables & SessionMap + self.erase_metadata_objects("mds{rank:d}_".format(rank=rank)) + + @property + def _prefix(self): + """ + Override this to set a different + """ + return "" + + def _make_rank(self, rank): + return "{}:{}".format(self.name, rank) + + def _run_tool(self, tool, args, rank=None, quiet=False): + # Tests frequently have [client] configuration that jacks up + # the objecter log level (unlikely to be interesting here) + # and does not set the mds log level (very interesting here) + if quiet: + base_args = [os.path.join(self._prefix, tool), '--debug-mds=1', '--debug-objecter=1'] + else: + base_args = [os.path.join(self._prefix, tool), '--debug-mds=20', '--debug-ms=1', '--debug-objecter=1'] + + if rank is not None: + base_args.extend(["--rank", "%s" % str(rank)]) + + t1 = datetime.datetime.now() + r = self.tool_remote.sh(script=base_args + args, stdout=StringIO()).strip() + duration = datetime.datetime.now() - t1 + log.debug("Ran {0} in time {1}, result:\n{2}".format( + base_args + args, duration, r + )) + return r + + @property + def tool_remote(self): + """ + An arbitrary remote to use when invoking recovery tools. Use an MDS host because + it'll definitely have keys with perms to access cephfs metadata pool. This is public + so that tests can use this remote to go get locally written output files from the tools. + """ + return self.mon_manager.controller + + def journal_tool(self, args, rank, quiet=False): + """ + Invoke cephfs-journal-tool with the passed arguments for a rank, and return its stdout + """ + fs_rank = self._make_rank(rank) + return self._run_tool("cephfs-journal-tool", args, fs_rank, quiet) + + def meta_tool(self, args, rank, quiet=False): + """ + Invoke cephfs-meta-injection with the passed arguments for a rank, and return its stdout + """ + fs_rank = self._make_rank(rank) + return self._run_tool("cephfs-meta-injection", args, fs_rank, quiet) + + def table_tool(self, args, quiet=False): + """ + Invoke cephfs-table-tool with the passed arguments, and return its stdout + """ + return self._run_tool("cephfs-table-tool", args, None, quiet) + + def data_scan(self, args, quiet=False, worker_count=1): + """ + Invoke cephfs-data-scan with the passed arguments, and return its stdout + + :param worker_count: if greater than 1, multiple workers will be run + in parallel and the return value will be None + """ + + workers = [] + + for n in range(0, worker_count): + if worker_count > 1: + # data-scan args first token is a command, followed by args to it. + # insert worker arguments after the command. + cmd = args[0] + worker_args = [cmd] + ["--worker_n", n.__str__(), "--worker_m", worker_count.__str__()] + args[1:] + else: + worker_args = args + + workers.append(Greenlet.spawn(lambda wargs=worker_args: + self._run_tool("cephfs-data-scan", wargs, None, quiet))) + + for w in workers: + w.get() + + if worker_count == 1: + return workers[0].value + else: + return None + + def is_full(self): + return self.is_pool_full(self.get_data_pool_name()) + + def authorize(self, client_id, caps=('/', 'rw')): + """ + Run "ceph fs authorize" and run "ceph auth get" to get and returnt the + keyring. + + client_id: client id that will be authorized + caps: tuple containing the path and permission (can be r or rw) + respectively. 
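A hedged sketch of driving the recovery tools through the wrappers above, assuming a Filesystem instance `fs` whose ranks have been stopped so the journal and tables are quiescent; the subcommands shown ('journal inspect', 'all show session', 'scan_extents') follow the standard CLI syntax of these tools rather than anything defined in this file:

    print(fs.journal_tool(["journal", "inspect"], rank=0))
    print(fs.table_tool(["all", "show", "session"]))
    # With worker_count > 1 the scan is fanned out across greenlets and the
    # return value is None; with a single worker the tool's stdout is returned.
    fs.data_scan(["scan_extents", fs.get_data_pool_name()], worker_count=4)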
+ """ + if isinstance(caps[0], (tuple, list)): + x = [] + for c in caps: + x.extend(c) + caps = tuple(x) + + client_name = 'client.' + client_id + return self.mon_manager.raw_cluster_cmd('fs', 'authorize', self.name, + client_name, *caps) + + def grow(self, new_max_mds, status=None): + oldmax = self.get_var('max_mds', status=status) + assert(new_max_mds > oldmax) + self.set_max_mds(new_max_mds) + return self.wait_for_daemons() + + def shrink(self, new_max_mds, status=None): + oldmax = self.get_var('max_mds', status=status) + assert(new_max_mds < oldmax) + self.set_max_mds(new_max_mds) + return self.wait_for_daemons() + + def run_scrub(self, cmd, rank=0): + return self.rank_tell(["scrub"] + cmd, rank) + + def get_scrub_status(self, rank=0): + return self.run_scrub(["status"], rank) + + def flush(self, rank=0): + return self.rank_tell(["flush", "journal"], rank=rank) + + def wait_until_scrub_complete(self, result=None, tag=None, rank=0, sleep=30, + timeout=300, reverse=False): + # time out after "timeout" seconds and assume as done + if result is None: + result = "no active scrubs running" + with contextutil.safe_while(sleep=sleep, tries=timeout//sleep) as proceed: + while proceed(): + out_json = self.rank_tell(["scrub", "status"], rank=rank) + assert out_json is not None + if not reverse: + if result in out_json['status']: + log.info("all active scrubs completed") + return True + else: + if result not in out_json['status']: + log.info("all active scrubs completed") + return True + + if tag is not None: + status = out_json['scrubs'][tag] + if status is not None: + log.info(f"scrub status for tag:{tag} - {status}") + else: + log.info(f"scrub has completed for tag:{tag}") + return True + + # timed out waiting for scrub to complete + return False + + def get_damage(self, rank=None): + if rank is None: + result = {} + for info in self.get_ranks(): + rank = info['rank'] + result[rank] = self.get_damage(rank=rank) + return result + else: + return self.rank_tell(['damage', 'ls'], rank=rank) diff --git a/qa/tasks/cephfs/fuse_mount.py b/qa/tasks/cephfs/fuse_mount.py new file mode 100644 index 000000000..0b9b17403 --- /dev/null +++ b/qa/tasks/cephfs/fuse_mount.py @@ -0,0 +1,533 @@ +import json +import time +import logging + +from io import StringIO +from textwrap import dedent + +from teuthology.contextutil import MaxWhileTries +from teuthology.contextutil import safe_while +from teuthology.orchestra import run +from teuthology.exceptions import CommandFailedError +from tasks.ceph_manager import get_valgrind_args +from tasks.cephfs.mount import CephFSMount, UMOUNT_TIMEOUT + +log = logging.getLogger(__name__) + +# Refer mount.py for docstrings. 
+class FuseMount(CephFSMount): + def __init__(self, ctx, test_dir, client_id, client_remote, + client_keyring_path=None, cephfs_name=None, + cephfs_mntpt=None, hostfs_mntpt=None, brxnet=None, + client_config={}): + super(FuseMount, self).__init__(ctx=ctx, test_dir=test_dir, + client_id=client_id, client_remote=client_remote, + client_keyring_path=client_keyring_path, hostfs_mntpt=hostfs_mntpt, + cephfs_name=cephfs_name, cephfs_mntpt=cephfs_mntpt, brxnet=brxnet, + client_config=client_config) + + self.fuse_daemon = None + self._fuse_conn = None + self.id = None + self.inst = None + self.addr = None + self.mount_timeout = int(self.client_config.get('mount_timeout', 30)) + + self._mount_bin = [ + 'ceph-fuse', "-f", + "--admin-socket", "/var/run/ceph/$cluster-$name.$pid.asok"] + self._mount_cmd_cwd = self.test_dir + if self.client_config.get('valgrind') is not None: + self.cwd = None # get_valgrind_args chdir for us + self._mount_cmd_logger = log.getChild('ceph-fuse.{id}'.format(id=self.client_id)) + self._mount_cmd_stdin = run.PIPE + + def mount(self, mntopts=None, check_status=True, mntargs=None, **kwargs): + self.update_attrs(**kwargs) + self.assert_and_log_minimum_mount_details() + + self.setup_netns() + + try: + return self._mount(mntopts, mntargs, check_status) + except RuntimeError: + # Catch exceptions by the mount() logic (i.e. not remote command + # failures) and ensure the mount is not left half-up. + # Otherwise we might leave a zombie mount point that causes + # anyone traversing cephtest/ to get hung up on. + log.warning("Trying to clean up after failed mount") + self.umount_wait(force=True) + raise + + def _mount(self, mntopts, mntargs, check_status): + log.info("Client client.%s config is %s" % (self.client_id, + self.client_config)) + + self._create_mntpt() + + retval = self._run_mount_cmd(mntopts, mntargs, check_status) + if retval: + return retval + + self.gather_mount_info() + + def _run_mount_cmd(self, mntopts, mntargs, check_status): + mount_cmd = self._get_mount_cmd(mntopts, mntargs) + mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO() + + # Before starting ceph-fuse process, note the contents of + # /sys/fs/fuse/connections + pre_mount_conns = self._list_fuse_conns() + log.info("Pre-mount connections: {0}".format(pre_mount_conns)) + + self.fuse_daemon = self.client_remote.run( + args=mount_cmd, + cwd=self._mount_cmd_cwd, + logger=self._mount_cmd_logger, + stdin=self._mount_cmd_stdin, + stdout=mountcmd_stdout, + stderr=mountcmd_stderr, + wait=False + ) + + return self._wait_and_record_our_fuse_conn( + check_status, pre_mount_conns, mountcmd_stdout, mountcmd_stderr) + + def _get_mount_cmd(self, mntopts, mntargs): + daemon_signal = 'kill' + if self.client_config.get('coverage') or \ + self.client_config.get('valgrind') is not None: + daemon_signal = 'term' + + mount_cmd = ['sudo', 'adjust-ulimits', 'ceph-coverage', + '{tdir}/archive/coverage'.format(tdir=self.test_dir), + 'daemon-helper', daemon_signal] + + mount_cmd = self._add_valgrind_args(mount_cmd) + mount_cmd = ['sudo'] + self._nsenter_args + mount_cmd + + mount_cmd += self._mount_bin + [self.hostfs_mntpt] + if self.client_id: + mount_cmd += ['--id', self.client_id] + if self.client_keyring_path and self.client_id: + mount_cmd += ['-k', self.client_keyring_path] + + self.validate_subvol_options() + + if self.cephfs_mntpt: + mount_cmd += ["--client_mountpoint=" + self.cephfs_mntpt] + + if self.cephfs_name: + mount_cmd += ["--client_fs=" + self.cephfs_name] + if mntopts: + mount_cmd.extend(('-o', 
','.join(mntopts))) + if mntargs: + mount_cmd.extend(mntargs) + + return mount_cmd + + def _add_valgrind_args(self, mount_cmd): + if self.client_config.get('valgrind') is not None: + mount_cmd = get_valgrind_args( + self.test_dir, + 'client.{id}'.format(id=self.client_id), + mount_cmd, + self.client_config.get('valgrind'), + cd=False + ) + + return mount_cmd + + def _list_fuse_conns(self): + conn_dir = "/sys/fs/fuse/connections" + + self.client_remote.run(args=['sudo', 'modprobe', 'fuse'], + check_status=False) + self.client_remote.run( + args=["sudo", "mount", "-t", "fusectl", conn_dir, conn_dir], + check_status=False, timeout=(30)) + + try: + ls_str = self.client_remote.sh("ls " + conn_dir, + stdout=StringIO(), + timeout=300).strip() + except CommandFailedError: + return [] + + if ls_str: + return [int(n) for n in ls_str.split("\n")] + else: + return [] + + def _wait_and_record_our_fuse_conn(self, check_status, pre_mount_conns, + mountcmd_stdout, mountcmd_stderr): + """ + Wait for the connection reference to appear in /sys + """ + waited = 0 + + post_mount_conns = self._list_fuse_conns() + while len(post_mount_conns) <= len(pre_mount_conns): + if self.fuse_daemon.finished: + # Did mount fail? Raise the CommandFailedError instead of + # hitting the "failed to populate /sys/" timeout + try: + self.fuse_daemon.wait() + except CommandFailedError as e: + log.info('mount command failed.') + if check_status: + raise + else: + return (e, mountcmd_stdout.getvalue(), + mountcmd_stderr.getvalue()) + time.sleep(1) + waited += 1 + if waited > self._fuse_conn_check_timeout: + raise RuntimeError( + "Fuse mount failed to populate/sys/ after {} " + "seconds".format(waited)) + else: + post_mount_conns = self._list_fuse_conns() + + log.info("Post-mount connections: {0}".format(post_mount_conns)) + + self._record_our_fuse_conn(pre_mount_conns, post_mount_conns) + + @property + def _fuse_conn_check_timeout(self): + mount_wait = self.client_config.get('mount_wait', 0) + if mount_wait > 0: + log.info("Fuse mount waits {0} seconds before checking /sys/".format(mount_wait)) + time.sleep(mount_wait) + timeout = int(self.client_config.get('mount_timeout', 30)) + return timeout + + def _record_our_fuse_conn(self, pre_mount_conns, post_mount_conns): + """ + Record our fuse connection number so that we can use it when forcing + an unmount. 
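The connection tracking above works by diffing /sys/fs/fuse/connections around the mount. A standalone sketch of the same idea, runnable outside teuthology:

    import os

    def list_fuse_conns(conn_dir='/sys/fs/fuse/connections'):
        # Each live FUSE connection appears as a numeric directory entry.
        try:
            return {int(n) for n in os.listdir(conn_dir) if n.isdigit()}
        except FileNotFoundError:
            return set()

    pre = list_fuse_conns()
    # ... start ceph-fuse here ...
    post = list_fuse_conns()
    new_conns = post - pre
    assert len(new_conns) <= 1, f"unexpectedly numerous fuse connections: {new_conns}"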
+ """ + new_conns = list(set(post_mount_conns) - set(pre_mount_conns)) + if len(new_conns) == 0: + raise RuntimeError("New fuse connection directory not found ({0})".format(new_conns)) + elif len(new_conns) > 1: + raise RuntimeError("Unexpectedly numerous fuse connections {0}".format(new_conns)) + else: + self._fuse_conn = new_conns[0] + + def gather_mount_info(self): + status = self.admin_socket(['status']) + self.id = status['id'] + self.client_pid = status['metadata']['pid'] + try: + self.inst = status['inst_str'] + self.addr = status['addr_str'] + except KeyError: + sessions = self.fs.rank_asok(['session', 'ls']) + for s in sessions: + if s['id'] == self.id: + self.inst = s['inst'] + self.addr = self.inst.split()[1] + if self.inst is None: + raise RuntimeError("cannot find client session") + + def check_mounted_state(self): + proc = self.client_remote.run( + args=[ + 'stat', + '--file-system', + '--printf=%T\n', + '--', + self.hostfs_mntpt, + ], + stdout=StringIO(), + stderr=StringIO(), + wait=False, + timeout=300 + ) + try: + proc.wait() + except CommandFailedError: + error = proc.stderr.getvalue() + if ("endpoint is not connected" in error + or "Software caused connection abort" in error): + # This happens is fuse is killed without unmount + log.warning("Found stale mount point at {0}".format(self.hostfs_mntpt)) + return True + else: + # This happens if the mount directory doesn't exist + log.info('mount point does not exist: %s', self.hostfs_mntpt) + return False + + fstype = proc.stdout.getvalue().rstrip('\n') + if fstype == 'fuseblk': + log.info('ceph-fuse is mounted on %s', self.hostfs_mntpt) + return True + else: + log.debug('ceph-fuse not mounted, got fs type {fstype!r}'.format( + fstype=fstype)) + return False + + def wait_until_mounted(self): + """ + Check to make sure that fuse is mounted on mountpoint. If not, + sleep for 5 seconds and check again. + """ + + while not self.check_mounted_state(): + # Even if it's not mounted, it should at least + # be running: catch simple failures where it has terminated. + assert not self.fuse_daemon.poll() + + time.sleep(5) + + # Now that we're mounted, set permissions so that the rest of the test + # will have unrestricted access to the filesystem mount. + for retry in range(10): + try: + stderr = StringIO() + self.client_remote.run(args=['sudo', 'chmod', '1777', + self.hostfs_mntpt], + timeout=300, + stderr=stderr, omit_sudo=False) + break + except run.CommandFailedError: + stderr = stderr.getvalue().lower() + if "read-only file system" in stderr: + break + elif "permission denied" in stderr: + time.sleep(5) + else: + raise + + def _mountpoint_exists(self): + return self.client_remote.run(args=["ls", "-d", self.hostfs_mntpt], + check_status=False, + timeout=300).exitstatus == 0 + + def umount(self, cleanup=True): + """ + umount() must not run cleanup() when it's called by umount_wait() + since "run.wait([self.fuse_daemon], timeout)" would hang otherwise. 
+ """ + if not self.is_mounted(): + if cleanup: + self.cleanup() + return + if self.is_blocked(): + self._run_umount_lf() + if cleanup: + self.cleanup() + return + + try: + log.info('Running fusermount -u on {name}...'.format(name=self.client_remote.name)) + stderr = StringIO() + self.client_remote.run( + args=['sudo', 'fusermount', '-u', self.hostfs_mntpt], + stderr=stderr, timeout=UMOUNT_TIMEOUT, omit_sudo=False) + except run.CommandFailedError: + if "mountpoint not found" in stderr.getvalue(): + # This happens if the mount directory doesn't exist + log.info('mount point does not exist: %s', self.mountpoint) + elif "not mounted" in stderr.getvalue(): + # This happens if the mount directory already unmouted + log.info('mount point not mounted: %s', self.mountpoint) + else: + log.info('Failed to unmount ceph-fuse on {name}, aborting...'.format(name=self.client_remote.name)) + + self.client_remote.run( + args=['sudo', run.Raw('PATH=/usr/sbin:$PATH'), 'lsof', + run.Raw(';'), 'ps', 'auxf'], + timeout=UMOUNT_TIMEOUT, omit_sudo=False) + + # abort the fuse mount, killing all hung processes + if self._fuse_conn: + self.run_python(dedent(""" + import os + path = "/sys/fs/fuse/connections/{0}/abort" + if os.path.exists(path): + open(path, "w").write("1") + """).format(self._fuse_conn)) + self._fuse_conn = None + + # make sure its unmounted + self._run_umount_lf() + + self._fuse_conn = None + self.id = None + self.inst = None + self.addr = None + if cleanup: + self.cleanup() + + def umount_wait(self, force=False, require_clean=False, + timeout=UMOUNT_TIMEOUT): + """ + :param force: Complete cleanly even if the MDS is offline + """ + if not (self.is_mounted() and self.fuse_daemon): + log.debug('ceph-fuse client.{id} is not mounted at {remote} ' + '{mnt}'.format(id=self.client_id, + remote=self.client_remote, + mnt=self.hostfs_mntpt)) + self.cleanup() + return + + if force: + assert not require_clean # mutually exclusive + + # When we expect to be forcing, kill the ceph-fuse process directly. + # This should avoid hitting the more aggressive fallback killing + # in umount() which can affect other mounts too. + self.fuse_daemon.stdin.close() + + # However, we will still hit the aggressive wait if there is an ongoing + # mount -o remount (especially if the remount is stuck because MDSs + # are unavailable) + + if self.is_blocked(): + self._run_umount_lf() + self.cleanup() + return + + # cleanup is set to to fail since clieanup must happen after umount is + # complete; otherwise following call to run.wait hangs. + self.umount(cleanup=False) + + try: + # Permit a timeout, so that we do not block forever + run.wait([self.fuse_daemon], timeout) + + except MaxWhileTries: + log.error("process failed to terminate after unmount. This probably" + " indicates a bug within ceph-fuse.") + raise + except CommandFailedError: + if require_clean: + raise + + self.cleanup() + + def teardown(self): + """ + Whatever the state of the mount, get it gone. 
+ """ + super(FuseMount, self).teardown() + + self.umount() + + if self.fuse_daemon and not self.fuse_daemon.finished: + self.fuse_daemon.stdin.close() + try: + self.fuse_daemon.wait() + except CommandFailedError: + pass + + def _asok_path(self): + return "/var/run/ceph/ceph-client.{0}.*.asok".format(self.client_id) + + @property + def _prefix(self): + return "" + + def find_admin_socket(self): + pyscript = """ +import glob +import re +import os +import subprocess + +def _find_admin_socket(client_name): + asok_path = "{asok_path}" + files = glob.glob(asok_path) + mountpoint = "{mountpoint}" + + # Given a non-glob path, it better be there + if "*" not in asok_path: + assert(len(files) == 1) + return files[0] + + for f in files: + pid = re.match(".*\.(\d+)\.asok$", f).group(1) + if os.path.exists("/proc/{{0}}".format(pid)): + with open("/proc/{{0}}/cmdline".format(pid), 'r') as proc_f: + contents = proc_f.read() + if mountpoint in contents: + return f + raise RuntimeError("Client socket {{0}} not found".format(client_name)) + +print(_find_admin_socket("{client_name}")) +""".format( + asok_path=self._asok_path(), + client_name="client.{0}".format(self.client_id), + mountpoint=self.mountpoint) + + asok_path = self.run_python(pyscript, sudo=True) + log.info("Found client admin socket at {0}".format(asok_path)) + return asok_path + + def admin_socket(self, args): + asok_path = self.find_admin_socket() + + # Query client ID from admin socket, wait 2 seconds + # and retry 10 times if it is not ready + with safe_while(sleep=2, tries=10) as proceed: + while proceed(): + try: + p = self.client_remote.run(args= + ['sudo', self._prefix + 'ceph', '--admin-daemon', asok_path] + args, + stdout=StringIO(), stderr=StringIO(), wait=False, + timeout=300) + p.wait() + break + except CommandFailedError: + if "connection refused" in p.stderr.getvalue().lower(): + pass + + return json.loads(p.stdout.getvalue().strip()) + + def get_global_id(self): + """ + Look up the CephFS client ID for this mount + """ + return self.admin_socket(['mds_sessions'])['id'] + + def get_global_inst(self): + """ + Look up the CephFS client instance for this mount + """ + return self.inst + + def get_global_addr(self): + """ + Look up the CephFS client addr for this mount + """ + return self.addr + + def get_client_pid(self): + """ + return pid of ceph-fuse process + """ + status = self.admin_socket(['status']) + return status['metadata']['pid'] + + def get_osd_epoch(self): + """ + Return 2-tuple of osd_epoch, osd_epoch_barrier + """ + status = self.admin_socket(['status']) + return status['osd_epoch'], status['osd_epoch_barrier'] + + def get_dentry_count(self): + """ + Return 2-tuple of dentry_count, dentry_pinned_count + """ + status = self.admin_socket(['status']) + return status['dentry_count'], status['dentry_pinned_count'] + + def set_cache_size(self, size): + return self.admin_socket(['config', 'set', 'client_cache_size', str(size)]) + + def get_op_read_count(self): + return self.admin_socket(['perf', 'dump', 'objecter'])['objecter']['osdop_read'] diff --git a/qa/tasks/cephfs/kernel_mount.py b/qa/tasks/cephfs/kernel_mount.py new file mode 100644 index 000000000..89f6b6639 --- /dev/null +++ b/qa/tasks/cephfs/kernel_mount.py @@ -0,0 +1,394 @@ +import errno +import json +import logging +import os +import re + +from io import StringIO +from textwrap import dedent + +from teuthology.exceptions import CommandFailedError +from teuthology.orchestra import run +from teuthology.contextutil import MaxWhileTries + +from tasks.cephfs.mount 
import CephFSMount, UMOUNT_TIMEOUT + +log = logging.getLogger(__name__) + + +# internal metadata directory +DEBUGFS_META_DIR = 'meta' + +class KernelMount(CephFSMount): + def __init__(self, ctx, test_dir, client_id, client_remote, + client_keyring_path=None, hostfs_mntpt=None, + cephfs_name=None, cephfs_mntpt=None, brxnet=None, + client_config={}): + super(KernelMount, self).__init__(ctx=ctx, test_dir=test_dir, + client_id=client_id, client_remote=client_remote, + client_keyring_path=client_keyring_path, hostfs_mntpt=hostfs_mntpt, + cephfs_name=cephfs_name, cephfs_mntpt=cephfs_mntpt, brxnet=brxnet, + client_config=client_config) + + if client_config.get('debug', False): + self.client_remote.run(args=["sudo", "bash", "-c", "echo 'module ceph +p' > /sys/kernel/debug/dynamic_debug/control"]) + self.client_remote.run(args=["sudo", "bash", "-c", "echo 'module libceph +p' > /sys/kernel/debug/dynamic_debug/control"]) + + self.dynamic_debug = self.client_config.get('dynamic_debug', False) + self.rbytes = self.client_config.get('rbytes', False) + self.snapdirname = client_config.get('snapdirname', '.snap') + self.syntax_style = self.client_config.get('syntax', 'v2') + self.inst = None + self.addr = None + self._mount_bin = ['adjust-ulimits', 'ceph-coverage', self.test_dir +\ + '/archive/coverage', '/bin/mount', '-t', 'ceph'] + + def mount(self, mntopts=None, check_status=True, **kwargs): + self.update_attrs(**kwargs) + self.assert_and_log_minimum_mount_details() + + self.setup_netns() + + if not self.cephfs_mntpt: + self.cephfs_mntpt = '/' + if not self.cephfs_name: + self.cephfs_name = 'cephfs' + + self._create_mntpt() + + retval = self._run_mount_cmd(mntopts, check_status) + if retval: + return retval + + self._set_filemode_on_mntpt() + + if self.dynamic_debug: + kmount_count = self.ctx.get(f'kmount_count.{self.client_remote.hostname}', 0) + if kmount_count == 0: + self.enable_dynamic_debug() + self.ctx[f'kmount_count.{self.client_remote.hostname}'] = kmount_count + 1 + + try: + self.gather_mount_info() + except: + log.warn('failed to fetch mount info - tests depending on mount addr/inst may fail!') + + def gather_mount_info(self): + self.id = self._get_global_id() + self.get_global_inst() + self.get_global_addr() + + def _run_mount_cmd(self, mntopts, check_status): + mount_cmd = self._get_mount_cmd(mntopts) + mountcmd_stdout, mountcmd_stderr = StringIO(), StringIO() + + try: + self.client_remote.run(args=mount_cmd, timeout=300, + stdout=mountcmd_stdout, + stderr=mountcmd_stderr, omit_sudo=False) + except CommandFailedError as e: + log.info('mount command failed') + if check_status: + raise + else: + return (e, mountcmd_stdout.getvalue(), + mountcmd_stderr.getvalue()) + log.info('mount command passed') + + def _make_mount_cmd_old_or_new_style(self): + optd = {} + mnt_stx = '' + + self.validate_subvol_options() + + assert(self.cephfs_mntpt) + if self.syntax_style == 'v1': + mnt_stx = f':{self.cephfs_mntpt}' + if self.client_id: + optd['name'] = self.client_id + if self.cephfs_name: + optd['mds_namespace'] = self.cephfs_name + elif self.syntax_style == 'v2': + mnt_stx = f'{self.client_id}@.{self.cephfs_name}={self.cephfs_mntpt}' + else: + assert 0, f'invalid syntax style: {self.syntax_style}' + return (mnt_stx, optd) + + def _get_mount_cmd(self, mntopts): + opts = 'norequire_active_mds' + if self.client_keyring_path and self.client_id: + opts += ',secret=' + self.get_key_from_keyfile() + if self.config_path: + opts += ',conf=' + self.config_path + if self.rbytes: + opts += ",rbytes" + else: + opts 
+= ",norbytes" + if self.snapdirname != '.snap': + opts += f',snapdirname={self.snapdirname}' + + mount_cmd = ['sudo'] + self._nsenter_args + stx_opt = self._make_mount_cmd_old_or_new_style() + for opt_name, opt_val in stx_opt[1].items(): + opts += f',{opt_name}={opt_val}' + if mntopts: + opts += ',' + ','.join(mntopts) + log.info(f'mounting using device: {stx_opt[0]}') + # do not fall-back to old-style mount (catch new-style + # mount syntax bugs in the kernel). exclude this config + # when using v1-style syntax, since old mount helpers + # (pre-quincy) would pass this option to the kernel. + if self.syntax_style != 'v1': + opts += ",nofallback" + mount_cmd += self._mount_bin + [stx_opt[0], self.hostfs_mntpt, '-v', + '-o', opts] + return mount_cmd + + def umount(self, force=False): + if not self.is_mounted(): + self.cleanup() + return + + if self.is_blocked(): + self._run_umount_lf() + self.cleanup() + return + + log.debug('Unmounting client client.{id}...'.format(id=self.client_id)) + + try: + cmd=['sudo', 'umount', self.hostfs_mntpt] + if force: + cmd.append('-f') + self.client_remote.run(args=cmd, timeout=UMOUNT_TIMEOUT, omit_sudo=False) + except Exception as e: + log.debug('Killing processes on client.{id}...'.format(id=self.client_id)) + self.client_remote.run( + args=['sudo', run.Raw('PATH=/usr/sbin:$PATH'), 'lsof', + run.Raw(';'), 'ps', 'auxf'], + timeout=UMOUNT_TIMEOUT, omit_sudo=False) + raise e + + if self.dynamic_debug: + kmount_count = self.ctx.get(f'kmount_count.{self.client_remote.hostname}') + assert kmount_count + if kmount_count == 1: + self.disable_dynamic_debug() + self.ctx[f'kmount_count.{self.client_remote.hostname}'] = kmount_count - 1 + + self.cleanup() + + def umount_wait(self, force=False, require_clean=False, + timeout=UMOUNT_TIMEOUT): + """ + Unlike the fuse client, the kernel client's umount is immediate + """ + if not self.is_mounted(): + self.cleanup() + return + + try: + self.umount(force) + except (CommandFailedError, MaxWhileTries): + if not force: + raise + + # force delete the netns and umount + self._run_umount_lf() + self.cleanup() + + def wait_until_mounted(self): + """ + Unlike the fuse client, the kernel client is up and running as soon + as the initial mount() function returns. + """ + assert self.is_mounted() + + def teardown(self): + super(KernelMount, self).teardown() + if self.is_mounted(): + self.umount() + + def _get_debug_dir(self): + """ + Get the debugfs folder for this mount + """ + + cluster_name = 'ceph' + fsid = self.ctx.ceph[cluster_name].fsid + + global_id = self._get_global_id() + + return os.path.join("/sys/kernel/debug/ceph/", f"{fsid}.client{global_id}") + + def read_debug_file(self, filename): + """ + Read the debug file "filename", return None if the file doesn't exist. 
+ """ + + path = os.path.join(self._get_debug_dir(), filename) + + stdout = StringIO() + stderr = StringIO() + try: + self.run_shell_payload(f"sudo dd if={path}", timeout=(5 * 60), + stdout=stdout, stderr=stderr) + return stdout.getvalue() + except CommandFailedError: + if 'no such file or directory' in stderr.getvalue().lower(): + return errno.ENOENT + elif 'not a directory' in stderr.getvalue().lower(): + return errno.ENOTDIR + elif 'permission denied' in stderr.getvalue().lower(): + return errno.EACCES + raise + + def _get_global_id(self): + try: + p = self.run_shell_payload("getfattr --only-values -n ceph.client_id .", stdout=StringIO()) + v = p.stdout.getvalue() + prefix = "client" + assert v.startswith(prefix) + return int(v[len(prefix):]) + except CommandFailedError: + # Probably this fallback can be deleted in a few releases when the kernel xattr is widely available. + log.debug("Falling back to messy global_id lookup via /sys...") + + pyscript = dedent(""" + import glob + import os + import json + + def get_id_to_dir(): + result = {} + for dir in glob.glob("/sys/kernel/debug/ceph/*"): + if os.path.basename(dir) == DEBUGFS_META_DIR: + continue + mds_sessions_lines = open(os.path.join(dir, "mds_sessions")).readlines() + global_id = mds_sessions_lines[0].split()[1].strip('"') + client_id = mds_sessions_lines[1].split()[1].strip('"') + result[client_id] = global_id + return result + print(json.dumps(get_id_to_dir())) + """) + + output = self.client_remote.sh([ + 'sudo', 'python3', '-c', pyscript + ], timeout=(5*60)) + client_id_to_global_id = json.loads(output) + + try: + return client_id_to_global_id[self.client_id] + except KeyError: + log.error("Client id '{0}' debug dir not found (clients seen were: {1})".format( + self.client_id, ",".join(client_id_to_global_id.keys()) + )) + raise + + def _dynamic_debug_control(self, enable): + """ + Write to dynamic debug control file. + """ + if enable: + fdata = "module ceph +p" + else: + fdata = "module ceph -p" + + self.run_shell_payload(f""" +sudo modprobe ceph +echo '{fdata}' | sudo tee /sys/kernel/debug/dynamic_debug/control +""") + + def enable_dynamic_debug(self): + """ + Enable the dynamic debug. + """ + self._dynamic_debug_control(True) + + def disable_dynamic_debug(self): + """ + Disable the dynamic debug. + """ + self._dynamic_debug_control(False) + + def get_global_id(self): + """ + Look up the CephFS client ID for this mount, using debugfs. 
+ """ + + assert self.is_mounted() + + return self._get_global_id() + + @property + def _global_addr(self): + if self.addr is not None: + return self.addr + + # The first line of the "status" file's output will be something + # like: + # "instance: client.4297 (0)10.72.47.117:0/1148470933" + # What we need here is only the string "10.72.47.117:0/1148470933" + status = self.read_debug_file("status") + if status is None: + return None + + instance = re.findall(r'instance:.*', status)[0] + self.addr = instance.split()[2].split(')')[1] + return self.addr; + + @property + def _global_inst(self): + if self.inst is not None: + return self.inst + + client_gid = "client%d" % self.get_global_id() + self.inst = " ".join([client_gid, self._global_addr]) + return self.inst + + def get_global_inst(self): + """ + Look up the CephFS client instance for this mount + """ + return self._global_inst + + def get_global_addr(self): + """ + Look up the CephFS client addr for this mount + """ + return self._global_addr + + def get_osd_epoch(self): + """ + Return 2-tuple of osd_epoch, osd_epoch_barrier + """ + osd_map = self.read_debug_file("osdmap") + assert osd_map + + lines = osd_map.split("\n") + first_line_tokens = lines[0].split() + epoch, barrier = int(first_line_tokens[1]), int(first_line_tokens[3]) + + return epoch, barrier + + def get_op_read_count(self): + stdout = StringIO() + stderr = StringIO() + try: + path = os.path.join(self._get_debug_dir(), "metrics/size") + self.run_shell(f"sudo stat {path}", stdout=stdout, + stderr=stderr, cwd=None) + buf = self.read_debug_file("metrics/size") + except CommandFailedError: + if 'no such file or directory' in stderr.getvalue().lower() \ + or 'not a directory' in stderr.getvalue().lower(): + try: + path = os.path.join(self._get_debug_dir(), "metrics") + self.run_shell(f"sudo stat {path}", stdout=stdout, + stderr=stderr, cwd=None) + buf = self.read_debug_file("metrics") + except CommandFailedError: + return errno.ENOENT + else: + return 0 + return int(re.findall(r'read.*', buf)[0].split()[1]) diff --git a/qa/tasks/cephfs/mount.py b/qa/tasks/cephfs/mount.py new file mode 100644 index 000000000..4a8187406 --- /dev/null +++ b/qa/tasks/cephfs/mount.py @@ -0,0 +1,1570 @@ +import hashlib +import json +import logging +import datetime +import os +import re +import time + +from io import StringIO +from contextlib import contextmanager +from textwrap import dedent +from IPy import IP + +from teuthology.contextutil import safe_while +from teuthology.misc import get_file, write_file +from teuthology.orchestra import run +from teuthology.orchestra.run import Raw +from teuthology.exceptions import CommandFailedError, ConnectionLostError + +from tasks.cephfs.filesystem import Filesystem + +log = logging.getLogger(__name__) + + +UMOUNT_TIMEOUT = 300 + + +class CephFSMount(object): + def __init__(self, ctx, test_dir, client_id, client_remote, + client_keyring_path=None, hostfs_mntpt=None, + cephfs_name=None, cephfs_mntpt=None, brxnet=None, + client_config=None): + """ + :param test_dir: Global teuthology test dir + :param client_id: Client ID, the 'foo' in client.foo + :param client_keyring_path: path to keyring for given client_id + :param client_remote: Remote instance for the host where client will + run + :param hostfs_mntpt: Path to directory on the FS on which Ceph FS will + be mounted + :param cephfs_name: Name of Ceph FS to be mounted + :param cephfs_mntpt: Path to directory inside Ceph FS that will be + mounted as root + """ + self.ctx = ctx + self.test_dir = test_dir + + 
self._verify_attrs(client_id=client_id, + client_keyring_path=client_keyring_path, + hostfs_mntpt=hostfs_mntpt, cephfs_name=cephfs_name, + cephfs_mntpt=cephfs_mntpt) + + if client_config is None: + client_config = {} + self.client_config = client_config + + self.cephfs_name = cephfs_name + self.client_id = client_id + self.client_keyring_path = client_keyring_path + self.client_remote = client_remote + self.cluster_name = 'ceph' # TODO: use config['cluster'] + self.fs = None + + if cephfs_mntpt is None and client_config.get("mount_path"): + self.cephfs_mntpt = client_config.get("mount_path") + log.info(f"using client_config[\"cephfs_mntpt\"] = {self.cephfs_mntpt}") + else: + self.cephfs_mntpt = cephfs_mntpt + log.info(f"cephfs_mntpt = {self.cephfs_mntpt}") + + if hostfs_mntpt is None and client_config.get("mountpoint"): + self.hostfs_mntpt = client_config.get("mountpoint") + log.info(f"using client_config[\"hostfs_mntpt\"] = {self.hostfs_mntpt}") + elif hostfs_mntpt is not None: + self.hostfs_mntpt = hostfs_mntpt + else: + self.hostfs_mntpt = os.path.join(self.test_dir, f'mnt.{self.client_id}') + self.hostfs_mntpt_dirname = os.path.basename(self.hostfs_mntpt) + log.info(f"hostfs_mntpt = {self.hostfs_mntpt}") + + self._netns_name = None + self.nsid = -1 + if brxnet is None: + self.ceph_brx_net = '192.168.0.0/16' + else: + self.ceph_brx_net = brxnet + + self.test_files = ['a', 'b', 'c'] + + self.background_procs = [] + + # This will cleanup the stale netnses, which are from the + # last failed test cases. + @staticmethod + def cleanup_stale_netnses_and_bridge(remote): + p = remote.run(args=['ip', 'netns', 'list'], + stdout=StringIO(), timeout=(5*60)) + p = p.stdout.getvalue().strip() + + # Get the netns name list + netns_list = re.findall(r'ceph-ns-[^()\s][-.\w]+[^():\s]', p) + + # Remove the stale netnses + for ns in netns_list: + ns_name = ns.split()[0] + args = ['sudo', 'ip', 'netns', 'delete', '{0}'.format(ns_name)] + try: + remote.run(args=args, timeout=(5*60), omit_sudo=False) + except Exception: + pass + + # Remove the stale 'ceph-brx' + try: + args = ['sudo', 'ip', 'link', 'delete', 'ceph-brx'] + remote.run(args=args, timeout=(5*60), omit_sudo=False) + except Exception: + pass + + def _parse_netns_name(self): + self._netns_name = '-'.join(["ceph-ns", + re.sub(r'/+', "-", self.mountpoint)]) + + @property + def mountpoint(self): + if self.hostfs_mntpt is None: + self.hostfs_mntpt = os.path.join(self.test_dir, + self.hostfs_mntpt_dirname) + return self.hostfs_mntpt + + @mountpoint.setter + def mountpoint(self, path): + if not isinstance(path, str): + raise RuntimeError('path should be of str type.') + self._mountpoint = self.hostfs_mntpt = path + + @property + def netns_name(self): + if self._netns_name == None: + self._parse_netns_name() + return self._netns_name + + @netns_name.setter + def netns_name(self, name): + self._netns_name = name + + def assert_that_ceph_fs_exists(self): + output = self.ctx.managers[self.cluster_name].raw_cluster_cmd("fs", "ls") + if self.cephfs_name: + assert self.cephfs_name in output, \ + 'expected ceph fs is not present on the cluster' + log.info(f'Mounting Ceph FS {self.cephfs_name}; just confirmed its presence on cluster') + else: + assert 'No filesystems enabled' not in output, \ + 'ceph cluster has no ceph fs, not even the default ceph fs' + log.info('Mounting default Ceph FS; just confirmed its presence on cluster') + + def assert_and_log_minimum_mount_details(self): + """ + Make sure we have minimum details required for mounting. 
Ideally, this + method should be called at the beginning of the mount method. + """ + if not self.client_id or not self.client_remote or \ + not self.hostfs_mntpt: + log.error(f"self.client_id = {self.client_id}") + log.error(f"self.client_remote = {self.client_remote}") + log.error(f"self.hostfs_mntpt = {self.hostfs_mntpt}") + errmsg = ('Mounting CephFS requires that at least following ' + 'details to be provided -\n' + '1. the client ID,\n2. the mountpoint and\n' + '3. the remote machine where CephFS will be mounted.\n') + raise RuntimeError(errmsg) + + self.assert_that_ceph_fs_exists() + + log.info('Mounting Ceph FS. Following are details of mount; remember ' + '"None" represents Python type None -') + log.info(f'self.client_remote.hostname = {self.client_remote.hostname}') + log.info(f'self.client.name = client.{self.client_id}') + log.info(f'self.hostfs_mntpt = {self.hostfs_mntpt}') + log.info(f'self.cephfs_name = {self.cephfs_name}') + log.info(f'self.cephfs_mntpt = {self.cephfs_mntpt}') + log.info(f'self.client_keyring_path = {self.client_keyring_path}') + if self.client_keyring_path: + log.info('keyring content -\n' + + get_file(self.client_remote, self.client_keyring_path, + sudo=True).decode()) + + def is_blocked(self): + if not self.addr: + # can't infer if our addr is blocklisted - let the caller try to + # umount without lazy/force. If the client was blocklisted, then + # the umount would be stuck and the test would fail on timeout. + # happens only with Ubuntu 20.04 (missing kclient patches :/). + return False + self.fs = Filesystem(self.ctx, name=self.cephfs_name) + + try: + output = self.fs.mon_manager.raw_cluster_cmd(args='osd blocklist ls') + except CommandFailedError: + # Fallback for older Ceph cluster + output = self.fs.mon_manager.raw_cluster_cmd(args='osd blacklist ls') + + return self.addr in output + + def is_stuck(self): + """ + Check if mount is stuck/in a hanged state. + """ + if not self.is_mounted(): + return False + + retval = self.client_remote.run(args=f'sudo stat {self.hostfs_mntpt}', + omit_sudo=False, wait=False).returncode + if retval == 0: + return False + + time.sleep(10) + proc = self.client_remote.run(args='ps -ef', stdout=StringIO()) + # if proc was running even after 10 seconds, it has to be stuck. + if f'stat {self.hostfs_mntpt}' in proc.stdout.getvalue(): + log.critical('client mounted at self.hostfs_mntpt is stuck!') + return True + return False + + def is_mounted(self): + file = self.client_remote.read_file('/proc/self/mounts',stdout=StringIO()) + if self.hostfs_mntpt in file: + return True + else: + log.debug(f"not mounted; /proc/self/mounts is:\n{file}") + return False + + def setupfs(self, name=None): + if name is None and self.fs is not None: + # Previous mount existed, reuse the old name + name = self.fs.name + self.fs = Filesystem(self.ctx, name=name) + log.info('Wait for MDS to reach steady state...') + self.fs.wait_for_daemons() + log.info('Ready to start {}...'.format(type(self).__name__)) + + def _create_mntpt(self): + self.client_remote.run(args=f'mkdir -p -v {self.hostfs_mntpt}', + timeout=60) + # Use 0000 mode to prevent undesired modifications to the mountpoint on + # the local file system. 
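is_mounted() simply looks for the mountpoint in /proc/self/mounts on the remote host. A standalone local sketch of the same check; the path is hypothetical:

    def is_mounted_locally(mntpt='/mnt/cephfs'):
        with open('/proc/self/mounts') as f:
            return mntpt in f.read()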
+ self.client_remote.run(args=f'chmod 0000 {self.hostfs_mntpt}', + timeout=60) + + @property + def _nsenter_args(self): + return ['nsenter', f'--net=/var/run/netns/{self.netns_name}'] + + def _set_filemode_on_mntpt(self): + stderr = StringIO() + try: + self.client_remote.run( + args=['sudo', 'chmod', '1777', self.hostfs_mntpt], + stderr=stderr, timeout=(5*60)) + except CommandFailedError: + # the client does not have write permissions in the caps it holds + # for the Ceph FS that was just mounted. + if 'permission denied' in stderr.getvalue().lower(): + pass + + def _setup_brx_and_nat(self): + # The ip for ceph-brx should be + ip = IP(self.ceph_brx_net)[-2] + mask = self.ceph_brx_net.split('/')[1] + brd = IP(self.ceph_brx_net).broadcast() + + brx = self.client_remote.run(args=['ip', 'addr'], stderr=StringIO(), + stdout=StringIO(), timeout=(5*60)) + brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue()) + if brx: + # If the 'ceph-brx' already exists, then check whether + # the new net is conflicting with it + _ip, _mask = brx[0].split()[1].split('/', 1) + if _ip != "{}".format(ip) or _mask != mask: + raise RuntimeError("Conflict with existing ceph-brx {0}, new {1}/{2}".format(brx[0].split()[1], ip, mask)) + + # Setup the ceph-brx and always use the last valid IP + if not brx: + log.info("Setuping the 'ceph-brx' with {0}/{1}".format(ip, mask)) + + self.run_shell_payload(f""" + set -e + sudo ip link add name ceph-brx type bridge + sudo ip addr flush dev ceph-brx + sudo ip link set ceph-brx up + sudo ip addr add {ip}/{mask} brd {brd} dev ceph-brx + """, timeout=(5*60), omit_sudo=False, cwd='/') + + args = "echo 1 | sudo tee /proc/sys/net/ipv4/ip_forward" + self.client_remote.run(args=args, timeout=(5*60), omit_sudo=False) + + # Setup the NAT + p = self.client_remote.run(args=['route'], stderr=StringIO(), + stdout=StringIO(), timeout=(5*60)) + p = re.findall(r'default .*', p.stdout.getvalue()) + if p == False: + raise RuntimeError("No default gw found") + gw = p[0].split()[7] + + self.run_shell_payload(f""" + set -e + sudo iptables -A FORWARD -o {gw} -i ceph-brx -j ACCEPT + sudo iptables -A FORWARD -i {gw} -o ceph-brx -j ACCEPT + sudo iptables -t nat -A POSTROUTING -s {ip}/{mask} -o {gw} -j MASQUERADE + """, timeout=(5*60), omit_sudo=False, cwd='/') + + def _setup_netns(self): + p = self.client_remote.run(args=['ip', 'netns', 'list'], + stderr=StringIO(), stdout=StringIO(), + timeout=(5*60)).stdout.getvalue().strip() + + # Get the netns name list + netns_list = re.findall(r'[^()\s][-.\w]+[^():\s]', p) + + out = re.search(r"{0}".format(self.netns_name), p) + if out is None: + # Get an uniq nsid for the new netns + nsid = 0 + p = self.client_remote.run(args=['ip', 'netns', 'list-id'], + stderr=StringIO(), stdout=StringIO(), + timeout=(5*60)).stdout.getvalue() + while True: + out = re.search(r"nsid {} ".format(nsid), p) + if out is None: + break + + nsid += 1 + + # Add one new netns and set it id + self.run_shell_payload(f""" + set -e + sudo ip netns add {self.netns_name} + sudo ip netns set {self.netns_name} {nsid} + """, timeout=(5*60), omit_sudo=False, cwd='/') + self.nsid = nsid; + else: + # The netns already exists and maybe suspended by self.kill() + self.resume_netns(); + + nsid = int(re.search(r"{0} \(id: (\d+)\)".format(self.netns_name), p).group(1)) + self.nsid = nsid; + return + + # Get one ip address for netns + ips = IP(self.ceph_brx_net) + for ip in ips: + found = False + if ip == ips[0]: + continue + if ip == ips[-2]: + raise RuntimeError("we have ran out of the ip addresses") 
+ + for ns in netns_list: + ns_name = ns.split()[0] + args = ['sudo', 'ip', 'netns', 'exec', '{0}'.format(ns_name), 'ip', 'addr'] + try: + p = self.client_remote.run(args=args, stderr=StringIO(), + stdout=StringIO(), timeout=(5*60), + omit_sudo=False) + q = re.search("{0}".format(ip), p.stdout.getvalue()) + if q is not None: + found = True + break + except CommandFailedError: + if "No such file or directory" in p.stderr.getvalue(): + pass + if "Invalid argument" in p.stderr.getvalue(): + pass + + if found == False: + break + + mask = self.ceph_brx_net.split('/')[1] + brd = IP(self.ceph_brx_net).broadcast() + + log.info("Setuping the netns '{0}' with {1}/{2}".format(self.netns_name, ip, mask)) + + # Setup the veth interfaces + brxip = IP(self.ceph_brx_net)[-2] + self.run_shell_payload(f""" + set -e + sudo ip link add veth0 netns {self.netns_name} type veth peer name brx.{nsid} + sudo ip netns exec {self.netns_name} ip addr add {ip}/{mask} brd {brd} dev veth0 + sudo ip netns exec {self.netns_name} ip link set veth0 up + sudo ip netns exec {self.netns_name} ip link set lo up + sudo ip netns exec {self.netns_name} ip route add default via {brxip} + """, timeout=(5*60), omit_sudo=False, cwd='/') + + # Bring up the brx interface and join it to 'ceph-brx' + self.run_shell_payload(f""" + set -e + sudo ip link set brx.{nsid} up + sudo ip link set dev brx.{nsid} master ceph-brx + """, timeout=(5*60), omit_sudo=False, cwd='/') + + def _cleanup_netns(self): + if self.nsid == -1: + return + log.info("Removing the netns '{0}'".format(self.netns_name)) + + # Delete the netns and the peer veth interface + self.run_shell_payload(f""" + set -e + sudo ip link set brx.{self.nsid} down + sudo ip link delete dev brx.{self.nsid} + sudo ip netns delete {self.netns_name} + """, timeout=(5*60), omit_sudo=False, cwd='/') + + self.nsid = -1 + + def _cleanup_brx_and_nat(self): + brx = self.client_remote.run(args=['ip', 'addr'], stderr=StringIO(), + stdout=StringIO(), timeout=(5*60)) + brx = re.findall(r'inet .* ceph-brx', brx.stdout.getvalue()) + if not brx: + return + + # If we are the last netns, will delete the ceph-brx + args = ['sudo', 'ip', 'link', 'show'] + p = self.client_remote.run(args=args, stdout=StringIO(), + timeout=(5*60), omit_sudo=False) + _list = re.findall(r'brx\.', p.stdout.getvalue().strip()) + if len(_list) != 0: + return + + log.info("Removing the 'ceph-brx'") + + self.run_shell_payload(""" + set -e + sudo ip link set ceph-brx down + sudo ip link delete ceph-brx + """, timeout=(5*60), omit_sudo=False, cwd='/') + + # Drop the iptables NAT rules + ip = IP(self.ceph_brx_net)[-2] + mask = self.ceph_brx_net.split('/')[1] + + p = self.client_remote.run(args=['route'], stderr=StringIO(), + stdout=StringIO(), timeout=(5*60)) + p = re.findall(r'default .*', p.stdout.getvalue()) + if p == False: + raise RuntimeError("No default gw found") + gw = p[0].split()[7] + self.run_shell_payload(f""" + set -e + sudo iptables -D FORWARD -o {gw} -i ceph-brx -j ACCEPT + sudo iptables -D FORWARD -i {gw} -o ceph-brx -j ACCEPT + sudo iptables -t nat -D POSTROUTING -s {ip}/{mask} -o {gw} -j MASQUERADE + """, timeout=(5*60), omit_sudo=False, cwd='/') + + def setup_netns(self): + """ + Setup the netns for the mountpoint. + """ + log.info("Setting the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint)) + self._setup_brx_and_nat() + self._setup_netns() + + def cleanup_netns(self): + """ + Cleanup the netns for the mountpoint. 
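+
+ For reference, the manual equivalent of what _cleanup_netns() and
+ _cleanup_brx_and_nat() above run is roughly (placeholders in angle
+ brackets):
+
+ sudo ip link set brx.<nsid> down
+ sudo ip link delete dev brx.<nsid>
+ sudo ip netns delete <netns_name>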
+ """ + # We will defer cleaning the netnses and bridge until the last + # mountpoint is unmounted, this will be a temporary work around + # for issue#46282. + + # log.info("Cleaning the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint)) + # self._cleanup_netns() + # self._cleanup_brx_and_nat() + + def suspend_netns(self): + """ + Suspend the netns veth interface. + """ + if self.nsid == -1: + return + + log.info("Suspending the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint)) + + args = ['sudo', 'ip', 'link', 'set', 'brx.{0}'.format(self.nsid), 'down'] + self.client_remote.run(args=args, timeout=(5*60), omit_sudo=False) + + def resume_netns(self): + """ + Resume the netns veth interface. + """ + if self.nsid == -1: + return + + log.info("Resuming the '{0}' netns for '{1}'".format(self._netns_name, self.mountpoint)) + + args = ['sudo', 'ip', 'link', 'set', 'brx.{0}'.format(self.nsid), 'up'] + self.client_remote.run(args=args, timeout=(5*60), omit_sudo=False) + + def mount(self, mntopts=[], check_status=True, **kwargs): + """ + kwargs expects its members to be same as the arguments accepted by + self.update_attrs(). + """ + raise NotImplementedError() + + def mount_wait(self, **kwargs): + """ + Accepts arguments same as self.mount(). + """ + self.mount(**kwargs) + self.wait_until_mounted() + + def _run_umount_lf(self): + log.debug(f'Force/lazy unmounting on client.{self.client_id}') + + try: + proc = self.client_remote.run( + args=f'sudo umount --lazy --force {self.hostfs_mntpt}', + timeout=UMOUNT_TIMEOUT, omit_sudo=False) + except CommandFailedError: + if self.is_mounted(): + raise + + return proc + + def umount(self): + raise NotImplementedError() + + def umount_wait(self, force=False, require_clean=False, + timeout=UMOUNT_TIMEOUT): + """ + + :param force: Expect that the mount will not shutdown cleanly: kill + it hard. + :param require_clean: Wait for the Ceph client associated with the + mount (e.g. ceph-fuse) to terminate, and + raise if it doesn't do so cleanly. + :param timeout: amount of time to be waited for umount command to finish + :return: + """ + raise NotImplementedError() + + def _verify_attrs(self, **kwargs): + """ + Verify that client_id, client_keyring_path, client_remote, hostfs_mntpt, + cephfs_name, cephfs_mntpt are either type str or None. + """ + for k, v in kwargs.items(): + if v is not None and not isinstance(v, str): + raise RuntimeError('value of attributes should be either str ' + f'or None. {k} - {v}') + + def update_attrs(self, client_id=None, client_keyring_path=None, + client_remote=None, hostfs_mntpt=None, cephfs_name=None, + cephfs_mntpt=None): + if not (client_id or client_keyring_path or client_remote or + cephfs_name or cephfs_mntpt or hostfs_mntpt): + return + + self._verify_attrs(client_id=client_id, + client_keyring_path=client_keyring_path, + hostfs_mntpt=hostfs_mntpt, cephfs_name=cephfs_name, + cephfs_mntpt=cephfs_mntpt) + + if client_id: + self.client_id = client_id + if client_keyring_path: + self.client_keyring_path = client_keyring_path + if client_remote: + self.client_remote = client_remote + if hostfs_mntpt: + self.hostfs_mntpt = hostfs_mntpt + if cephfs_name: + self.cephfs_name = cephfs_name + if cephfs_mntpt: + self.cephfs_mntpt = cephfs_mntpt + + def remount(self, **kwargs): + """ + Update mount object's attributes and attempt remount with these + new values for these attrbiutes. + + 1. Run umount_wait(). + 2. Run update_attrs(). + 3. Run mount(). 
+ + Accepts arguments of self.mount() and self.update_attrs() with 1 + exception: wait accepted too which can be True or False. + """ + self.umount_wait() + assert not self.is_mounted() + + mntopts = kwargs.pop('mntopts', []) + check_status = kwargs.pop('check_status', True) + wait = kwargs.pop('wait', True) + + self.update_attrs(**kwargs) + + retval = self.mount(mntopts=mntopts, check_status=check_status) + # avoid this scenario (again): mount command might've failed and + # check_status might have silenced the exception, yet we attempt to + # wait which might lead to an error. + if retval is None and wait: + self.wait_until_mounted() + + return retval + + def kill(self): + """ + Suspend the netns veth interface to make the client disconnected + from the ceph cluster + """ + log.info('Killing connection on {0}...'.format(self.client_remote.name)) + self.suspend_netns() + + def kill_cleanup(self): + """ + Follow up ``kill`` to get to a clean unmounted state. + """ + log.info('Cleaning up killed connection on {0}'.format(self.client_remote.name)) + self.umount_wait(force=True) + + def cleanup(self): + """ + Remove the mount point. + + Prerequisite: the client is not mounted. + """ + log.info('Cleaning up mount {0}'.format(self.client_remote.name)) + stderr = StringIO() + try: + self.client_remote.run(args=['rmdir', '--', self.mountpoint], + cwd=self.test_dir, stderr=stderr, + timeout=(60*5), check_status=False) + except CommandFailedError: + if "no such file or directory" not in stderr.getvalue().lower(): + raise + + self.cleanup_netns() + + def wait_until_mounted(self): + raise NotImplementedError() + + def get_keyring_path(self): + # N.B.: default keyring is /etc/ceph/ceph.keyring; see ceph.py and generate_caps + return '/etc/ceph/ceph.client.{id}.keyring'.format(id=self.client_id) + + def get_key_from_keyfile(self): + # XXX: don't call run_shell(), since CephFS might be unmounted. + keyring = self.client_remote.read_file(self.client_keyring_path).\ + decode() + + for line in keyring.split('\n'): + if line.find('key') != -1: + return line[line.find('=') + 1 : ].strip() + + raise RuntimeError('Key not found in keyring file ' + f'{self.client_keyring_path}. Its contents are -\n' + f'{keyring}') + + @property + def config_path(self): + """ + Path to ceph.conf: override this if you're not a normal systemwide ceph install + :return: stringv + """ + return "/etc/ceph/ceph.conf" + + @contextmanager + def mounted_wait(self): + """ + A context manager, from an initially unmounted state, to mount + this, yield, and then unmount and clean up. 
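+
+ Illustrative usage (run_shell() is defined further below):
+
+ with mount.mounted_wait():
+ mount.run_shell(['ls', '.'])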
+ """ + self.mount() + self.wait_until_mounted() + try: + yield + finally: + self.umount_wait() + + def create_file(self, filename='testfile', dirname=None, user=None, + check_status=True): + assert(self.is_mounted()) + + if not os.path.isabs(filename): + if dirname: + if os.path.isabs(dirname): + path = os.path.join(dirname, filename) + else: + path = os.path.join(self.hostfs_mntpt, dirname, filename) + else: + path = os.path.join(self.hostfs_mntpt, filename) + else: + path = filename + + if user: + args = ['sudo', '-u', user, '-s', '/bin/bash', '-c', 'touch ' + path] + else: + args = 'touch ' + path + + return self.client_remote.run(args=args, check_status=check_status) + + def create_files(self): + assert(self.is_mounted()) + + for suffix in self.test_files: + log.info("Creating file {0}".format(suffix)) + self.client_remote.run(args=[ + 'touch', os.path.join(self.hostfs_mntpt, suffix) + ]) + + def test_create_file(self, filename='testfile', dirname=None, user=None, + check_status=True): + return self.create_file(filename=filename, dirname=dirname, user=user, + check_status=False) + + def check_files(self): + assert(self.is_mounted()) + + for suffix in self.test_files: + log.info("Checking file {0}".format(suffix)) + r = self.client_remote.run(args=[ + 'ls', os.path.join(self.hostfs_mntpt, suffix) + ], check_status=False) + if r.exitstatus != 0: + raise RuntimeError("Expected file {0} not found".format(suffix)) + + def write_file(self, path, data, perms=None): + """ + Write the given data at the given path and set the given perms to the + file on the path. + """ + if path.find(self.hostfs_mntpt) == -1: + path = os.path.join(self.hostfs_mntpt, path) + + write_file(self.client_remote, path, data) + + if perms: + self.run_shell(args=f'chmod {perms} {path}') + + def read_file(self, path): + """ + Return the data from the file on given path. 
+ """ + if path.find(self.hostfs_mntpt) == -1: + path = os.path.join(self.hostfs_mntpt, path) + + return self.run_shell(args=['cat', path]).\ + stdout.getvalue().strip() + + def create_destroy(self): + assert(self.is_mounted()) + + filename = "{0} {1}".format(datetime.datetime.now(), self.client_id) + log.debug("Creating test file {0}".format(filename)) + self.client_remote.run(args=[ + 'touch', os.path.join(self.hostfs_mntpt, filename) + ]) + log.debug("Deleting test file {0}".format(filename)) + self.client_remote.run(args=[ + 'rm', '-f', os.path.join(self.hostfs_mntpt, filename) + ]) + + def _run_python(self, pyscript, py_version='python3', sudo=False): + args, omit_sudo = [], True + if sudo: + args.append('sudo') + omit_sudo = False + args += ['adjust-ulimits', 'daemon-helper', 'kill', py_version, '-c', pyscript] + return self.client_remote.run(args=args, wait=False, stdin=run.PIPE, + stdout=StringIO(), omit_sudo=omit_sudo) + + def run_python(self, pyscript, py_version='python3', sudo=False): + p = self._run_python(pyscript, py_version, sudo=sudo) + p.wait() + return p.stdout.getvalue().strip() + + def run_shell(self, args, timeout=300, **kwargs): + omit_sudo = kwargs.pop('omit_sudo', False) + cwd = kwargs.pop('cwd', self.mountpoint) + stdout = kwargs.pop('stdout', StringIO()) + stderr = kwargs.pop('stderr', StringIO()) + + return self.client_remote.run(args=args, cwd=cwd, timeout=timeout, + stdout=stdout, stderr=stderr, + omit_sudo=omit_sudo, **kwargs) + + def run_shell_payload(self, payload, **kwargs): + kwargs['args'] = ["bash", "-c", Raw(f"'{payload}'")] + if kwargs.pop('sudo', False): + kwargs['args'].insert(0, 'sudo') + kwargs['omit_sudo'] = False + return self.run_shell(**kwargs) + + def run_as_user(self, **kwargs): + """ + Besides the arguments defined for run_shell() this method also + accepts argument 'user'. + """ + args = kwargs.pop('args') + user = kwargs.pop('user') + if isinstance(args, str): + args = ['sudo', '-u', user, '-s', '/bin/bash', '-c', args] + elif isinstance(args, list): + cmdlist = args + cmd = '' + for i in cmdlist: + cmd = cmd + i + ' ' + # get rid of extra space at the end. + cmd = cmd[:-1] + + args = ['sudo', '-u', user, '-s', '/bin/bash', '-c', cmd] + + kwargs['args'] = args + kwargs['omit_sudo'] = False + return self.run_shell(**kwargs) + + def run_as_root(self, **kwargs): + """ + Accepts same arguments as run_shell(). + """ + kwargs['user'] = 'root' + return self.run_as_user(**kwargs) + + def assert_retval(self, proc_retval, exp_retval): + msg = (f'expected return value: {exp_retval}\n' + f'received return value: {proc_retval}\n') + assert proc_retval == exp_retval, msg + + def _verify(self, proc, exp_retval=None, exp_errmsgs=None): + if exp_retval is None and exp_errmsgs is None: + raise RuntimeError('Method didn\'t get enough parameters. Pass ' + 'return value or error message expected from ' + 'the command/process.') + + if exp_retval is not None: + self.assert_retval(proc.returncode, exp_retval) + if exp_errmsgs is None: + return + + if isinstance(exp_errmsgs, str): + exp_errmsgs = (exp_errmsgs, ) + + proc_stderr = proc.stderr.getvalue().lower() + msg = ('didn\'t find any of the expected string in stderr.\n' + f'expected string: {exp_errmsgs}\n' + f'received error message: {proc_stderr}\n' + 'note: received error message is converted to lowercase') + for e in exp_errmsgs: + if e in proc_stderr: + break + # this else is meant for for loop. 
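+ # (A for-loop's 'else' runs only when the loop completes without hitting
+ # 'break'; here that means none of the expected error messages were found
+ # in stderr, so the assertion below fires.)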
+ else: + assert False, msg + + def negtestcmd(self, args, retval=None, errmsgs=None, stdin=None, + cwd=None, wait=True): + """ + Conduct a negative test for the given command. + + retval and errmsgs are parameters to confirm the cause of command + failure. + + Note: errmsgs is expected to be a tuple, but in case there's only + error message, it can also be a string. This method will handle + that internally. + """ + proc = self.run_shell(args=args, wait=wait, stdin=stdin, cwd=cwd, + check_status=False) + self._verify(proc, retval, errmsgs) + return proc + + def negtestcmd_as_user(self, args, user, retval=None, errmsgs=None, + stdin=None, cwd=None, wait=True): + proc = self.run_as_user(args=args, user=user, wait=wait, stdin=stdin, + cwd=cwd, check_status=False) + self._verify(proc, retval, errmsgs) + return proc + + def negtestcmd_as_root(self, args, retval=None, errmsgs=None, stdin=None, + cwd=None, wait=True): + proc = self.run_as_root(args=args, wait=wait, stdin=stdin, cwd=cwd, + check_status=False) + self._verify(proc, retval, errmsgs) + return proc + + def open_for_reading(self, basename): + """ + Open a file for reading only. + """ + assert(self.is_mounted()) + + path = os.path.join(self.hostfs_mntpt, basename) + + return self._run_python(dedent( + """ + import os + mode = os.O_RDONLY + fd = os.open("{path}", mode) + os.close(fd) + """.format(path=path) + )) + + def open_for_writing(self, basename, creat=True, trunc=True, excl=False): + """ + Open a file for writing only. + """ + assert(self.is_mounted()) + + path = os.path.join(self.hostfs_mntpt, basename) + + return self._run_python(dedent( + """ + import os + mode = os.O_WRONLY + if {creat}: + mode |= os.O_CREAT + if {trunc}: + mode |= os.O_TRUNC + if {excl}: + mode |= os.O_EXCL + fd = os.open("{path}", mode) + os.close(fd) + """.format(path=path, creat=creat, trunc=trunc, excl=excl) + )) + + def open_no_data(self, basename): + """ + A pure metadata operation + """ + assert(self.is_mounted()) + + path = os.path.join(self.hostfs_mntpt, basename) + + p = self._run_python(dedent( + """ + f = open("{path}", 'w') + """.format(path=path) + )) + p.wait() + + def open_background(self, basename="background_file", write=True, content="content"): + """ + Open a file for writing, then block such that the client + will hold a capability. + + Don't return until the remote process has got as far as opening + the file, then return the RemoteProcess instance. + """ + assert(self.is_mounted()) + + path = os.path.join(self.hostfs_mntpt, basename) + + if write: + pyscript = dedent(""" + import time + + with open("{path}", 'w') as f: + f.write("{content}") + f.flush() + while True: + time.sleep(1) + """).format(path=path, content=content) + else: + pyscript = dedent(""" + import time + + with open("{path}", 'r') as f: + while True: + time.sleep(1) + """).format(path=path) + + rproc = self._run_python(pyscript) + self.background_procs.append(rproc) + + # This wait would not be sufficient if the file had already + # existed, but it's simple and in practice users of open_background + # are not using it on existing files. + if write: + self.wait_for_visible(basename, size=len(content)) + else: + self.wait_for_visible(basename) + + return rproc + + def open_dir_background(self, basename): + """ + Create and hold a capability to a directory. 
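+
+ Illustrative usage (the returned RemoteProcess should later be reaped
+ with kill_background()):
+
+ p = mount.open_dir_background('held_dir')
+ # ... exercise the MDS while the directory capability is held ...
+ mount.kill_background(p)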
+ """ + assert(self.is_mounted()) + + path = os.path.join(self.hostfs_mntpt, basename) + + pyscript = dedent(""" + import time + import os + + os.mkdir("{path}") + fd = os.open("{path}", os.O_RDONLY) + while True: + time.sleep(1) + """).format(path=path) + + rproc = self._run_python(pyscript) + self.background_procs.append(rproc) + + self.wait_for_visible(basename) + + return rproc + + def wait_for_dir_empty(self, dirname, timeout=30): + dirpath = os.path.join(self.hostfs_mntpt, dirname) + with safe_while(sleep=5, tries=(timeout//5)) as proceed: + while proceed(): + p = self.run_shell_payload(f"stat -c %h {dirpath}") + nr_links = int(p.stdout.getvalue().strip()) + if nr_links == 2: + return + + def wait_for_visible(self, basename="background_file", size=None, timeout=30): + i = 0 + args = ['stat'] + if size is not None: + args += ['--printf=%s'] + args += [os.path.join(self.hostfs_mntpt, basename)] + while i < timeout: + p = self.client_remote.run(args=args, stdout=StringIO(), check_status=False) + if p.exitstatus == 0: + if size is not None: + s = p.stdout.getvalue().strip() + if int(s) == size: + log.info(f"File {basename} became visible with size {size} from {self.client_id} after {i}s") + return + else: + log.error(f"File {basename} became visible but with size {int(s)} not {size}") + else: + log.info(f"File {basename} became visible from {self.client_id} after {i}s") + return + time.sleep(1) + i += 1 + + raise RuntimeError("Timed out after {0}s waiting for {1} to become visible from {2}".format( + i, basename, self.client_id)) + + def lock_background(self, basename="background_file", do_flock=True): + """ + Open and lock a files for writing, hold the lock in a background process + """ + assert(self.is_mounted()) + + path = os.path.join(self.hostfs_mntpt, basename) + + script_builder = """ + import time + import fcntl + import struct""" + if do_flock: + script_builder += """ + f1 = open("{path}-1", 'w') + fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB)""" + script_builder += """ + f2 = open("{path}-2", 'w') + lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0) + fcntl.fcntl(f2, fcntl.F_SETLK, lockdata) + while True: + time.sleep(1) + """ + + pyscript = dedent(script_builder).format(path=path) + + log.info("lock_background file {0}".format(basename)) + rproc = self._run_python(pyscript) + self.background_procs.append(rproc) + return rproc + + def lock_and_release(self, basename="background_file"): + assert(self.is_mounted()) + + path = os.path.join(self.hostfs_mntpt, basename) + + script = """ + import time + import fcntl + import struct + f1 = open("{path}-1", 'w') + fcntl.flock(f1, fcntl.LOCK_EX) + f2 = open("{path}-2", 'w') + lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0) + fcntl.fcntl(f2, fcntl.F_SETLK, lockdata) + """ + pyscript = dedent(script).format(path=path) + + log.info("lock_and_release file {0}".format(basename)) + return self._run_python(pyscript) + + def check_filelock(self, basename="background_file", do_flock=True): + assert(self.is_mounted()) + + path = os.path.join(self.hostfs_mntpt, basename) + + script_builder = """ + import fcntl + import errno + import struct""" + if do_flock: + script_builder += """ + f1 = open("{path}-1", 'r') + try: + fcntl.flock(f1, fcntl.LOCK_EX | fcntl.LOCK_NB) + except IOError as e: + if e.errno == errno.EAGAIN: + pass + else: + raise RuntimeError("flock on file {path}-1 not found")""" + script_builder += """ + f2 = open("{path}-2", 'r') + try: + lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0) + 
fcntl.fcntl(f2, fcntl.F_SETLK, lockdata) + except IOError as e: + if e.errno == errno.EAGAIN: + pass + else: + raise RuntimeError("posix lock on file {path}-2 not found") + """ + pyscript = dedent(script_builder).format(path=path) + + log.info("check lock on file {0}".format(basename)) + self.client_remote.run(args=[ + 'python3', '-c', pyscript + ]) + + def write_background(self, basename="background_file", loop=False): + """ + Open a file for writing, complete as soon as you can + :param basename: + :return: + """ + assert(self.is_mounted()) + + path = os.path.join(self.hostfs_mntpt, basename) + + pyscript = dedent(""" + import os + import time + + fd = os.open("{path}", os.O_RDWR | os.O_CREAT, 0o644) + try: + while True: + os.write(fd, b'content') + time.sleep(1) + if not {loop}: + break + except IOError as e: + pass + os.close(fd) + """).format(path=path, loop=str(loop)) + + rproc = self._run_python(pyscript) + self.background_procs.append(rproc) + return rproc + + def write_n_mb(self, filename, n_mb, seek=0, wait=True): + """ + Write the requested number of megabytes to a file + """ + assert(self.is_mounted()) + + return self.run_shell(["dd", "if=/dev/urandom", "of={0}".format(filename), + "bs=1M", "conv=fdatasync", + "count={0}".format(int(n_mb)), + "seek={0}".format(int(seek)) + ], wait=wait) + + def write_test_pattern(self, filename, size): + log.info("Writing {0} bytes to {1}".format(size, filename)) + return self.run_python(dedent(""" + import zlib + path = "{path}" + with open(path, 'w') as f: + for i in range(0, {size}): + val = zlib.crc32(str(i).encode('utf-8')) & 7 + f.write(chr(val)) + """.format( + path=os.path.join(self.hostfs_mntpt, filename), + size=size + ))) + + def validate_test_pattern(self, filename, size): + log.info("Validating {0} bytes from {1}".format(size, filename)) + # Use sudo because cephfs-data-scan may recreate the file with owner==root + return self.run_python(dedent(""" + import zlib + path = "{path}" + with open(path, 'r') as f: + bytes = f.read() + if len(bytes) != {size}: + raise RuntimeError("Bad length {{0}} vs. expected {{1}}".format( + len(bytes), {size} + )) + for i, b in enumerate(bytes): + val = zlib.crc32(str(i).encode('utf-8')) & 7 + if b != chr(val): + raise RuntimeError("Bad data at offset {{0}}".format(i)) + """.format( + path=os.path.join(self.hostfs_mntpt, filename), + size=size + )), sudo=True) + + def open_n_background(self, fs_path, count): + """ + Open N files for writing, hold them open in a background process + + :param fs_path: Path relative to CephFS root, e.g. "foo/bar" + :return: a RemoteProcess + """ + assert(self.is_mounted()) + + abs_path = os.path.join(self.hostfs_mntpt, fs_path) + + pyscript = dedent(""" + import sys + import time + import os + + n = {count} + abs_path = "{abs_path}" + + if not os.path.exists(abs_path): + os.makedirs(abs_path) + + handles = [] + for i in range(0, n): + fname = "file_"+str(i) + path = os.path.join(abs_path, fname) + handles.append(open(path, 'w')) + + while True: + time.sleep(1) + """).format(abs_path=abs_path, count=count) + + rproc = self._run_python(pyscript) + self.background_procs.append(rproc) + return rproc + + def create_n_files(self, fs_path, count, sync=False, dirsync=False, + unlink=False, finaldirsync=False, hard_links=0): + """ + Create n files. 
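+
+ For example, create_n_files('dir/file', 10, finaldirsync=True) writes
+ dir/file_0 .. dir/file_9 under the mount point and fsyncs the parent
+ directory once at the end (illustrative; see the parameters below).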
+ + :param sync: sync the file after writing + :param dirsync: sync the containing directory after closing the file + :param unlink: unlink the file after closing + :param finaldirsync: sync the containing directory after closing the last file + :param hard_links: create given number of hard link(s) for each file + """ + + assert(self.is_mounted()) + + abs_path = os.path.join(self.hostfs_mntpt, fs_path) + + pyscript = dedent(f""" + import os + import uuid + + n = {count} + create_hard_links = False + if {hard_links} > 0: + create_hard_links = True + path = "{abs_path}" + + dpath = os.path.dirname(path) + fnameprefix = os.path.basename(path) + os.makedirs(dpath, exist_ok=True) + + try: + dirfd = os.open(dpath, os.O_DIRECTORY) + + for i in range(n): + fpath = os.path.join(dpath, f"{{fnameprefix}}_{{i}}") + with open(fpath, 'w') as f: + f.write(f"{{i}}") + if {sync}: + f.flush() + os.fsync(f.fileno()) + if {unlink}: + os.unlink(fpath) + if {dirsync}: + os.fsync(dirfd) + if create_hard_links: + for j in range({hard_links}): + os.system(f"ln {{fpath}} {{dpath}}/{{fnameprefix}}_{{i}}_{{uuid.uuid4()}}") + if {finaldirsync}: + os.fsync(dirfd) + finally: + os.close(dirfd) + """) + + self.run_python(pyscript) + + def teardown(self): + for p in self.background_procs: + log.info("Terminating background process") + self._kill_background(p) + + self.background_procs = [] + + def _kill_background(self, p): + if p.stdin: + p.stdin.close() + try: + p.wait() + except (CommandFailedError, ConnectionLostError): + pass + + def kill_background(self, p): + """ + For a process that was returned by one of the _background member functions, + kill it hard. + """ + self._kill_background(p) + self.background_procs.remove(p) + + def send_signal(self, signal): + signal = signal.lower() + if signal.lower() not in ['sigstop', 'sigcont', 'sigterm', 'sigkill']: + raise NotImplementedError + + self.client_remote.run(args=['sudo', 'kill', '-{0}'.format(signal), + self.client_pid], omit_sudo=False) + + def get_global_id(self): + raise NotImplementedError() + + def get_global_inst(self): + raise NotImplementedError() + + def get_global_addr(self): + raise NotImplementedError() + + def get_osd_epoch(self): + raise NotImplementedError() + + def get_op_read_count(self): + raise NotImplementedError() + + def readlink(self, fs_path): + abs_path = os.path.join(self.hostfs_mntpt, fs_path) + + pyscript = dedent(""" + import os + + print(os.readlink("{path}")) + """).format(path=abs_path) + + proc = self._run_python(pyscript) + proc.wait() + return str(proc.stdout.getvalue().strip()) + + + def lstat(self, fs_path, follow_symlinks=False, wait=True): + return self.stat(fs_path, follow_symlinks=False, wait=True) + + def stat(self, fs_path, follow_symlinks=True, wait=True, **kwargs): + """ + stat a file, and return the result as a dictionary like this: + { + "st_ctime": 1414161137.0, + "st_mtime": 1414161137.0, + "st_nlink": 33, + "st_gid": 0, + "st_dev": 16777218, + "st_size": 1190, + "st_ino": 2, + "st_uid": 0, + "st_mode": 16877, + "st_atime": 1431520593.0 + } + + Raises exception on absent file. 
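+
+ Illustrative usage (field values are just the sample shown above):
+
+ st = mount.stat('testfile')
+ assert st['st_nlink'] == 33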
+ """ + abs_path = os.path.join(self.hostfs_mntpt, fs_path) + if follow_symlinks: + stat_call = "os.stat('" + abs_path + "')" + else: + stat_call = "os.lstat('" + abs_path + "')" + + pyscript = dedent(""" + import os + import stat + import json + import sys + + try: + s = {stat_call} + except OSError as e: + sys.exit(e.errno) + + attrs = ["st_mode", "st_ino", "st_dev", "st_nlink", "st_uid", "st_gid", "st_size", "st_atime", "st_mtime", "st_ctime"] + print(json.dumps( + dict([(a, getattr(s, a)) for a in attrs]), + indent=2)) + """).format(stat_call=stat_call) + proc = self._run_python(pyscript, **kwargs) + if wait: + proc.wait() + return json.loads(proc.stdout.getvalue().strip()) + else: + return proc + + def touch(self, fs_path): + """ + Create a dentry if it doesn't already exist. This python + implementation exists because the usual command line tool doesn't + pass through error codes like EIO. + + :param fs_path: + :return: + """ + abs_path = os.path.join(self.hostfs_mntpt, fs_path) + pyscript = dedent(""" + import sys + import errno + + try: + f = open("{path}", "w") + f.close() + except IOError as e: + sys.exit(errno.EIO) + """).format(path=abs_path) + proc = self._run_python(pyscript) + proc.wait() + + def path_to_ino(self, fs_path, follow_symlinks=True): + abs_path = os.path.join(self.hostfs_mntpt, fs_path) + + if follow_symlinks: + pyscript = dedent(""" + import os + import stat + + print(os.stat("{path}").st_ino) + """).format(path=abs_path) + else: + pyscript = dedent(""" + import os + import stat + + print(os.lstat("{path}").st_ino) + """).format(path=abs_path) + + proc = self._run_python(pyscript) + proc.wait() + return int(proc.stdout.getvalue().strip()) + + def path_to_nlink(self, fs_path): + abs_path = os.path.join(self.hostfs_mntpt, fs_path) + + pyscript = dedent(""" + import os + import stat + + print(os.stat("{path}").st_nlink) + """).format(path=abs_path) + + proc = self._run_python(pyscript) + proc.wait() + return int(proc.stdout.getvalue().strip()) + + def ls(self, path=None, **kwargs): + """ + Wrap ls: return a list of strings + """ + kwargs['args'] = ["ls"] + if path: + kwargs['args'].append(path) + if kwargs.pop('sudo', False): + kwargs['args'].insert(0, 'sudo') + kwargs['omit_sudo'] = False + ls_text = self.run_shell(**kwargs).stdout.getvalue().strip() + + if ls_text: + return ls_text.split("\n") + else: + # Special case because otherwise split on empty string + # gives you [''] instead of [] + return [] + + def setfattr(self, path, key, val, **kwargs): + """ + Wrap setfattr. + + :param path: relative to mount point + :param key: xattr name + :param val: xattr value + :return: None + """ + kwargs['args'] = ["setfattr", "-n", key, "-v", val, path] + if kwargs.pop('sudo', False): + kwargs['args'].insert(0, 'sudo') + kwargs['omit_sudo'] = False + self.run_shell(**kwargs) + + def getfattr(self, path, attr, **kwargs): + """ + Wrap getfattr: return the values of a named xattr on one file, or + None if the attribute is not found. 
+ + :return: a string + """ + kwargs['args'] = ["getfattr", "--only-values", "-n", attr, path] + if kwargs.pop('sudo', False): + kwargs['args'].insert(0, 'sudo') + kwargs['omit_sudo'] = False + kwargs['wait'] = False + p = self.run_shell(**kwargs) + try: + p.wait() + except CommandFailedError as e: + if e.exitstatus == 1 and "No such attribute" in p.stderr.getvalue(): + return None + else: + raise + + return str(p.stdout.getvalue()) + + def df(self): + """ + Wrap df: return a dict of usage fields in bytes + """ + + p = self.run_shell(["df", "-B1", "."]) + lines = p.stdout.getvalue().strip().split("\n") + fs, total, used, avail = lines[1].split()[:4] + log.warning(lines) + + return { + "total": int(total), + "used": int(used), + "available": int(avail) + } + + def dir_checksum(self, path=None, follow_symlinks=False): + cmd = ["find"] + if follow_symlinks: + cmd.append("-L") + if path: + cmd.append(path) + cmd.extend(["-type", "f", "-exec", "md5sum", "{}", "+"]) + checksum_text = self.run_shell(cmd).stdout.getvalue().strip() + checksum_sorted = sorted(checksum_text.split('\n'), key=lambda v: v.split()[1]) + return hashlib.md5(('\n'.join(checksum_sorted)).encode('utf-8')).hexdigest() + + def validate_subvol_options(self): + mount_subvol_num = self.client_config.get('mount_subvol_num', None) + if self.cephfs_mntpt and mount_subvol_num is not None: + log.warning("You cannot specify both: cephfs_mntpt and mount_subvol_num") + log.info(f"Mounting subvol {mount_subvol_num} for now") + + if mount_subvol_num is not None: + # mount_subvol must be an index into the subvol path array for the fs + if not self.cephfs_name: + self.cephfs_name = 'cephfs' + assert(hasattr(self.ctx, "created_subvols")) + # mount_subvol must be specified under client.[0-9] yaml section + subvol_paths = self.ctx.created_subvols[self.cephfs_name] + path_to_mount = subvol_paths[mount_subvol_num] + self.cephfs_mntpt = path_to_mount diff --git a/qa/tasks/cephfs/test_acls.py b/qa/tasks/cephfs/test_acls.py new file mode 100644 index 000000000..48160dd8b --- /dev/null +++ b/qa/tasks/cephfs/test_acls.py @@ -0,0 +1,39 @@ +from logging import getLogger + +from io import StringIO +from tasks.cephfs.xfstests_dev import XFSTestsDev + + +log = getLogger(__name__) + + +class TestACLs(XFSTestsDev): + + def test_acls(self): + from tasks.cephfs.fuse_mount import FuseMount + from tasks.cephfs.kernel_mount import KernelMount + + if isinstance(self.mount_a, FuseMount): + log.info('client is fuse mounted') + elif isinstance(self.mount_a, KernelMount): + log.info('client is kernel mounted') + + # XXX: check_status is set to False so that we can check for command's + # failure on our own (since this command doesn't set right error code + # and error message in some cases) and print custom log messages + # accordingly. 
+ proc = self.mount_a.client_remote.run(args=['sudo', 'env', 'DIFF_LENGTH=0', + './check', 'generic/099'], cwd=self.xfstests_repo_path, stdout=StringIO(), + stderr=StringIO(), timeout=30, check_status=False,omit_sudo=False, + label='running tests for ACLs from xfstests-dev') + + if proc.returncode != 0: + log.info('Command failed.') + log.info(f'Command return value: {proc.returncode}') + stdout, stderr = proc.stdout.getvalue(), proc.stderr.getvalue() + log.info(f'Command stdout -\n{stdout}') + log.info(f'Command stderr -\n{stderr}') + + self.assertEqual(proc.returncode, 0) + success_line = 'Passed all 1 tests' + self.assertIn(success_line, stdout) diff --git a/qa/tasks/cephfs/test_admin.py b/qa/tasks/cephfs/test_admin.py new file mode 100644 index 000000000..9890381c6 --- /dev/null +++ b/qa/tasks/cephfs/test_admin.py @@ -0,0 +1,1494 @@ +import errno +import json +import logging +import time +import uuid +from io import StringIO +from os.path import join as os_path_join + +from teuthology.exceptions import CommandFailedError + +from tasks.cephfs.cephfs_test_case import CephFSTestCase, classhook +from tasks.cephfs.filesystem import FileLayout, FSMissing +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.caps_helper import CapTester + +log = logging.getLogger(__name__) + +class TestAdminCommands(CephFSTestCase): + """ + Tests for administration command. + """ + + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def check_pool_application_metadata_key_value(self, pool, app, key, value): + output = self.fs.mon_manager.raw_cluster_cmd( + 'osd', 'pool', 'application', 'get', pool, app, key) + self.assertEqual(str(output.strip()), value) + + def setup_ec_pools(self, n, metadata=True, overwrites=True): + if metadata: + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', n+"-meta", "8") + cmd = ['osd', 'erasure-code-profile', 'set', n+"-profile", "m=2", "k=2", "crush-failure-domain=osd"] + self.fs.mon_manager.raw_cluster_cmd(*cmd) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', n+"-data", "8", "erasure", n+"-profile") + if overwrites: + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'set', n+"-data", 'allow_ec_overwrites', 'true') + +@classhook('_add_valid_tell') +class TestValidTell(TestAdminCommands): + @classmethod + def _add_valid_tell(cls): + tells = [ + ['cache', 'status'], + ['damage', 'ls'], + ['dump_blocked_ops'], + ['dump_blocked_ops_count'], + ['dump_historic_ops'], + ['dump_historic_ops_by_duration'], + ['dump_mempools'], + ['dump_ops_in_flight'], + ['flush', 'journal'], + ['get', 'subtrees'], + ['ops', 'locks'], + ['ops'], + ['status'], + ['version'], + ] + def test(c): + def f(self): + J = self.fs.rank_tell(c) + json.dumps(J) + log.debug("dumped:\n%s", str(J)) + return f + for c in tells: + setattr(cls, 'test_valid_' + '_'.join(c), test(c)) + +class TestFsStatus(TestAdminCommands): + """ + Test "ceph fs status subcommand. + """ + + def test_fs_status(self): + """ + That `ceph fs status` command functions. + """ + + s = self.fs.mon_manager.raw_cluster_cmd("fs", "status") + self.assertTrue("active" in s) + + mdsmap = json.loads(self.fs.mon_manager.raw_cluster_cmd("fs", "status", "--format=json-pretty"))["mdsmap"] + self.assertEqual(mdsmap[0]["state"], "active") + + mdsmap = json.loads(self.fs.mon_manager.raw_cluster_cmd("fs", "status", "--format=json"))["mdsmap"] + self.assertEqual(mdsmap[0]["state"], "active") + + +class TestAddDataPool(TestAdminCommands): + """ + Test "ceph fs add_data_pool" subcommand. 
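+
+ The command under test has the form (illustrative):
+
+ ceph fs add_data_pool <fs_name> <pool_name>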
+ """ + + def test_add_data_pool_root(self): + """ + That a new data pool can be added and used for the root directory. + """ + + p = self.fs.add_data_pool("foo") + self.fs.set_dir_layout(self.mount_a, ".", FileLayout(pool=p)) + + def test_add_data_pool_application_metadata(self): + """ + That the application metadata set on a newly added data pool is as expected. + """ + pool_name = "foo" + mon_cmd = self.fs.mon_manager.raw_cluster_cmd + mon_cmd('osd', 'pool', 'create', pool_name, '--pg_num_min', + str(self.fs.pg_num_min)) + # Check whether https://tracker.ceph.com/issues/43061 is fixed + mon_cmd('osd', 'pool', 'application', 'enable', pool_name, 'cephfs') + self.fs.add_data_pool(pool_name, create=False) + self.check_pool_application_metadata_key_value( + pool_name, 'cephfs', 'data', self.fs.name) + + def test_add_data_pool_subdir(self): + """ + That a new data pool can be added and used for a sub-directory. + """ + + p = self.fs.add_data_pool("foo") + self.mount_a.run_shell("mkdir subdir") + self.fs.set_dir_layout(self.mount_a, "subdir", FileLayout(pool=p)) + + def test_add_data_pool_non_alphamueric_name_as_subdir(self): + """ + That a new data pool with non-alphanumeric name can be added and used for a sub-directory. + """ + p = self.fs.add_data_pool("I-am-data_pool00.") + self.mount_a.run_shell("mkdir subdir") + self.fs.set_dir_layout(self.mount_a, "subdir", FileLayout(pool=p)) + + def test_add_data_pool_ec(self): + """ + That a new EC data pool can be added. + """ + + n = "test_add_data_pool_ec" + self.setup_ec_pools(n, metadata=False) + self.fs.add_data_pool(n+"-data", create=False) + + def test_add_already_in_use_data_pool(self): + """ + That command try to add data pool which is already in use with another fs. + """ + + # create first data pool, metadata pool and add with filesystem + first_fs = "first_fs" + first_metadata_pool = "first_metadata_pool" + first_data_pool = "first_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', first_fs, first_metadata_pool, first_data_pool) + + # create second data pool, metadata pool and add with filesystem + second_fs = "second_fs" + second_metadata_pool = "second_metadata_pool" + second_data_pool = "second_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', second_fs, second_metadata_pool, second_data_pool) + + # try to add 'first_data_pool' with 'second_fs' + # Expecting EINVAL exit status because 'first_data_pool' is already in use with 'first_fs' + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', second_fs, first_data_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because data pool is already in use as data pool for first_fs") + + def test_add_already_in_use_metadata_pool(self): + """ + That command try to add metadata pool which is already in use with another fs. 
+ """ + + # create first data pool, metadata pool and add with filesystem + first_fs = "first_fs" + first_metadata_pool = "first_metadata_pool" + first_data_pool = "first_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', first_fs, first_metadata_pool, first_data_pool) + + # create second data pool, metadata pool and add with filesystem + second_fs = "second_fs" + second_metadata_pool = "second_metadata_pool" + second_data_pool = "second_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', second_fs, second_metadata_pool, second_data_pool) + + # try to add 'second_metadata_pool' with 'first_fs' as a data pool + # Expecting EINVAL exit status because 'second_metadata_pool' + # is already in use with 'second_fs' as a metadata pool + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'add_data_pool', first_fs, second_metadata_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because data pool is already in use as metadata pool for 'second_fs'") + +class TestFsNew(TestAdminCommands): + """ + Test "ceph fs new" subcommand. + """ + MDSS_REQUIRED = 3 + + def test_fsnames_can_only_by_goodchars(self): + n = 'test_fsnames_can_only_by_goodchars' + metapoolname, datapoolname = n+'-testmetapool', n+'-testdatapool' + badname = n+'badname@#' + + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + n+metapoolname) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + n+datapoolname) + + # test that fsname not with "goodchars" fails + args = ['fs', 'new', badname, metapoolname, datapoolname] + proc = self.fs.mon_manager.run_cluster_cmd(args=args,stderr=StringIO(), + check_status=False) + self.assertIn('invalid chars', proc.stderr.getvalue().lower()) + + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'rm', metapoolname, + metapoolname, + '--yes-i-really-really-mean-it-not-faking') + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'rm', datapoolname, + datapoolname, + '--yes-i-really-really-mean-it-not-faking') + + def test_new_default_ec(self): + """ + That a new file system warns/fails with an EC default data pool. + """ + + self.mount_a.umount_wait(require_clean=True) + self.mds_cluster.delete_all_filesystems() + n = "test_new_default_ec" + self.setup_ec_pools(n) + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data") + except CommandFailedError as e: + if e.exitstatus == 22: + pass + else: + raise + else: + raise RuntimeError("expected failure") + + def test_new_default_ec_force(self): + """ + That a new file system succeeds with an EC default data pool with --force. + """ + + self.mount_a.umount_wait(require_clean=True) + self.mds_cluster.delete_all_filesystems() + n = "test_new_default_ec_force" + self.setup_ec_pools(n) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data", "--force") + + def test_new_default_ec_no_overwrite(self): + """ + That a new file system fails with an EC default data pool without overwrite. 
+ """ + + self.mount_a.umount_wait(require_clean=True) + self.mds_cluster.delete_all_filesystems() + n = "test_new_default_ec_no_overwrite" + self.setup_ec_pools(n, overwrites=False) + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data") + except CommandFailedError as e: + if e.exitstatus == 22: + pass + else: + raise + else: + raise RuntimeError("expected failure") + # and even with --force ! + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', n, n+"-meta", n+"-data", "--force") + except CommandFailedError as e: + if e.exitstatus == 22: + pass + else: + raise + else: + raise RuntimeError("expected failure") + + def test_fs_new_pool_application_metadata(self): + """ + That the application metadata set on the pools of a newly created filesystem are as expected. + """ + self.mount_a.umount_wait(require_clean=True) + self.mds_cluster.delete_all_filesystems() + fs_name = "test_fs_new_pool_application" + keys = ['metadata', 'data'] + pool_names = [fs_name+'-'+key for key in keys] + mon_cmd = self.fs.mon_manager.raw_cluster_cmd + for p in pool_names: + mon_cmd('osd', 'pool', 'create', p, '--pg_num_min', str(self.fs.pg_num_min)) + mon_cmd('osd', 'pool', 'application', 'enable', p, 'cephfs') + mon_cmd('fs', 'new', fs_name, pool_names[0], pool_names[1]) + for i in range(2): + self.check_pool_application_metadata_key_value( + pool_names[i], 'cephfs', keys[i], fs_name) + + def test_fs_new_with_specific_id(self): + """ + That a file system can be created with a specific ID. + """ + fs_name = "test_fs_specific_id" + fscid = 100 + keys = ['metadata', 'data'] + pool_names = [fs_name+'-'+key for key in keys] + for p in pool_names: + self.run_cluster_cmd(f'osd pool create {p}') + self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid {fscid} --force') + self.fs.status().get_fsmap(fscid) + for i in range(2): + self.check_pool_application_metadata_key_value(pool_names[i], 'cephfs', keys[i], fs_name) + + def test_fs_new_with_specific_id_idempotency(self): + """ + That command to create file system with specific ID is idempotent. + """ + fs_name = "test_fs_specific_id" + fscid = 100 + keys = ['metadata', 'data'] + pool_names = [fs_name+'-'+key for key in keys] + for p in pool_names: + self.run_cluster_cmd(f'osd pool create {p}') + self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid {fscid} --force') + self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid {fscid} --force') + self.fs.status().get_fsmap(fscid) + + def test_fs_new_with_specific_id_fails_without_force_flag(self): + """ + That command to create file system with specific ID fails without '--force' flag. + """ + fs_name = "test_fs_specific_id" + fscid = 100 + keys = ['metadata', 'data'] + pool_names = [fs_name+'-'+key for key in keys] + for p in pool_names: + self.run_cluster_cmd(f'osd pool create {p}') + try: + self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid {fscid}') + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, + "invalid error code on creating a file system with specifc ID without --force flag") + else: + self.fail("expected creating file system with specific ID without '--force' flag to fail") + + def test_fs_new_with_specific_id_fails_already_in_use(self): + """ + That creating file system with ID already in use fails. 
+ """ + fs_name = "test_fs_specific_id" + # file system ID already in use + fscid = self.fs.status().map['filesystems'][0]['id'] + keys = ['metadata', 'data'] + pool_names = [fs_name+'-'+key for key in keys] + for p in pool_names: + self.run_cluster_cmd(f'osd pool create {p}') + try: + self.run_cluster_cmd(f'fs new {fs_name} {pool_names[0]} {pool_names[1]} --fscid {fscid} --force') + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, + "invalid error code on creating a file system with specifc ID that is already in use") + else: + self.fail("expected creating file system with ID already in use to fail") + + def test_fs_new_metadata_pool_already_in_use(self): + """ + That creating file system with metadata pool already in use. + """ + + # create first data pool, metadata pool and add with filesystem + first_fs = "first_fs" + first_metadata_pool = "first_metadata_pool" + first_data_pool = "first_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', first_fs, first_metadata_pool, first_data_pool) + + second_fs = "second_fs" + second_data_pool = "second_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_data_pool) + + # try to create new fs 'second_fs' with following configuration + # metadata pool -> 'first_metadata_pool' + # data pool -> 'second_data_pool' + # Expecting EINVAL exit status because 'first_metadata_pool' + # is already in use with 'first_fs' + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', second_fs, first_metadata_pool, second_data_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because metadata pool is already in use for 'first_fs'") + + def test_fs_new_data_pool_already_in_use(self): + """ + That creating file system with data pool already in use. + """ + + # create first data pool, metadata pool and add with filesystem + first_fs = "first_fs" + first_metadata_pool = "first_metadata_pool" + first_data_pool = "first_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', first_fs, first_metadata_pool, first_data_pool) + + second_fs = "second_fs" + second_metadata_pool = "second_metadata_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_metadata_pool) + + # try to create new fs 'second_fs' with following configuration + # metadata pool -> 'second_metadata_pool' + # data pool -> 'first_data_pool' + # Expecting EINVAL exit status because 'first_data_pool' + # is already in use with 'first_fs' + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', second_fs, second_metadata_pool, first_data_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because data pool is already in use for 'first_fs'") + + def test_fs_new_metadata_and_data_pool_in_use_by_another_same_fs(self): + """ + That creating file system with metadata and data pool which is already in use by another same fs. 
+ """ + + # create first data pool, metadata pool and add with filesystem + first_fs = "first_fs" + first_metadata_pool = "first_metadata_pool" + first_data_pool = "first_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', first_fs, first_metadata_pool, first_data_pool) + + second_fs = "second_fs" + + # try to create new fs 'second_fs' with following configuration + # metadata pool -> 'first_metadata_pool' + # data pool -> 'first_data_pool' + # Expecting EINVAL exit status because 'first_metadata_pool' and 'first_data_pool' + # is already in use with 'first_fs' + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', second_fs, first_metadata_pool, first_data_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because metadata and data pool is already in use for 'first_fs'") + + def test_fs_new_metadata_and_data_pool_in_use_by_different_fs(self): + """ + That creating file system with metadata and data pool which is already in use by different fs. + """ + + # create first data pool, metadata pool and add with filesystem + first_fs = "first_fs" + first_metadata_pool = "first_metadata_pool" + first_data_pool = "first_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', first_fs, first_metadata_pool, first_data_pool) + + # create second data pool, metadata pool and add with filesystem + second_fs = "second_fs" + second_metadata_pool = "second_metadata_pool" + second_data_pool = "second_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', second_fs, second_metadata_pool, second_data_pool) + + third_fs = "third_fs" + + # try to create new fs 'third_fs' with following configuration + # metadata pool -> 'first_metadata_pool' + # data pool -> 'second_data_pool' + # Expecting EINVAL exit status because 'first_metadata_pool' and 'second_data_pool' + # is already in use with 'first_fs' and 'second_fs' + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', third_fs, first_metadata_pool, second_data_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because metadata and data pool is already in use for 'first_fs' and 'second_fs'") + + def test_fs_new_interchange_already_in_use_metadata_and_data_pool_of_same_fs(self): + """ + That creating file system with interchanging metadata and data pool which is already in use by same fs. 
+ """ + + # create first data pool, metadata pool and add with filesystem + first_fs = "first_fs" + first_metadata_pool = "first_metadata_pool" + first_data_pool = "first_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', first_fs, first_metadata_pool, first_data_pool) + + second_fs = "second_fs" + + # try to create new fs 'second_fs' with following configuration + # metadata pool -> 'first_data_pool' (already used as data pool for 'first_fs') + # data pool -> 'first_metadata_pool' (already used as metadata pool for 'first_fs') + # Expecting EINVAL exit status because 'first_data_pool' and 'first_metadata_pool' + # is already in use with 'first_fs' + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', second_fs, first_data_pool, first_metadata_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because metadata and data pool is already in use for 'first_fs'") + + def test_fs_new_interchange_already_in_use_metadata_and_data_pool_of_different_fs(self): + """ + That creating file system with interchanging metadata and data pool which is already in use by defferent fs. + """ + + # create first data pool, metadata pool and add with filesystem + first_fs = "first_fs" + first_metadata_pool = "first_metadata_pool" + first_data_pool = "first_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', first_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', first_fs, first_metadata_pool, first_data_pool) + + # create second data pool, metadata pool and add with filesystem + second_fs = "second_fs" + second_metadata_pool = "second_metadata_pool" + second_data_pool = "second_data_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_metadata_pool) + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', second_data_pool) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', second_fs, second_metadata_pool, second_data_pool) + + third_fs = "third_fs" + + # try to create new fs 'third_fs' with following configuration + # metadata pool -> 'first_data_pool' (already used as data pool for 'first_fs') + # data pool -> 'second_metadata_pool' (already used as metadata pool for 'second_fs') + # Expecting EINVAL exit status because 'first_data_pool' and 'second_metadata_pool' + # is already in use with 'first_fs' and 'second_fs' + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', third_fs, first_data_pool, second_metadata_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because metadata and data pool is already in use for 'first_fs' and 'second_fs'") + + def test_fs_new_metadata_pool_already_in_use_with_rbd(self): + """ + That creating new file system with metadata pool already used by rbd. 
+ """ + + # create pool and initialise with rbd + new_pool = "new_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', new_pool) + self.ctx.cluster.run(args=['rbd', 'pool', 'init', new_pool]) + + new_fs = "new_fs" + new_data_pool = "new_data_pool" + + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', new_data_pool) + + # try to create new fs 'new_fs' with following configuration + # metadata pool -> 'new_pool' (already used by rbd app) + # data pool -> 'new_data_pool' + # Expecting EINVAL exit status because 'new_pool' is already in use with 'rbd' app + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', new_fs, new_pool, new_data_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because metadata pool is already in use for rbd") + + def test_fs_new_data_pool_already_in_use_with_rbd(self): + """ + That creating new file system with data pool already used by rbd. + """ + + # create pool and initialise with rbd + new_pool = "new_pool" + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', new_pool) + self.ctx.cluster.run(args=['rbd', 'pool', 'init', new_pool]) + + new_fs = "new_fs" + new_metadata_pool = "new_metadata_pool" + + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', new_metadata_pool) + + # try to create new fs 'new_fs' with following configuration + # metadata pool -> 'new_metadata_pool' + # data pool -> 'new_pool' (already used by rbd app) + # Expecting EINVAL exit status because 'new_pool' is already in use with 'rbd' app + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', new_fs, new_metadata_pool, new_pool) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + self.fail("Expected EINVAL because data pool is already in use for rbd") + +class TestRenameCommand(TestAdminCommands): + """ + Tests for rename command. + """ + + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 2 + + def test_fs_rename(self): + """ + That the file system can be renamed, and the application metadata set on its pools are as expected. + """ + # Renaming the file system breaks this mount as the client uses + # file system specific authorization. The client cannot read + # or write even if the client's cephx ID caps are updated to access + # the new file system name without the client being unmounted and + # re-mounted. + self.mount_a.umount_wait(require_clean=True) + orig_fs_name = self.fs.name + new_fs_name = 'new_cephfs' + client_id = 'test_new_cephfs' + + self.run_cluster_cmd(f'fs rename {orig_fs_name} {new_fs_name} --yes-i-really-mean-it') + + # authorize a cephx ID access to the renamed file system. + # use the ID to write to the file system. 
+ self.fs.name = new_fs_name + keyring = self.fs.authorize(client_id, ('/', 'rw')) + keyring_path = self.mount_a.client_remote.mktemp(data=keyring) + self.mount_a.remount(client_id=client_id, + client_keyring_path=keyring_path, + cephfs_mntpt='/', + cephfs_name=self.fs.name) + filedata, filename = 'some data on fs', 'file_on_fs' + filepath = os_path_join(self.mount_a.hostfs_mntpt, filename) + self.mount_a.write_file(filepath, filedata) + self.check_pool_application_metadata_key_value( + self.fs.get_data_pool_name(), 'cephfs', 'data', new_fs_name) + self.check_pool_application_metadata_key_value( + self.fs.get_metadata_pool_name(), 'cephfs', 'metadata', new_fs_name) + + # cleanup + self.mount_a.umount_wait() + self.run_cluster_cmd(f'auth rm client.{client_id}') + + def test_fs_rename_idempotency(self): + """ + That the file system rename operation is idempotent. + """ + # Renaming the file system breaks this mount as the client uses + # file system specific authorization. + self.mount_a.umount_wait(require_clean=True) + orig_fs_name = self.fs.name + new_fs_name = 'new_cephfs' + + self.run_cluster_cmd(f'fs rename {orig_fs_name} {new_fs_name} --yes-i-really-mean-it') + self.run_cluster_cmd(f'fs rename {orig_fs_name} {new_fs_name} --yes-i-really-mean-it') + + # original file system name does not appear in `fs ls` command + self.assertFalse(self.fs.exists()) + self.fs.name = new_fs_name + self.assertTrue(self.fs.exists()) + + def test_fs_rename_fs_new_fails_with_old_fsname_existing_pools(self): + """ + That after renaming a file system, creating a file system with + old name and existing FS pools fails. + """ + # Renaming the file system breaks this mount as the client uses + # file system specific authorization. + self.mount_a.umount_wait(require_clean=True) + orig_fs_name = self.fs.name + new_fs_name = 'new_cephfs' + data_pool = self.fs.get_data_pool_name() + metadata_pool = self.fs.get_metadata_pool_name() + self.run_cluster_cmd(f'fs rename {orig_fs_name} {new_fs_name} --yes-i-really-mean-it') + + try: + self.run_cluster_cmd(f"fs new {orig_fs_name} {metadata_pool} {data_pool}") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, + "invalid error code on creating a new file system with old " + "name and existing pools.") + else: + self.fail("expected creating new file system with old name and " + "existing pools to fail.") + + try: + self.run_cluster_cmd(f"fs new {orig_fs_name} {metadata_pool} {data_pool} --force") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, + "invalid error code on creating a new file system with old " + "name, existing pools and --force flag.") + else: + self.fail("expected creating new file system with old name, " + "existing pools, and --force flag to fail.") + + try: + self.run_cluster_cmd(f"fs new {orig_fs_name} {metadata_pool} {data_pool} " + "--allow-dangerous-metadata-overlay") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, + "invalid error code on creating a new file system with old name, " + "existing pools and --allow-dangerous-metadata-overlay flag.") + else: + self.fail("expected creating new file system with old name, " + "existing pools, and --allow-dangerous-metadata-overlay flag to fail.") + + def test_fs_rename_fails_without_yes_i_really_mean_it_flag(self): + """ + That renaming a file system without '--yes-i-really-mean-it' flag fails. 
+ """ + try: + self.run_cluster_cmd(f"fs rename {self.fs.name} new_fs") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM, + "invalid error code on renaming a file system without the " + "'--yes-i-really-mean-it' flag") + else: + self.fail("expected renaming of file system without the " + "'--yes-i-really-mean-it' flag to fail ") + + def test_fs_rename_fails_for_non_existent_fs(self): + """ + That renaming a non-existent file system fails. + """ + try: + self.run_cluster_cmd("fs rename non_existent_fs new_fs --yes-i-really-mean-it") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on renaming a non-existent fs") + else: + self.fail("expected renaming of a non-existent file system to fail") + + def test_fs_rename_fails_new_name_already_in_use(self): + """ + That renaming a file system fails if the new name refers to an existing file system. + """ + self.fs2 = self.mds_cluster.newfs(name='cephfs2', create=True) + + try: + self.run_cluster_cmd(f"fs rename {self.fs.name} {self.fs2.name} --yes-i-really-mean-it") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, + "invalid error code on renaming to a fs name that is already in use") + else: + self.fail("expected renaming to a new file system name that is already in use to fail.") + + def test_fs_rename_fails_with_mirroring_enabled(self): + """ + That renaming a file system fails if mirroring is enabled on it. + """ + orig_fs_name = self.fs.name + new_fs_name = 'new_cephfs' + + self.run_cluster_cmd(f'fs mirror enable {orig_fs_name}') + try: + self.run_cluster_cmd(f'fs rename {orig_fs_name} {new_fs_name} --yes-i-really-mean-it') + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM, "invalid error code on renaming a mirrored file system") + else: + self.fail("expected renaming of a mirrored file system to fail") + self.run_cluster_cmd(f'fs mirror disable {orig_fs_name}') + + +class TestDump(CephFSTestCase): + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 1 + + def test_fs_dump_epoch(self): + """ + That dumping a specific epoch works. + """ + + status1 = self.fs.status() + status2 = self.fs.status(epoch=status1["epoch"]-1) + self.assertEqual(status1["epoch"], status2["epoch"]+1) + + def test_fsmap_trim(self): + """ + That the fsmap is trimmed normally. + """ + + paxos_service_trim_min = 25 + self.config_set('mon', 'paxos_service_trim_min', paxos_service_trim_min) + mon_max_mdsmap_epochs = 20 + self.config_set('mon', 'mon_max_mdsmap_epochs', mon_max_mdsmap_epochs) + + status = self.fs.status() + epoch = status["epoch"] + + # for N mutations + mutations = paxos_service_trim_min + mon_max_mdsmap_epochs + b = False + for i in range(mutations): + self.fs.set_joinable(b) + b = not b + + time.sleep(10) # for tick/compaction + + try: + self.fs.status(epoch=epoch) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT, "invalid error code when trying to fetch FSMap that was trimmed") + else: + self.fail("trimming did not occur as expected") + + def test_fsmap_force_trim(self): + """ + That the fsmap is trimmed forcefully. 
+ """ + + status = self.fs.status() + epoch = status["epoch"] + + paxos_service_trim_min = 1 + self.config_set('mon', 'paxos_service_trim_min', paxos_service_trim_min) + mon_mds_force_trim_to = epoch+1 + self.config_set('mon', 'mon_mds_force_trim_to', mon_mds_force_trim_to) + + # force a new fsmap + self.fs.set_joinable(False) + time.sleep(10) # for tick/compaction + + status = self.fs.status() + log.debug(f"new epoch is {status['epoch']}") + self.fs.status(epoch=epoch+1) # epoch+1 is not trimmed, may not == status["epoch"] + + try: + self.fs.status(epoch=epoch) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT, "invalid error code when trying to fetch FSMap that was trimmed") + else: + self.fail("trimming did not occur as expected") + + +class TestRequiredClientFeatures(CephFSTestCase): + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 1 + + def test_required_client_features(self): + """ + That `ceph fs required_client_features` command functions. + """ + + def is_required(index): + out = self.fs.mon_manager.raw_cluster_cmd('fs', 'get', self.fs.name, '--format=json-pretty') + features = json.loads(out)['mdsmap']['required_client_features'] + if "feature_{0}".format(index) in features: + return True; + return False; + + features = json.loads(self.fs.mon_manager.raw_cluster_cmd('fs', 'feature', 'ls', '--format=json-pretty')) + self.assertGreater(len(features), 0); + + for f in features: + self.fs.required_client_features('rm', str(f['index'])) + + for f in features: + index = f['index'] + feature = f['name'] + if feature == 'reserved': + feature = str(index) + + if index % 3 == 0: + continue; + self.fs.required_client_features('add', feature) + self.assertTrue(is_required(index)) + + if index % 2 == 0: + continue; + self.fs.required_client_features('rm', feature) + self.assertFalse(is_required(index)) + + def test_required_client_feature_add_reserved(self): + """ + That `ceph fs required_client_features X add reserved` fails. + """ + + p = self.fs.required_client_features('add', 'reserved', check_status=False, stderr=StringIO()) + self.assertIn('Invalid feature name', p.stderr.getvalue()) + + def test_required_client_feature_rm_reserved(self): + """ + That `ceph fs required_client_features X rm reserved` fails. + """ + + p = self.fs.required_client_features('rm', 'reserved', check_status=False, stderr=StringIO()) + self.assertIn('Invalid feature name', p.stderr.getvalue()) + + def test_required_client_feature_add_reserved_bit(self): + """ + That `ceph fs required_client_features X add <reserved_bit>` passes. + """ + + p = self.fs.required_client_features('add', '1', stderr=StringIO()) + self.assertIn("added feature 'reserved' to required_client_features", p.stderr.getvalue()) + + def test_required_client_feature_rm_reserved_bit(self): + """ + That `ceph fs required_client_features X rm <reserved_bit>` passes. + """ + + self.fs.required_client_features('add', '1') + p = self.fs.required_client_features('rm', '1', stderr=StringIO()) + self.assertIn("removed feature 'reserved' from required_client_features", p.stderr.getvalue()) + +class TestCompatCommands(CephFSTestCase): + """ + """ + + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 3 + + def test_add_compat(self): + """ + Test adding a compat. + """ + + self.fs.fail() + self.fs.add_compat(63, 'placeholder') + mdsmap = self.fs.get_mds_map() + self.assertIn("feature_63", mdsmap['compat']['compat']) + + def test_add_incompat(self): + """ + Test adding an incompat. 
+ """ + + self.fs.fail() + self.fs.add_incompat(63, 'placeholder') + mdsmap = self.fs.get_mds_map() + log.info(f"{mdsmap}") + self.assertIn("feature_63", mdsmap['compat']['incompat']) + + def test_rm_compat(self): + """ + Test removing a compat. + """ + + self.fs.fail() + self.fs.add_compat(63, 'placeholder') + self.fs.rm_compat(63) + mdsmap = self.fs.get_mds_map() + self.assertNotIn("feature_63", mdsmap['compat']['compat']) + + def test_rm_incompat(self): + """ + Test removing an incompat. + """ + + self.fs.fail() + self.fs.add_incompat(63, 'placeholder') + self.fs.rm_incompat(63) + mdsmap = self.fs.get_mds_map() + self.assertNotIn("feature_63", mdsmap['compat']['incompat']) + + def test_standby_compat(self): + """ + That adding a compat does not prevent standbys from joining. + """ + + self.fs.fail() + self.fs.add_compat(63, "placeholder") + self.fs.set_joinable() + self.fs.wait_for_daemons() + mdsmap = self.fs.get_mds_map() + self.assertIn("feature_63", mdsmap['compat']['compat']) + + def test_standby_incompat_reject(self): + """ + That adding an incompat feature prevents incompatible daemons from joining. + """ + + self.fs.fail() + self.fs.add_incompat(63, "placeholder") + self.fs.set_joinable() + try: + self.fs.wait_for_daemons(timeout=60) + except RuntimeError as e: + if "Timed out waiting for MDS daemons to become healthy" in str(e): + pass + else: + raise + else: + self.fail() + + def test_standby_incompat_upgrade(self): + """ + That an MDS can upgrade the compat of a fs. + """ + + self.fs.fail() + self.fs.rm_incompat(1) + self.fs.set_joinable() + self.fs.wait_for_daemons() + mdsmap = self.fs.get_mds_map() + self.assertIn("feature_1", mdsmap['compat']['incompat']) + + def test_standby_replay_not_upgradeable(self): + """ + That the mons will not upgrade the MDSMap compat if standby-replay is + enabled. + """ + + self.fs.fail() + self.fs.rm_incompat(1) + self.fs.set_allow_standby_replay(True) + self.fs.set_joinable() + try: + self.fs.wait_for_daemons(timeout=60) + except RuntimeError as e: + if "Timed out waiting for MDS daemons to become healthy" in str(e): + pass + else: + raise + else: + self.fail() + + def test_standby_incompat_reject_multifs(self): + """ + Like test_standby_incompat_reject but with a second fs. + """ + + fs2 = self.mds_cluster.newfs(name="cephfs2", create=True) + fs2.fail() + fs2.add_incompat(63, 'placeholder') + fs2.set_joinable() + try: + fs2.wait_for_daemons(timeout=60) + except RuntimeError as e: + if "Timed out waiting for MDS daemons to become healthy" in str(e): + pass + else: + raise + else: + self.fail() + # did self.fs lose MDS or standbys suicide? + self.fs.wait_for_daemons() + mdsmap = fs2.get_mds_map() + self.assertIn("feature_63", mdsmap['compat']['incompat']) + +class TestConfigCommands(CephFSTestCase): + """ + Test that daemons and clients respond to the otherwise rarely-used + runtime config modification operations. + """ + + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def test_ceph_config_show(self): + """ + That I can successfully show MDS configuration. 
+ """ + + names = self.fs.get_rank_names() + for n in names: + s = self.fs.mon_manager.raw_cluster_cmd("config", "show", "mds."+n) + self.assertTrue("NAME" in s) + self.assertTrue("mon_host" in s) + + + def test_client_config(self): + """ + That I can successfully issue asok "config set" commands + + :return: + """ + + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Test only applies to FUSE clients") + + test_key = "client_cache_size" + test_val = "123" + self.mount_a.admin_socket(['config', 'set', test_key, test_val]) + out = self.mount_a.admin_socket(['config', 'get', test_key]) + self.assertEqual(out[test_key], test_val) + + + def test_mds_config_asok(self): + test_key = "mds_max_purge_ops" + test_val = "123" + self.fs.mds_asok(['config', 'set', test_key, test_val]) + out = self.fs.mds_asok(['config', 'get', test_key]) + self.assertEqual(out[test_key], test_val) + + def test_mds_dump_cache_asok(self): + cache_file = "cache_file" + timeout = "1" + self.fs.rank_asok(['dump', 'cache', cache_file, timeout]) + + def test_mds_config_tell(self): + test_key = "mds_max_purge_ops" + test_val = "123" + + self.fs.rank_tell(['injectargs', "--{0}={1}".format(test_key, test_val)]) + + # Read it back with asok because there is no `tell` equivalent + out = self.fs.rank_tell(['config', 'get', test_key]) + self.assertEqual(out[test_key], test_val) + + +class TestMirroringCommands(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def _enable_mirroring(self, fs_name): + self.fs.mon_manager.raw_cluster_cmd("fs", "mirror", "enable", fs_name) + + def _disable_mirroring(self, fs_name): + self.fs.mon_manager.raw_cluster_cmd("fs", "mirror", "disable", fs_name) + + def _add_peer(self, fs_name, peer_spec, remote_fs_name): + peer_uuid = str(uuid.uuid4()) + self.fs.mon_manager.raw_cluster_cmd("fs", "mirror", "peer_add", fs_name, peer_uuid, peer_spec, remote_fs_name) + + def _remove_peer(self, fs_name, peer_uuid): + self.fs.mon_manager.raw_cluster_cmd("fs", "mirror", "peer_remove", fs_name, peer_uuid) + + def _verify_mirroring(self, fs_name, flag_str): + status = self.fs.status() + fs_map = status.get_fsmap_byname(fs_name) + if flag_str == 'enabled': + self.assertTrue('mirror_info' in fs_map) + elif flag_str == 'disabled': + self.assertTrue('mirror_info' not in fs_map) + else: + raise RuntimeError(f'invalid flag_str {flag_str}') + + def _get_peer_uuid(self, fs_name, peer_spec): + status = self.fs.status() + fs_map = status.get_fsmap_byname(fs_name) + mirror_info = fs_map.get('mirror_info', None) + self.assertTrue(mirror_info is not None) + for peer_uuid, remote in mirror_info['peers'].items(): + client_name = remote['remote']['client_name'] + cluster_name = remote['remote']['cluster_name'] + spec = f'{client_name}@{cluster_name}' + if spec == peer_spec: + return peer_uuid + return None + + def test_mirroring_command(self): + """basic mirroring command test -- enable, disable mirroring on a + filesystem""" + self._enable_mirroring(self.fs.name) + self._verify_mirroring(self.fs.name, "enabled") + self._disable_mirroring(self.fs.name) + self._verify_mirroring(self.fs.name, "disabled") + + def test_mirroring_peer_commands(self): + """test adding and removing peers to a mirror enabled filesystem""" + self._enable_mirroring(self.fs.name) + self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b") + self._add_peer(self.fs.name, "client.site-c@site-c", "fs_c") + self._verify_mirroring(self.fs.name, "enabled") + uuid_peer_b = self._get_peer_uuid(self.fs.name, "client.site-b@site-b") + uuid_peer_c 
= self._get_peer_uuid(self.fs.name, "client.site-c@site-c") + self.assertTrue(uuid_peer_b is not None) + self.assertTrue(uuid_peer_c is not None) + self._remove_peer(self.fs.name, uuid_peer_b) + self._remove_peer(self.fs.name, uuid_peer_c) + self._disable_mirroring(self.fs.name) + self._verify_mirroring(self.fs.name, "disabled") + + def test_mirroring_command_idempotency(self): + """test to check idempotency of mirroring family of commands """ + self._enable_mirroring(self.fs.name) + self._verify_mirroring(self.fs.name, "enabled") + self._enable_mirroring(self.fs.name) + # add peer + self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b") + uuid_peer_b1 = self._get_peer_uuid(self.fs.name, "client.site-b@site-b") + self.assertTrue(uuid_peer_b1 is not None) + # adding the peer again should be idempotent + self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b") + uuid_peer_b2 = self._get_peer_uuid(self.fs.name, "client.site-b@site-b") + self.assertTrue(uuid_peer_b2 is not None) + self.assertTrue(uuid_peer_b1 == uuid_peer_b2) + # remove peer + self._remove_peer(self.fs.name, uuid_peer_b1) + uuid_peer_b3 = self._get_peer_uuid(self.fs.name, "client.site-b@site-b") + self.assertTrue(uuid_peer_b3 is None) + # removing the peer again should be idempotent + self._remove_peer(self.fs.name, uuid_peer_b1) + self._disable_mirroring(self.fs.name) + self._verify_mirroring(self.fs.name, "disabled") + self._disable_mirroring(self.fs.name) + + def test_mirroring_disable_with_peers(self): + """test disabling mirroring for a filesystem with active peers""" + self._enable_mirroring(self.fs.name) + self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b") + self._verify_mirroring(self.fs.name, "enabled") + uuid_peer_b = self._get_peer_uuid(self.fs.name, "client.site-b@site-b") + self.assertTrue(uuid_peer_b is not None) + self._disable_mirroring(self.fs.name) + self._verify_mirroring(self.fs.name, "disabled") + # enable mirroring to check old peers + self._enable_mirroring(self.fs.name) + self._verify_mirroring(self.fs.name, "enabled") + # peer should be gone + uuid_peer_b = self._get_peer_uuid(self.fs.name, "client.site-b@site-b") + self.assertTrue(uuid_peer_b is None) + self._disable_mirroring(self.fs.name) + self._verify_mirroring(self.fs.name, "disabled") + + def test_mirroring_with_filesystem_reset(self): + """test to verify mirroring state post filesystem reset""" + self._enable_mirroring(self.fs.name) + self._add_peer(self.fs.name, "client.site-b@site-b", "fs_b") + self._verify_mirroring(self.fs.name, "enabled") + uuid_peer_b = self._get_peer_uuid(self.fs.name, "client.site-b@site-b") + self.assertTrue(uuid_peer_b is not None) + # reset filesystem + self.fs.fail() + self.fs.reset() + self.fs.wait_for_daemons() + self._verify_mirroring(self.fs.name, "disabled") + + +class TestFsAuthorize(CephFSTestCase): + client_id = 'testuser' + client_name = 'client.' 
+ client_id + + def test_single_path_r(self): + PERM = 'r' + FS_AUTH_CAPS = (('/', PERM),) + self.captester = CapTester() + self.setup_test_env(FS_AUTH_CAPS) + + self.captester.run_mon_cap_tests(self.fs, self.client_id) + self.captester.run_mds_cap_tests(PERM) + + def test_single_path_rw(self): + PERM = 'rw' + FS_AUTH_CAPS = (('/', PERM),) + self.captester = CapTester() + self.setup_test_env(FS_AUTH_CAPS) + + self.captester.run_mon_cap_tests(self.fs, self.client_id) + self.captester.run_mds_cap_tests(PERM) + + def test_single_path_rootsquash(self): + PERM = 'rw' + FS_AUTH_CAPS = (('/', PERM, 'root_squash'),) + self.captester = CapTester() + self.setup_test_env(FS_AUTH_CAPS) + + # testing MDS caps... + # Since root_squash is set in client caps, client can read but not + # write even thought access level is set to "rw". + self.captester.conduct_pos_test_for_read_caps() + self.captester.conduct_neg_test_for_write_caps(sudo_write=True) + + def test_single_path_authorize_on_nonalphanumeric_fsname(self): + """ + That fs authorize command works on filesystems with names having [_.-] + characters + """ + self.mount_a.umount_wait(require_clean=True) + self.mds_cluster.delete_all_filesystems() + fs_name = "cephfs-_." + self.fs = self.mds_cluster.newfs(name=fs_name) + self.fs.wait_for_daemons() + self.run_cluster_cmd(f'auth caps client.{self.mount_a.client_id} ' + f'mon "allow r" ' + f'osd "allow rw pool={self.fs.get_data_pool_name()}" ' + f'mds allow') + self.mount_a.remount(cephfs_name=self.fs.name) + PERM = 'rw' + FS_AUTH_CAPS = (('/', PERM),) + self.captester = CapTester() + self.setup_test_env(FS_AUTH_CAPS) + self.captester.run_mds_cap_tests(PERM) + + def test_multiple_path_r(self): + PERM = 'r' + FS_AUTH_CAPS = (('/dir1/dir12', PERM), ('/dir2/dir22', PERM)) + for c in FS_AUTH_CAPS: + self.mount_a.run_shell(f'mkdir -p .{c[0]}') + self.captesters = (CapTester(), CapTester()) + self.setup_test_env(FS_AUTH_CAPS) + + self.run_cap_test_one_by_one(FS_AUTH_CAPS) + + def test_multiple_path_rw(self): + PERM = 'rw' + FS_AUTH_CAPS = (('/dir1/dir12', PERM), ('/dir2/dir22', PERM)) + for c in FS_AUTH_CAPS: + self.mount_a.run_shell(f'mkdir -p .{c[0]}') + self.captesters = (CapTester(), CapTester()) + self.setup_test_env(FS_AUTH_CAPS) + + self.run_cap_test_one_by_one(FS_AUTH_CAPS) + + def run_cap_test_one_by_one(self, fs_auth_caps): + keyring = self.run_cluster_cmd(f'auth get {self.client_name}') + for i, c in enumerate(fs_auth_caps): + self.assertIn(i, (0, 1)) + PATH = c[0] + PERM = c[1] + self._remount(keyring, PATH) + # actual tests... 
+ self.captesters[i].run_mon_cap_tests(self.fs, self.client_id) + self.captesters[i].run_mds_cap_tests(PERM, PATH) + + def tearDown(self): + self.mount_a.umount_wait() + self.run_cluster_cmd(f'auth rm {self.client_name}') + + super(type(self), self).tearDown() + + def _remount(self, keyring, path='/'): + keyring_path = self.mount_a.client_remote.mktemp(data=keyring) + self.mount_a.remount(client_id=self.client_id, + client_keyring_path=keyring_path, + cephfs_mntpt=path) + + def setup_for_single_path(self, fs_auth_caps): + self.captester.write_test_files((self.mount_a,), '/') + keyring = self.fs.authorize(self.client_id, fs_auth_caps) + self._remount(keyring) + + def setup_for_multiple_paths(self, fs_auth_caps): + for i, c in enumerate(fs_auth_caps): + PATH = c[0] + self.captesters[i].write_test_files((self.mount_a,), PATH) + + self.fs.authorize(self.client_id, fs_auth_caps) + + def setup_test_env(self, fs_auth_caps): + if len(fs_auth_caps) == 1: + self.setup_for_single_path(fs_auth_caps[0]) + else: + self.setup_for_multiple_paths(fs_auth_caps) + + +class TestAdminCommandIdempotency(CephFSTestCase): + """ + Tests for administration command idempotency. + """ + + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 1 + + def test_rm_idempotency(self): + """ + That a removing a fs twice is idempotent. + """ + + data_pools = self.fs.get_data_pool_names(refresh=True) + self.fs.fail() + self.fs.rm() + try: + self.fs.get_mds_map() + except FSMissing: + pass + else: + self.fail("get_mds_map should raise") + p = self.fs.rm() + self.assertIn("does not exist", p.stderr.getvalue()) + self.fs.remove_pools(data_pools) + + +class TestAdminCommandDumpTree(CephFSTestCase): + """ + Tests for administration command subtrees. + """ + + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 1 + + def test_dump_subtrees(self): + """ + Dump all the subtrees to make sure the MDS daemon won't crash. + """ + + subtrees = self.fs.mds_asok(['get', 'subtrees']) + log.info(f"dumping {len(subtrees)} subtrees:") + for subtree in subtrees: + log.info(f" subtree: '{subtree['dir']['path']}'") + self.fs.mds_asok(['dump', 'tree', subtree['dir']['path']]) + + log.info("dumping 2 special subtrees:") + log.info(" subtree: '/'") + self.fs.mds_asok(['dump', 'tree', '/']) + log.info(" subtree: '~mdsdir'") + self.fs.mds_asok(['dump', 'tree', '~mdsdir']) + +class TestAdminCommandDumpLoads(CephFSTestCase): + """ + Tests for administration command dump loads. + """ + + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 1 + + def test_dump_loads(self): + """ + make sure depth limit param is considered when dump loads for a MDS daemon. + """ + + log.info("dumping loads") + loads = self.fs.mds_asok(['dump', 'loads', '1']) + self.assertIsNotNone(loads) + self.assertIn("dirfrags", loads) + for d in loads["dirfrags"]: + self.assertLessEqual(d["path"].count("/"), 1) + +class TestFsBalRankMask(CephFSTestCase): + """ + Tests ceph fs set <fs_name> bal_rank_mask + """ + + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 2 + + def test_bal_rank_mask(self): + """ + check whether a specified bal_rank_mask value is valid or not. 
+ """ + bal_rank_mask = '0x0' + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + bal_rank_mask = '0' + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + bal_rank_mask = '-1' + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + bal_rank_mask = 'all' + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + bal_rank_mask = '0x1' + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + bal_rank_mask = '1' + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + bal_rank_mask = 'f0' + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + bal_rank_mask = 'ab' + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + bal_rank_mask = '0xfff0' + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + MAX_MDS = 256 + bal_rank_mask = '0x' + 'f' * int(MAX_MDS / 4) + log.info(f"set bal_rank_mask {bal_rank_mask}") + self.fs.set_bal_rank_mask(bal_rank_mask) + self.assertEqual(bal_rank_mask, self.fs.get_var('bal_rank_mask')) + + bal_rank_mask = '' + log.info("set bal_rank_mask to empty string") + try: + self.fs.set_bal_rank_mask(bal_rank_mask) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + + bal_rank_mask = '0x1' + 'f' * int(MAX_MDS / 4) + log.info(f"set bal_rank_mask {bal_rank_mask}") + try: + self.fs.set_bal_rank_mask(bal_rank_mask) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) diff --git a/qa/tasks/cephfs/test_auto_repair.py b/qa/tasks/cephfs/test_auto_repair.py new file mode 100644 index 000000000..e6f0a8f0b --- /dev/null +++ b/qa/tasks/cephfs/test_auto_repair.py @@ -0,0 +1,88 @@ + +""" +Exercise the MDS's auto repair functions +""" + +import logging +import time + +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase + + +log = logging.getLogger(__name__) + + +# Arbitrary timeouts for operations involving restarting +# an MDS or waiting for it to come up +MDS_RESTART_GRACE = 60 + + +class TestMDSAutoRepair(CephFSTestCase): + def test_backtrace_repair(self): + """ + MDS should verify/fix backtrace on fetch dirfrag + """ + + self.mount_a.run_shell(["mkdir", "testdir1"]) + self.mount_a.run_shell(["touch", "testdir1/testfile"]) + dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino("testdir1")) + + # drop inodes caps + self.mount_a.umount_wait() + + # flush journal entries to dirfrag objects, and expire journal + self.fs.mds_asok(['flush', 'journal']) + + # Restart the MDS to drop the metadata cache (because we expired the journal, + # nothing gets replayed into cache on restart) + self.fs.rank_fail() + 
self.fs.wait_for_daemons()
+
+ # remove testdir1's backtrace
+ self.fs.radosm(["rmxattr", dir_objname, "parent"])
+
+ # readdir (fetch dirfrag) should fix testdir1's backtrace
+ self.mount_a.mount_wait()
+ self.mount_a.run_shell(["ls", "testdir1"])
+
+ # flush journal entries to dirfrag objects
+ self.fs.mds_asok(['flush', 'journal'])
+
+ # check if backtrace exists
+ self.fs.radosm(["getxattr", dir_objname, "parent"])
+
+ def test_mds_readonly(self):
+ """
+ test that the MDS behaves correctly when it's readonly
+ """
+ # operation should succeed when the MDS is not readonly
+ self.mount_a.run_shell(["touch", "test_file1"])
+ writer = self.mount_a.write_background(loop=True)
+
+ time.sleep(10)
+ self.assertFalse(writer.finished)
+
+ # force MDS to read-only mode
+ self.fs.mds_asok(['force_readonly'])
+ time.sleep(10)
+
+ # touching test file should fail
+ try:
+ self.mount_a.run_shell(["touch", "test_file1"])
+ except CommandFailedError:
+ pass
+ else:
+ self.assertTrue(False)
+
+ # the background writer should also fail
+ self.assertTrue(writer.finished)
+
+ # The MDS should report its readonly health state to the mon
+ self.wait_for_health("MDS_READ_ONLY", timeout=30)
+
+ # restart mds to make it writable
+ self.fs.mds_fail_restart()
+ self.fs.wait_for_daemons()
+
+ self.wait_for_health_clear(timeout=30)
diff --git a/qa/tasks/cephfs/test_backtrace.py b/qa/tasks/cephfs/test_backtrace.py
new file mode 100644
index 000000000..6b094569b
--- /dev/null
+++ b/qa/tasks/cephfs/test_backtrace.py
@@ -0,0 +1,102 @@
+
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+from tasks.cephfs.filesystem import ObjectNotFound
+
+class TestBacktrace(CephFSTestCase):
+ def test_backtrace(self):
+ """
+ That the 'parent', 'layout' and 'symlink' xattrs on the head objects of files
+ are updated correctly. 
+ """ + + old_data_pool_name = self.fs.get_data_pool_name() + old_pool_id = self.fs.get_data_pool_id() + + # Not enabling symlink recovery option should not store symlink xattr + self.config_set('mds', 'mds_symlink_recovery', 'false') + self.mount_a.run_shell(["mkdir", "sym_dir0"]) + self.mount_a.run_shell(["touch", "sym_dir0/file1"]) + self.mount_a.run_shell(["ln", "-s", "sym_dir0/file1", "sym_dir0/symlink_file1"]) + file_ino = self.mount_a.path_to_ino("sym_dir0/symlink_file1", follow_symlinks=False) + + self.fs.mds_asok(["flush", "journal"]) + with self.assertRaises(ObjectNotFound): + self.fs.read_symlink(file_ino) + + # Enabling symlink recovery option should store symlink xattr for symlinks + self.config_set('mds', 'mds_symlink_recovery', 'true') + self.mount_a.run_shell(["mkdir", "sym_dir"]) + self.mount_a.run_shell(["touch", "sym_dir/file1"]) + self.mount_a.run_shell(["ln", "-s", "./file1", "sym_dir/symlink_file1"]) + file_ino = self.mount_a.path_to_ino("sym_dir/symlink_file1", follow_symlinks=False) + + self.fs.mds_asok(["flush", "journal"]) + symlink = self.fs.read_symlink(file_ino) + self.assertEqual(symlink, { + "s" : "./file1", + }) + + # Create a file for subsequent checks + self.mount_a.run_shell(["mkdir", "parent_a"]) + self.mount_a.run_shell(["touch", "parent_a/alpha"]) + file_ino = self.mount_a.path_to_ino("parent_a/alpha") + + # That backtrace and layout are written after initial flush + self.fs.mds_asok(["flush", "journal"]) + backtrace = self.fs.read_backtrace(file_ino) + self.assertEqual(['alpha', 'parent_a'], [a['dname'] for a in backtrace['ancestors']]) + layout = self.fs.read_layout(file_ino) + self.assertDictEqual(layout, { + "stripe_unit": 4194304, + "stripe_count": 1, + "object_size": 4194304, + "pool_id": old_pool_id, + "pool_ns": "", + }) + self.assertEqual(backtrace['pool'], old_pool_id) + + # That backtrace is written after parentage changes + self.mount_a.run_shell(["mkdir", "parent_b"]) + self.mount_a.run_shell(["mv", "parent_a/alpha", "parent_b/alpha"]) + + self.fs.mds_asok(["flush", "journal"]) + backtrace = self.fs.read_backtrace(file_ino) + self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace['ancestors']]) + + # Create a new data pool + new_pool_name = "data_new" + new_pool_id = self.fs.add_data_pool(new_pool_name) + + # That an object which has switched pools gets its backtrace updated + self.mount_a.setfattr("./parent_b/alpha", + "ceph.file.layout.pool", new_pool_name) + self.fs.mds_asok(["flush", "journal"]) + backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name) + self.assertEqual(backtrace_old_pool['pool'], new_pool_id) + backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name) + self.assertEqual(backtrace_new_pool['pool'], new_pool_id) + new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name) + self.assertEqual(new_pool_layout['pool_id'], new_pool_id) + self.assertEqual(new_pool_layout['pool_ns'], '') + + # That subsequent linkage changes are only written to new pool backtrace + self.mount_a.run_shell(["mkdir", "parent_c"]) + self.mount_a.run_shell(["mv", "parent_b/alpha", "parent_c/alpha"]) + self.fs.mds_asok(["flush", "journal"]) + backtrace_old_pool = self.fs.read_backtrace(file_ino, pool=old_data_pool_name) + self.assertEqual(['alpha', 'parent_b'], [a['dname'] for a in backtrace_old_pool['ancestors']]) + backtrace_new_pool = self.fs.read_backtrace(file_ino, pool=new_pool_name) + self.assertEqual(['alpha', 'parent_c'], [a['dname'] for a in backtrace_new_pool['ancestors']]) 
+ + # That layout is written to new pool after change to other field in layout + self.mount_a.setfattr("./parent_c/alpha", + "ceph.file.layout.object_size", "8388608") + + self.fs.mds_asok(["flush", "journal"]) + new_pool_layout = self.fs.read_layout(file_ino, pool=new_pool_name) + self.assertEqual(new_pool_layout['object_size'], 8388608) + + # ...but not to the old pool: the old pool's backtrace points to the new pool, and that's enough, + # we don't update the layout in all the old pools whenever it changes + old_pool_layout = self.fs.read_layout(file_ino, pool=old_data_pool_name) + self.assertEqual(old_pool_layout['object_size'], 4194304) diff --git a/qa/tasks/cephfs/test_cap_flush.py b/qa/tasks/cephfs/test_cap_flush.py new file mode 100644 index 000000000..70fdc3893 --- /dev/null +++ b/qa/tasks/cephfs/test_cap_flush.py @@ -0,0 +1,58 @@ + +import os +import time +from textwrap import dedent +from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology + +class TestCapFlush(CephFSTestCase): + @for_teuthology + def test_replay_create(self): + """ + MDS starts to handle client caps when it enters clientreplay stage. + When handling a client cap in clientreplay stage, it's possible that + corresponding inode does not exist because the client request which + creates inode hasn't been replayed. + """ + + dir_path = os.path.join(self.mount_a.mountpoint, "testdir") + py_script = dedent(""" + import os + os.mkdir("{0}") + fd = os.open("{0}", os.O_RDONLY) + os.fchmod(fd, 0o777) + os.fsync(fd) + """).format(dir_path) + self.mount_a.run_python(py_script) + + self.fs.mds_asok(["flush", "journal"]) + + # client will only get unsafe replay + self.fs.mds_asok(["config", "set", "mds_log_pause", "1"]) + + file_name = "testfile" + file_path = dir_path + "/" + file_name + + # Create a file and modify its mode. ceph-fuse will mark Ax cap dirty + py_script = dedent(""" + import os + os.chdir("{0}") + os.setgid(65534) + os.setuid(65534) + fd = os.open("{1}", os.O_CREAT | os.O_RDWR, 0o644) + os.fchmod(fd, 0o640) + """).format(dir_path, file_name) + self.mount_a.run_python(py_script, sudo=True) + + # Modify file mode by different user. ceph-fuse will send a setattr request + self.mount_a.run_shell(["sudo", "chmod", "600", file_path], wait=False, omit_sudo=False) + + time.sleep(10) + + # Restart mds. Client will re-send the unsafe request and cap flush + self.fs.rank_fail() + self.fs.wait_for_daemons() + + mode = self.mount_a.run_shell(['stat', '-c' '%a', file_path]).stdout.getvalue().strip() + # If the cap flush get dropped, mode should be 0644. + # (Ax cap stays in dirty state, which prevents setattr reply from updating file mode) + self.assertEqual(mode, "600") diff --git a/qa/tasks/cephfs/test_cephfs_shell.py b/qa/tasks/cephfs/test_cephfs_shell.py new file mode 100644 index 000000000..9f7434762 --- /dev/null +++ b/qa/tasks/cephfs/test_cephfs_shell.py @@ -0,0 +1,1167 @@ +""" +NOTE: For running this tests locally (using vstart_runner.py), export the +path to src/tools/cephfs/shell/cephfs-shell module to $PATH. Running +"export PATH=$PATH:$(cd ../src/tools/cephfs/shell && pwd)" from the build dir +will update the environment without hassles of typing the path correctly. 
+""" +from io import StringIO +from os import path +import crypt +import logging +from tempfile import mkstemp as tempfile_mkstemp +import math +from time import sleep +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.exceptions import CommandFailedError +from textwrap import dedent + +log = logging.getLogger(__name__) + + +def humansize(nbytes): + suffixes = ['B', 'K', 'M', 'G', 'T', 'P'] + i = 0 + while nbytes >= 1024 and i < len(suffixes) - 1: + nbytes /= 1024. + i += 1 + nbytes = math.ceil(nbytes) + f = ('%d' % nbytes).rstrip('.') + return '%s%s' % (f, suffixes[i]) + + +def ensure_str(s): + if isinstance(s, str): + return s + if isinstance(s, bytes): + return s.decode() + raise TypeError("not expecting type '%s'" % type(s)) + + +class TestCephFSShell(CephFSTestCase): + CLIENTS_REQUIRED = 1 + + def setUp(self): + super(TestCephFSShell, self).setUp() + + conf_contents = "[cephfs-shell]\ncolors = False\ndebug = True\n" + confpath = self.mount_a.client_remote.sh('mktemp').strip() + self.mount_a.client_remote.write_file(confpath, conf_contents) + self.default_shell_conf_path = confpath + + def run_cephfs_shell_cmd(self, cmd, mount_x=None, shell_conf_path=None, + opts=None, stdout=None, stderr=None, stdin=None, + check_status=True): + stdout = stdout or StringIO() + stderr = stderr or StringIO() + if mount_x is None: + mount_x = self.mount_a + if isinstance(cmd, list): + cmd = " ".join(cmd) + if not shell_conf_path: + shell_conf_path = self.default_shell_conf_path + + args = ["cephfs-shell", "-c", shell_conf_path] + if opts: + args += opts + args.extend(("--", cmd)) + + log.info("Running command: {}".format(" ".join(args))) + return mount_x.client_remote.run(args=args, stdout=stdout, + stderr=stderr, stdin=stdin, + check_status=check_status) + + def negtest_cephfs_shell_cmd(self, **kwargs): + """ + This method verifies that cephfs shell command fails with expected + return value and/or error message. + + kwargs is expected to hold the arguments same as + run_cephfs_shell_cmd() with the following exceptions - + * It should not contain check_status (since commands are expected + to fail, check_status is hardcoded to False). + * It is optional to set expected error message and return value to + dict members 'errmsg' and 'retval' respectively. 
+
+ This method serves as shorthand for code blocks like -
+
+ try:
+ proc = self.run_cephfs_shell_cmd(cmd=['some', 'cmd'],
+ check_status=False,
+ stdout=stdout)
+ except CommandFailedError as e:
+ self.assertNotIn('some error message',
+ proc.stderr.getvalue().lower())
+
+
+ try:
+ proc = self.run_cephfs_shell_cmd(cmd=['some', 'cmd'],
+ check_status=False,
+ stdout=stdout)
+ except CommandFailedError as e:
+ self.assertNotEqual(1, proc.returncode)
+ """
+ retval = kwargs.pop('retval', None)
+ errmsg = kwargs.pop('errmsg', None)
+ kwargs['check_status'] = False
+
+ proc = self.run_cephfs_shell_cmd(**kwargs)
+ if retval:
+ self.assertEqual(proc.returncode, retval)
+ else:
+ self.assertNotEqual(proc.returncode, 0)
+ if errmsg:
+ self.assertIn(errmsg, proc.stderr.getvalue().lower())
+
+ return proc
+
+ def get_cephfs_shell_cmd_output(self, cmd, mount_x=None,
+ shell_conf_path=None, opts=None,
+ stdout=None, stdin=None,
+ check_status=True):
+ return ensure_str(self.run_cephfs_shell_cmd(
+ cmd=cmd, mount_x=mount_x, shell_conf_path=shell_conf_path,
+ opts=opts, stdout=stdout, stdin=stdin,
+ check_status=check_status).stdout.getvalue().strip())
+
+ def get_cephfs_shell_cmd_error(self, cmd, mount_x=None,
+ shell_conf_path=None, opts=None,
+ stderr=None, stdin=None, check_status=True):
+ return ensure_str(self.run_cephfs_shell_cmd(
+ cmd=cmd, mount_x=mount_x, shell_conf_path=shell_conf_path,
+ opts=opts, stderr=stderr, stdin=stdin,
+ check_status=check_status).stderr.getvalue().strip())
+
+ def run_cephfs_shell_script(self, script, mount_x=None,
+ shell_conf_path=None, opts=None, stdout=None,
+ stderr=None, stdin=None, check_status=True):
+ stdout = stdout or StringIO()
+ stderr = stderr or StringIO()
+ if mount_x is None:
+ mount_x = self.mount_a
+
+ scriptpath = tempfile_mkstemp(prefix='test-cephfs', text=True)[1]
+ with open(scriptpath, 'w') as scriptfile:
+ scriptfile.write(script)
+ # copy script to the machine running cephfs-shell. 
+ mount_x.client_remote.put_file(scriptpath, scriptpath)
+ mount_x.run_shell_payload(f"chmod 755 {scriptpath}")
+
+ args = ["cephfs-shell", '-b', scriptpath]
+ if shell_conf_path:
+ args[1:1] = ["-c", shell_conf_path]
+ log.info('Running script \"' + scriptpath + '\"')
+ return mount_x.client_remote.run(args=args, stdout=stdout,
+ stderr=stderr, stdin=stdin,
+ check_status=True)
+
+ def get_cephfs_shell_script_output(self, script, mount_x=None,
+ shell_conf_path=None, opts=None,
+ stdout=None, stdin=None,
+ check_status=True):
+ return ensure_str(self.run_cephfs_shell_script(
+ script=script, mount_x=mount_x, shell_conf_path=shell_conf_path,
+ opts=opts, stdout=stdout, stdin=stdin,
+ check_status=check_status).stdout.getvalue().strip())
+
+
+class TestGeneric(TestCephFSShell):
+
+ def test_mistyped_cmd(self):
+ with self.assertRaises(CommandFailedError) as cm:
+ self.run_cephfs_shell_cmd('lsx')
+ self.assertEqual(cm.exception.exitstatus, 127)
+
+
+class TestMkdir(TestCephFSShell):
+ def test_mkdir(self):
+ """
+ Test that mkdir creates a directory
+ """
+ o = self.get_cephfs_shell_cmd_output("mkdir d1")
+ log.info("cephfs-shell output:\n{}".format(o))
+
+ o = self.mount_a.stat('d1')
+ log.info("mount_a output:\n{}".format(o))
+
+ def test_mkdir_with_070000_octal_mode(self):
+ """
+ Test that mkdir fails with octal mode greater than 07777
+ """
+ self.negtest_cephfs_shell_cmd(cmd="mkdir -m 070000 d2")
+ try:
+ self.mount_a.stat('d2')
+ except CommandFailedError:
+ pass
+
+ def test_mkdir_with_negative_octal_mode(self):
+ """
+ Test that mkdir fails with negative octal mode
+ """
+ self.negtest_cephfs_shell_cmd(cmd="mkdir -m -0755 d3")
+ try:
+ self.mount_a.stat('d3')
+ except CommandFailedError:
+ pass
+
+ def test_mkdir_with_non_octal_mode(self):
+ """
+ Test that mkdir passes with non-octal mode
+ """
+ o = self.get_cephfs_shell_cmd_output("mkdir -m u=rwx d4")
+ log.info("cephfs-shell output:\n{}".format(o))
+
+ # mkdir d4 should pass
+ o = self.mount_a.stat('d4')
+ assert ((o['st_mode'] & 0o700) == 0o700)
+
+ def test_mkdir_with_bad_non_octal_mode(self):
+ """
+ Test that mkdir fails with bad non-octal mode
+ """
+ self.negtest_cephfs_shell_cmd(cmd="mkdir -m ugx=0755 d5")
+ try:
+ self.mount_a.stat('d5')
+ except CommandFailedError:
+ pass
+
+ def test_mkdir_path_without_path_option(self):
+ """
+ Test that mkdir fails without path option for creating path
+ """
+ self.negtest_cephfs_shell_cmd(cmd="mkdir d5/d6/d7")
+ try:
+ self.mount_a.stat('d5/d6/d7')
+ except CommandFailedError:
+ pass
+
+ def test_mkdir_path_with_path_option(self):
+ """
+ Test that mkdir passes with path option for creating path
+ """
+ o = self.get_cephfs_shell_cmd_output("mkdir -p d5/d6/d7")
+ log.info("cephfs-shell output:\n{}".format(o))
+
+ # mkdir d5/d6/d7 should pass
+ o = self.mount_a.stat('d5/d6/d7')
+ log.info("mount_a output:\n{}".format(o))
+
+
+class TestRmdir(TestCephFSShell):
+ dir_name = "test_dir"
+
+ def dir_does_not_exists(self):
+ """
+ Tests that the directory does not exist
+ """
+ try:
+ self.mount_a.stat(self.dir_name)
+ except CommandFailedError as e:
+ if e.exitstatus == 2:
+ return 0
+ raise
+
+ def test_rmdir(self):
+ """
+ Test that rmdir deletes the directory
+ """
+ self.run_cephfs_shell_cmd("mkdir " + self.dir_name)
+ self.run_cephfs_shell_cmd("rmdir " + self.dir_name)
+ self.dir_does_not_exists()
+
+ def test_rmdir_non_existing_dir(self):
+ """
+ Test that rmdir does not delete a non-existing directory
+ """
+ self.negtest_cephfs_shell_cmd(cmd="rmdir test_dir")
+ self.dir_does_not_exists()
+ 
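+ # (Editor's sketch, illustrative only and not part of the upstream suite:
+ # the negative tests in this class lean on negtest_cephfs_shell_cmd() from
+ # TestCephFSShell above, which hides the try/except CommandFailedError
+ # boilerplate. A hypothetical call asserting both a return code and a
+ # stderr fragment would look like
+ # self.negtest_cephfs_shell_cmd(cmd="rmdir no_such_dir",
+ # retval=2, errmsg="no such")
+ # where retval=2 and errmsg="no such" are made-up values for illustration.)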
+ def test_rmdir_dir_with_file(self): + """ + Test that rmdir does not delete directory containing file + """ + self.run_cephfs_shell_cmd("mkdir " + self.dir_name) + + self.run_cephfs_shell_cmd("put - test_dir/dumpfile", stdin="Valid File") + # see comment below + # with self.assertRaises(CommandFailedError) as cm: + with self.assertRaises(CommandFailedError): + self.run_cephfs_shell_cmd("rmdir " + self.dir_name) + # TODO: we need to check for exit code and error message as well. + # skipping it for not since error codes used by cephfs-shell are not + # standard and they may change soon. + # self.assertEqual(cm.exception.exitcode, 39) + self.mount_a.stat(self.dir_name) + + def test_rmdir_existing_file(self): + """ + Test that rmdir does not delete a file + """ + self.run_cephfs_shell_cmd("put - dumpfile", stdin="Valid File") + self.negtest_cephfs_shell_cmd(cmd="rmdir dumpfile") + self.mount_a.stat("dumpfile") + + def test_rmdir_p(self): + """ + Test that rmdir -p deletes all empty directories in the root + directory passed + """ + self.run_cephfs_shell_cmd("mkdir -p test_dir/t1/t2/t3") + self.run_cephfs_shell_cmd("rmdir -p " + self.dir_name) + self.dir_does_not_exists() + + def test_rmdir_p_valid_path(self): + """ + Test that rmdir -p deletes all empty directories in the path passed + """ + self.run_cephfs_shell_cmd("mkdir -p test_dir/t1/t2/t3") + self.run_cephfs_shell_cmd("rmdir -p test_dir/t1/t2/t3") + self.dir_does_not_exists() + + def test_rmdir_p_non_existing_dir(self): + """ + Test that rmdir -p does not delete an invalid directory + """ + self.negtest_cephfs_shell_cmd(cmd="rmdir -p test_dir") + self.dir_does_not_exists() + + def test_rmdir_p_dir_with_file(self): + """ + Test that rmdir -p does not delete the directory containing a file + """ + self.run_cephfs_shell_cmd("mkdir " + self.dir_name) + self.run_cephfs_shell_cmd("put - test_dir/dumpfile", + stdin="Valid File") + self.run_cephfs_shell_cmd("rmdir -p " + self.dir_name) + self.mount_a.stat(self.dir_name) + + +class TestLn(TestCephFSShell): + dir1 = 'test_dir1' + dir2 = 'test_dir2' + dump_id = 11 + s = 'somedata' + dump_file = 'dump11' + + def test_soft_link_without_link_name(self): + self.run_cephfs_shell_cmd(f'mkdir -p {self.dir1}/{self.dir2}') + self.mount_a.write_file(path=f'{self.dir1}/{self.dump_file}', + data=self.s) + self.run_cephfs_shell_script(script=dedent(f''' + cd /{self.dir1}/{self.dir2} + ln -s ../{self.dump_file}''')) + o = self.get_cephfs_shell_cmd_output(f'cat /{self.dir1}/{self.dir2}' + f'/{self.dump_file}') + self.assertEqual(self.s, o) + + def test_soft_link_with_link_name(self): + self.run_cephfs_shell_cmd(f'mkdir -p {self.dir1}/{self.dir2}') + self.mount_a.write_file(path=f'{self.dir1}/{self.dump_file}', + data=self.s) + self.run_cephfs_shell_cmd(f'ln -s /{self.dir1}/{self.dump_file} ' + f'/{self.dir1}/{self.dir2}/') + o = self.get_cephfs_shell_cmd_output(f'cat /{self.dir1}/{self.dir2}' + f'/{self.dump_file}') + self.assertEqual(self.s, o) + + def test_hard_link_without_link_name(self): + self.run_cephfs_shell_cmd(f'mkdir -p {self.dir1}/{self.dir2}') + self.mount_a.write_file(path=f'{self.dir1}/{self.dump_file}', + data=self.s) + self.run_cephfs_shell_script(script=dedent(f''' + cd /{self.dir1}/{self.dir2} + ln ../{self.dump_file}''')) + o = self.get_cephfs_shell_cmd_output(f'cat /{self.dir1}/{self.dir2}' + f'/{self.dump_file}') + self.assertEqual(self.s, o) + + def test_hard_link_with_link_name(self): + self.run_cephfs_shell_cmd(f'mkdir -p {self.dir1}/{self.dir2}') + 
self.mount_a.write_file(path=f'{self.dir1}/{self.dump_file}', + data=self.s) + self.run_cephfs_shell_cmd(f'ln /{self.dir1}/{self.dump_file} ' + f'/{self.dir1}/{self.dir2}/') + o = self.get_cephfs_shell_cmd_output(f'cat /{self.dir1}/{self.dir2}' + f'/{self.dump_file}') + self.assertEqual(self.s, o) + + def test_hard_link_to_dir_not_allowed(self): + self.run_cephfs_shell_cmd(f'mkdir {self.dir1}') + self.run_cephfs_shell_cmd(f'mkdir {self.dir2}') + r = self.run_cephfs_shell_cmd(f'ln /{self.dir1} /{self.dir2}/', + check_status=False) + self.assertEqual(r.returncode, 3) + + def test_target_exists_in_dir(self): + self.mount_a.write_file(path=f'{self.dump_file}', data=self.s) + r = self.run_cephfs_shell_cmd(f'ln {self.dump_file} {self.dump_file}', + check_status=False) + self.assertEqual(r.returncode, 1) + + def test_incorrect_dir(self): + self.mount_a.write_file(path=f'{self.dump_file}', data=self.s) + r = self.run_cephfs_shell_cmd(f'ln {self.dump_file} /dir1/', + check_status=False) + self.assertEqual(r.returncode, 5) + + +class TestGetAndPut(TestCephFSShell): + def test_get_with_target_name(self): + """ + Test that get passes with target name + """ + s = 'C' * 1024 + s_hash = crypt.crypt(s, '.A') + o = self.get_cephfs_shell_cmd_output("put - dump4", stdin=s) + log.info("cephfs-shell output:\n{}".format(o)) + + # put - dump4 should pass + o = self.mount_a.stat('dump4') + log.info("mount_a output:\n{}".format(o)) + + o = self.get_cephfs_shell_cmd_output("get dump4 ./dump4") + log.info("cephfs-shell output:\n{}".format(o)) + + # NOTE: cwd=None because we want to run it at CWD, not at cephfs mntpt. + o = self.mount_a.run_shell('cat dump4', cwd=None).stdout.getvalue(). \ + strip() + o_hash = crypt.crypt(o, '.A') + + # s_hash must be equal to o_hash + log.info("s_hash:{}".format(s_hash)) + log.info("o_hash:{}".format(o_hash)) + assert (s_hash == o_hash) + + # cleanup + self.mount_a.run_shell("rm dump4", cwd=None, check_status=False) + + def test_get_without_target_name(self): + """ + Test that get should fail when there is no target name + """ + s = 'Somedata' + # put - dump5 should pass + self.get_cephfs_shell_cmd_output("put - dump5", stdin=s) + + self.mount_a.stat('dump5') + + # get dump5 should fail as there is no local_path mentioned + with self.assertRaises(CommandFailedError): + self.get_cephfs_shell_cmd_output("get dump5") + + # stat dump would return non-zero exit code as get dump failed + # cwd=None because we want to run it at CWD, not at cephfs mntpt. + r = self.mount_a.run_shell('stat dump5', cwd=None, + check_status=False).returncode + self.assertEqual(r, 1) + + def test_get_doesnt_create_dir(self): + # if get cmd is creating subdirs on its own then dump7 will be + # stored as ./dump7/tmp/dump7 and not ./dump7, therefore + # if doing `cat ./dump7` returns non-zero exit code(i.e. 
1) then + # it implies that no such file exists at that location + dir_abspath = path.join(self.mount_a.mountpoint, 'tmp') + self.mount_a.run_shell_payload(f"mkdir {dir_abspath}") + self.mount_a.client_remote.write_file(path.join(dir_abspath, 'dump7'), + 'somedata') + self.get_cephfs_shell_cmd_output("get /tmp/dump7 ./dump7") + # test that dump7 exists + self.mount_a.run_shell("cat ./dump7", cwd=None) + + # cleanup + self.mount_a.run_shell(args='rm dump7', cwd=None, check_status=False) + + def test_get_to_console(self): + """ + Test that get passes with target name + """ + s = 'E' * 1024 + s_hash = crypt.crypt(s, '.A') + o = self.get_cephfs_shell_cmd_output("put - dump6", stdin=s) + log.info("cephfs-shell output:\n{}".format(o)) + + # put - dump6 should pass + o = self.mount_a.stat('dump6') + log.info("mount_a output:\n{}".format(o)) + + # get dump6 - should pass + o = self.get_cephfs_shell_cmd_output("get dump6 -") + o_hash = crypt.crypt(o, '.A') + log.info("cephfs-shell output:\n{}".format(o)) + + # s_hash must be equal to o_hash + log.info("s_hash:{}".format(s_hash)) + log.info("o_hash:{}".format(o_hash)) + assert (s_hash == o_hash) + + + def test_put_without_target_name(self): + """ + put - should fail as the cmd expects both arguments are mandatory. + """ + with self.assertRaises(CommandFailedError): + self.get_cephfs_shell_cmd_output("put -") + + def test_put_validate_local_path(self): + """ + This test is intended to make sure local_path is validated before + trying to put the file from local fs to cephfs and the command + put ./dumpXYZ dump8 would fail as dumpXYX doesn't exist. + """ + with self.assertRaises(CommandFailedError): + o = self.get_cephfs_shell_cmd_output("put ./dumpXYZ dump8") + log.info("cephfs-shell output:\n{}".format(o)) + +class TestSnapshots(TestCephFSShell): + def test_snap(self): + """ + Test that snapshot creation and deletion work + """ + sd = self.fs.get_config('client_snapdir') + sdn = "data_dir/{}/snap1".format(sd) + + # create a data dir and dump some files into it + self.get_cephfs_shell_cmd_output("mkdir data_dir") + s = 'A' * 10240 + o = self.get_cephfs_shell_cmd_output("put - data_dir/data_a", stdin=s) + s = 'B' * 10240 + o = self.get_cephfs_shell_cmd_output("put - data_dir/data_b", stdin=s) + s = 'C' * 10240 + o = self.get_cephfs_shell_cmd_output("put - data_dir/data_c", stdin=s) + s = 'D' * 10240 + o = self.get_cephfs_shell_cmd_output("put - data_dir/data_d", stdin=s) + s = 'E' * 10240 + o = self.get_cephfs_shell_cmd_output("put - data_dir/data_e", stdin=s) + + o = self.get_cephfs_shell_cmd_output("ls -l /data_dir") + log.info("cephfs-shell output:\n{}".format(o)) + + # create the snapshot - must pass + o = self.get_cephfs_shell_cmd_output("snap create snap1 /data_dir") + log.info("cephfs-shell output:\n{}".format(o)) + self.assertEqual("", o) + o = self.mount_a.stat(sdn) + log.info("mount_a output:\n{}".format(o)) + self.assertIn('st_mode', o) + + # create the same snapshot again - must fail with an error message + self.negtest_cephfs_shell_cmd(cmd="snap create snap1 /data_dir", + errmsg="snapshot 'snap1' already exists") + o = self.mount_a.stat(sdn) + log.info("mount_a output:\n{}".format(o)) + self.assertIn('st_mode', o) + + # delete the snapshot - must pass + o = self.get_cephfs_shell_cmd_output("snap delete snap1 /data_dir") + log.info("cephfs-shell output:\n{}".format(o)) + self.assertEqual("", o) + try: + o = self.mount_a.stat(sdn) + except CommandFailedError: + # snap dir should not exist anymore + pass + log.info("mount_a 
output:\n{}".format(o)) + self.assertNotIn('st_mode', o) + + # delete the same snapshot again - must fail with an error message + self.negtest_cephfs_shell_cmd(cmd="snap delete snap1 /data_dir", + errmsg="'snap1': no such snapshot") + try: + o = self.mount_a.stat(sdn) + except CommandFailedError: + pass + log.info("mount_a output:\n{}".format(o)) + self.assertNotIn('st_mode', o) + + +class TestCD(TestCephFSShell): + CLIENTS_REQUIRED = 1 + + def test_cd_with_no_args(self): + """ + Test that when cd is issued without any arguments, CWD is changed + to root directory. + """ + path = 'dir1/dir2/dir3' + self.mount_a.run_shell_payload(f"mkdir -p {path}") + expected_cwd = '/' + + script = 'cd {}\ncd\ncwd\n'.format(path) + output = self.get_cephfs_shell_script_output(script) + self.assertEqual(output, expected_cwd) + + def test_cd_with_args(self): + """ + Test that when cd is issued with an argument, CWD is changed + to the path passed in the argument. + """ + path = 'dir1/dir2/dir3' + self.mount_a.run_shell_payload(f"mkdir -p {path}") + expected_cwd = '/dir1/dir2/dir3' + + script = 'cd {}\ncwd\n'.format(path) + output = self.get_cephfs_shell_script_output(script) + self.assertEqual(output, expected_cwd) + + +class TestDU(TestCephFSShell): + CLIENTS_REQUIRED = 1 + + def test_du_works_for_regfiles(self): + regfilename = 'some_regfile' + regfile_abspath = path.join(self.mount_a.mountpoint, regfilename) + self.mount_a.client_remote.write_file(regfile_abspath, 'somedata') + + size = humansize(self.mount_a.stat(regfile_abspath)['st_size']) + expected_output = r'{}{}{}'.format(size, " +", regfilename) + + du_output = self.get_cephfs_shell_cmd_output('du ' + regfilename) + self.assertRegex(du_output, expected_output) + + def test_du_works_for_non_empty_dirs(self): + dirname = 'some_directory' + dir_abspath = path.join(self.mount_a.mountpoint, dirname) + regfilename = 'some_regfile' + regfile_abspath = path.join(dir_abspath, regfilename) + self.mount_a.run_shell_payload(f"mkdir {dir_abspath}") + self.mount_a.client_remote.write_file(regfile_abspath, 'somedata') + + # XXX: we stat `regfile_abspath` here because ceph du reports + # a non-empty + # directory's size as sum of sizes of all files under it. 
+ size = humansize(self.mount_a.stat(regfile_abspath)['st_size']) + expected_output = r'{}{}{}'.format(size, " +", dirname) + + sleep(10) + du_output = self.get_cephfs_shell_cmd_output('du ' + dirname) + self.assertRegex(du_output, expected_output) + + def test_du_works_for_empty_dirs(self): + dirname = 'some_directory' + dir_abspath = path.join(self.mount_a.mountpoint, dirname) + self.mount_a.run_shell_payload(f"mkdir {dir_abspath}") + + size = humansize(self.mount_a.stat(dir_abspath)['st_size']) + expected_output = r'{}{}{}'.format(size, " +", dirname) + + du_output = self.get_cephfs_shell_cmd_output('du ' + dirname) + self.assertRegex(du_output, expected_output) + + def test_du_works_for_hardlinks(self): + regfilename = 'some_regfile' + regfile_abspath = path.join(self.mount_a.mountpoint, regfilename) + self.mount_a.client_remote.write_file(regfile_abspath, 'somedata') + hlinkname = 'some_hardlink' + hlink_abspath = path.join(self.mount_a.mountpoint, hlinkname) + self.mount_a.run_shell_payload(f"ln {regfile_abspath} {hlink_abspath}") + + size = humansize(self.mount_a.stat(hlink_abspath)['st_size']) + expected_output = r'{}{}{}'.format(size, " +", hlinkname) + + du_output = self.get_cephfs_shell_cmd_output('du ' + hlinkname) + self.assertRegex(du_output, expected_output) + + def test_du_works_for_softlinks_to_files(self): + regfilename = 'some_regfile' + regfile_abspath = path.join(self.mount_a.mountpoint, regfilename) + self.mount_a.client_remote.write_file(regfile_abspath, 'somedata') + slinkname = 'some_softlink' + slink_abspath = path.join(self.mount_a.mountpoint, slinkname) + self.mount_a.run_shell_payload( + f"ln -s {regfile_abspath} {slink_abspath}") + + size = humansize(self.mount_a.lstat(slink_abspath)['st_size']) + expected_output = r'{}{}{}'.format(size, " +", slinkname) + + du_output = self.get_cephfs_shell_cmd_output('du ' + slinkname) + self.assertRegex(du_output, expected_output) + + def test_du_works_for_softlinks_to_dirs(self): + dirname = 'some_directory' + dir_abspath = path.join(self.mount_a.mountpoint, dirname) + self.mount_a.run_shell_payload(f"mkdir {dir_abspath}") + slinkname = 'some_softlink' + slink_abspath = path.join(self.mount_a.mountpoint, slinkname) + self.mount_a.run_shell_payload(f"ln -s {dir_abspath} {slink_abspath}") + + size = humansize(self.mount_a.lstat(slink_abspath)['st_size']) + expected_output = r'{}{}{}'.format(size, " +", slinkname) + + du_output = self.get_cephfs_shell_cmd_output('du ' + slinkname) + self.assertRegex(du_output, expected_output) + + # NOTE: tests using these are pretty slow since to this methods sleeps for + # 15 seconds + def _setup_files(self, return_path_to_files=False, path_prefix='./'): + dirname = 'dir1' + regfilename = 'regfile' + hlinkname = 'hlink' + slinkname = 'slink1' + slink2name = 'slink2' + + dir_abspath = path.join(self.mount_a.mountpoint, dirname) + regfile_abspath = path.join(self.mount_a.mountpoint, regfilename) + hlink_abspath = path.join(self.mount_a.mountpoint, hlinkname) + slink_abspath = path.join(self.mount_a.mountpoint, slinkname) + slink2_abspath = path.join(self.mount_a.mountpoint, slink2name) + + self.mount_a.run_shell_payload(f"mkdir {dir_abspath}") + self.mount_a.run_shell_payload(f"touch {regfile_abspath}") + self.mount_a.run_shell_payload(f"ln {regfile_abspath} {hlink_abspath}") + self.mount_a.run_shell_payload( + f"ln -s {regfile_abspath} {slink_abspath}") + self.mount_a.run_shell_payload(f"ln -s {dir_abspath} {slink2_abspath}") + + dir2_name = 'dir2' + dir21_name = 'dir21' + regfile121_name = 
'regfile121' + dir2_abspath = path.join(self.mount_a.mountpoint, dir2_name) + dir21_abspath = path.join(dir2_abspath, dir21_name) + regfile121_abspath = path.join(dir21_abspath, regfile121_name) + self.mount_a.run_shell_payload(f"mkdir -p {dir21_abspath}") + self.mount_a.run_shell_payload(f"touch {regfile121_abspath}") + + self.mount_a.client_remote.write_file(regfile_abspath, 'somedata') + self.mount_a.client_remote.write_file(regfile121_abspath, + 'somemoredata') + + # TODO: is there a way to trigger/force update ceph.dir.rbytes? + # wait so that attr ceph.dir.rbytes gets a chance to be updated. + sleep(20) + + expected_patterns = [] + path_to_files = [] + + def append_expected_output_pattern(f): + if f == '/': + expected_patterns.append(r'{}{}{}'.format(size, " +", '.' + f)) + else: + expected_patterns.append(r'{}{}{}'.format( + size, " +", + path_prefix + path.relpath(f, self.mount_a.mountpoint))) + + for f in [dir_abspath, regfile_abspath, regfile121_abspath, + hlink_abspath, slink_abspath, slink2_abspath]: + size = humansize(self.mount_a.stat( + f, follow_symlinks=False)['st_size']) + append_expected_output_pattern(f) + + # get size for directories containig regfiles within + for f in [dir2_abspath, dir21_abspath]: + size = humansize(self.mount_a.stat(regfile121_abspath, + follow_symlinks=False)[ + 'st_size']) + append_expected_output_pattern(f) + + # get size for CephFS root + size = 0 + for f in [regfile_abspath, regfile121_abspath, slink_abspath, + slink2_abspath]: + size += self.mount_a.stat(f, follow_symlinks=False)['st_size'] + size = humansize(size) + append_expected_output_pattern('/') + + if return_path_to_files: + for p in [dir_abspath, regfile_abspath, dir2_abspath, + dir21_abspath, regfile121_abspath, hlink_abspath, + slink_abspath, slink2_abspath]: + path_to_files.append(path.relpath(p, self.mount_a.mountpoint)) + + return expected_patterns, path_to_files + else: + return expected_patterns + + def test_du_works_recursively_with_no_path_in_args(self): + expected_patterns_in_output = self._setup_files() + du_output = self.get_cephfs_shell_cmd_output('du -r') + + for expected_output in expected_patterns_in_output: + self.assertRegex(du_output, expected_output) + + def test_du_with_path_in_args(self): + expected_patterns_in_output, path_to_files = self._setup_files( + True, path_prefix='') + + args = ['du', '/'] + for p in path_to_files: + args.append(p) + du_output = self.get_cephfs_shell_cmd_output(args) + + for expected_output in expected_patterns_in_output: + self.assertRegex(du_output, expected_output) + + def test_du_with_no_args(self): + expected_patterns_in_output = self._setup_files() + + du_output = self.get_cephfs_shell_cmd_output('du') + + for expected_output in expected_patterns_in_output: + # Since CWD is CephFS root and being non-recursive expect only + # CWD in DU report. 
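+ # Only the pattern generated for the CephFS root ends with '/' (it is
+ # rendered as './' by append_expected_output_pattern), so the check
+ # below keeps just that root entry and skips the per-file patterns.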
+ if expected_output.find('/') == len(expected_output) - 1: + self.assertRegex(du_output, expected_output) + + +class TestDF(TestCephFSShell): + def validate_df(self, filename): + df_output = self.get_cephfs_shell_cmd_output('df ' + filename) + log.info("cephfs-shell df output:\n{}".format(df_output)) + + shell_df = df_output.splitlines()[1].split() + + block_size = int(self.mount_a.df()["total"]) // 1024 + log.info("cephfs df block size output:{}\n".format(block_size)) + + st_size = int(self.mount_a.stat(filename)["st_size"]) + log.info("cephfs stat used output:{}".format(st_size)) + log.info("cephfs available:{}\n".format(block_size - st_size)) + + self.assertTupleEqual((block_size, st_size, block_size - st_size), + (int(shell_df[0]), int(shell_df[1]), + int(shell_df[2]))) + + def test_df_with_no_args(self): + expected_output = '' + df_output = self.get_cephfs_shell_cmd_output('df') + assert df_output == expected_output + + def test_df_for_valid_directory(self): + dir_name = 'dir1' + mount_output = self.mount_a.run_shell_payload(f"mkdir {dir_name}") + log.info("cephfs-shell mount output:\n{}".format(mount_output)) + self.validate_df(dir_name) + + def test_df_for_invalid_directory(self): + dir_abspath = path.join(self.mount_a.mountpoint, 'non-existent-dir') + self.negtest_cephfs_shell_cmd(cmd='df ' + dir_abspath, + errmsg='error in stat') + + def test_df_for_valid_file(self): + s = 'df test' * 14145016 + o = self.get_cephfs_shell_cmd_output("put - dumpfile", stdin=s) + log.info("cephfs-shell output:\n{}".format(o)) + self.validate_df("dumpfile") + + +class TestQuota(TestCephFSShell): + dir_name = 'testdir' + + def create_dir(self): + mount_output = self.get_cephfs_shell_cmd_output( + 'mkdir ' + self.dir_name) + log.info("cephfs-shell mount output:\n{}".format(mount_output)) + + def set_and_get_quota_vals(self, input_val, check_status=True): + self.run_cephfs_shell_cmd(['quota', 'set', '--max_bytes', + input_val[0], '--max_files', input_val[1], + self.dir_name], check_status=check_status) + + quota_output = self.get_cephfs_shell_cmd_output( + ['quota', 'get', self.dir_name], + check_status=check_status) + + quota_output = quota_output.split() + return quota_output[1], quota_output[3] + + def test_set(self): + self.create_dir() + set_values = ('6', '2') + self.assertTupleEqual(self.set_and_get_quota_vals(set_values), + set_values) + + def test_replace_values(self): + self.test_set() + set_values = ('20', '4') + self.assertTupleEqual(self.set_and_get_quota_vals(set_values), + set_values) + + def test_set_invalid_dir(self): + set_values = ('5', '5') + try: + self.assertTupleEqual(self.set_and_get_quota_vals( + set_values, False), set_values) + raise Exception( + "Something went wrong!! Values set for non existing directory") + except IndexError: + # Test should pass as values cannot be set for non + # existing directory + pass + + def test_set_invalid_values(self): + self.create_dir() + set_values = ('-6', '-5') + try: + self.assertTupleEqual(self.set_and_get_quota_vals(set_values, + False), + set_values) + raise Exception("Something went wrong!! Invalid values set") + except IndexError: + # Test should pass as invalid values cannot be set + pass + + def test_exceed_file_limit(self): + self.test_set() + dir_abspath = path.join(self.mount_a.mountpoint, self.dir_name) + self.mount_a.run_shell_payload(f"touch {dir_abspath}/file1") + file2 = path.join(dir_abspath, "file2") + try: + self.mount_a.run_shell_payload(f"touch {file2}") + raise Exception( + "Something went wrong!! 
File creation should have failed") + except CommandFailedError: + # Test should pass as file quota set to 2 + # Additional condition to confirm file creation failure + if not path.exists(file2): + return 0 + raise + + def test_exceed_write_limit(self): + self.test_set() + dir_abspath = path.join(self.mount_a.mountpoint, self.dir_name) + filename = 'test_file' + file_abspath = path.join(dir_abspath, filename) + try: + # Write should fail as bytes quota is set to 6 + self.mount_a.client_remote.write_file(file_abspath, + 'Disk raise Exception') + raise Exception("Write should have failed") + except CommandFailedError: + # Test should pass only when write command fails + path_exists = path.exists(file_abspath) + if not path_exists: + # Testing with teuthology: No file is created. + return 0 + elif path_exists and not path.getsize(file_abspath): + # Testing on Fedora 30: When write fails, empty + # file gets created. + return 0 + else: + raise + + +class TestXattr(TestCephFSShell): + dir_name = 'testdir' + + def create_dir(self): + self.run_cephfs_shell_cmd('mkdir ' + self.dir_name) + + def set_get_list_xattr_vals(self, input_val, negtest=False): + setxattr_output = self.get_cephfs_shell_cmd_output( + ['setxattr', self.dir_name, input_val[0], input_val[1]]) + log.info("cephfs-shell setxattr output:\n{}".format(setxattr_output)) + + getxattr_output = self.get_cephfs_shell_cmd_output( + ['getxattr', self.dir_name, input_val[0]]) + log.info("cephfs-shell getxattr output:\n{}".format(getxattr_output)) + + listxattr_output = self.get_cephfs_shell_cmd_output( + ['listxattr', self.dir_name]) + log.info("cephfs-shell listxattr output:\n{}".format(listxattr_output)) + + return listxattr_output, getxattr_output + + def test_set(self): + self.create_dir() + set_values = ('user.key', '2') + self.assertTupleEqual(self.set_get_list_xattr_vals(set_values), + set_values) + + def test_reset(self): + self.test_set() + set_values = ('user.key', '4') + self.assertTupleEqual(self.set_get_list_xattr_vals(set_values), + set_values) + + def test_non_existing_dir(self): + input_val = ('user.key', '9') + self.negtest_cephfs_shell_cmd( + cmd=['setxattr', self.dir_name, input_val[0], + input_val[1]]) + self.negtest_cephfs_shell_cmd( + cmd=['getxattr', self.dir_name, input_val[0]]) + self.negtest_cephfs_shell_cmd(cmd=['listxattr', self.dir_name]) + + +class TestLS(TestCephFSShell): + dir_name = 'test_dir' + hidden_dir_name = '.test_hidden_dir' + + def test_ls(self): + """ Test that ls prints files in CWD. 
""" + self.run_cephfs_shell_cmd(f'mkdir {self.dir_name}') + + ls_output = self.get_cephfs_shell_cmd_output("ls") + log.info(f"output of ls command:\n{ls_output}") + + self.assertIn(self.dir_name, ls_output) + + def test_ls_a(self): + """ Test ls -a prints hidden files in CWD.""" + + self.run_cephfs_shell_cmd(f'mkdir {self.hidden_dir_name}') + + ls_a_output = self.get_cephfs_shell_cmd_output(['ls', '-a']) + log.info(f"output of ls -a command:\n{ls_a_output}") + + self.assertIn(self.hidden_dir_name, ls_a_output) + + def test_ls_does_not_print_hidden_dir(self): + """ Test ls command does not print hidden directory """ + + self.run_cephfs_shell_cmd(f'mkdir {self.hidden_dir_name}') + + ls_output = self.get_cephfs_shell_cmd_output("ls") + log.info(f"output of ls command:\n{ls_output}") + + self.assertNotIn(self.hidden_dir_name, ls_output) + + def test_ls_a_prints_non_hidden_dir(self): + """ Test ls -a command prints non hidden directory """ + + self.run_cephfs_shell_cmd( + f'mkdir {self.hidden_dir_name} {self.dir_name}') + + ls_a_output = self.get_cephfs_shell_cmd_output(['ls', '-a']) + log.info(f"output of ls -a command:\n{ls_a_output}") + + self.assertIn(self.dir_name, ls_a_output) + + def test_ls_H_prints_human_readable_file_size(self): + """ Test "ls -lH" prints human readable file size.""" + + file_sizes = ['1', '1K', '1M', '1G'] + file_names = ['dump1', 'dump2', 'dump3', 'dump4'] + + for (file_size, file_name) in zip(file_sizes, file_names): + temp_file = self.mount_a.client_remote.mktemp(file_name) + self.mount_a.run_shell_payload( + f"fallocate -l {file_size} {temp_file}") + self.mount_a.run_shell_payload(f'mv {temp_file} ./') + + ls_H_output = self.get_cephfs_shell_cmd_output(['ls', '-lH']) + + ls_H_file_size = set() + for line in ls_H_output.split('\n'): + ls_H_file_size.add(line.split()[1]) + + # test that file sizes are in human readable format + self.assertEqual({'1B', '1K', '1M', '1G'}, ls_H_file_size) + + def test_ls_s_sort_by_size(self): + """ Test "ls -S" sorts file listing by file_size """ + test_file1 = "test_file1.txt" + test_file2 = "test_file2.txt" + file1_content = 'A' * 102 + file2_content = 'B' * 10 + + self.run_cephfs_shell_cmd(f"write {test_file1}", stdin=file1_content) + self.run_cephfs_shell_cmd(f"write {test_file2}", stdin=file2_content) + + ls_s_output = self.get_cephfs_shell_cmd_output(['ls', '-lS']) + + file_sizes = [] + for line in ls_s_output.split('\n'): + file_sizes.append(line.split()[1]) + + # test that file size are in ascending order + self.assertEqual(file_sizes, sorted(file_sizes)) + + +class TestMisc(TestCephFSShell): + def test_issue_cephfs_shell_cmd_at_invocation(self): + """ + Test that `cephfs-shell -c conf cmd` works. + """ + # choosing a long name since short ones have a higher probability + # of getting matched by coincidence. + dirname = 'somedirectory' + self.run_cephfs_shell_cmd(['mkdir', dirname]) + + output = self.mount_a.client_remote.sh(['cephfs-shell', 'ls']). \ + strip() + + self.assertRegex(output, dirname) + + def test_help(self): + """ + Test that help outputs commands. + """ + o = self.get_cephfs_shell_cmd_output("help all") + log.info("output:\n{}".format(o)) + + + def test_chmod(self): + """Test chmod is allowed above o0777 """ + + test_file1 = "test_file2.txt" + file1_content = 'A' * 102 + self.run_cephfs_shell_cmd(f"write {test_file1}", stdin=file1_content) + self.run_cephfs_shell_cmd(f"chmod 01777 {test_file1}") + +class TestShellOpts(TestCephFSShell): + """ + Contains tests for shell options from conf file and shell prompt. 
+ """ + + def setUp(self): + super(type(self), self).setUp() + + # output of following command - + # editor - was: 'vim' + # now: '?' + # editor: '?' + self.editor_val = self.get_cephfs_shell_cmd_output( + 'set editor ?, set editor').split('\n')[2] + self.editor_val = self.editor_val.split(':')[1]. \ + replace("'", "", 2).strip() + + def write_tempconf(self, confcontents): + self.tempconfpath = self.mount_a.client_remote.mktemp( + suffix='cephfs-shell.conf') + self.mount_a.client_remote.write_file(self.tempconfpath, + confcontents) + + def test_reading_conf(self): + self.write_tempconf("[cephfs-shell]\neditor = ???") + + # output of following command - + # CephFS:~/>>> set editor + # editor: 'vim' + final_editor_val = self.get_cephfs_shell_cmd_output( + cmd='set editor', shell_conf_path=self.tempconfpath) + final_editor_val = final_editor_val.split(': ')[1] + final_editor_val = final_editor_val.replace("'", "", 2) + + self.assertNotEqual(self.editor_val, final_editor_val) + + def test_reading_conf_with_dup_opt(self): + """ + Read conf without duplicate sections/options. + """ + self.write_tempconf("[cephfs-shell]\neditor = ???\neditor = " + + self.editor_val) + + # output of following command - + # CephFS:~/>>> set editor + # editor: 'vim' + final_editor_val = self.get_cephfs_shell_cmd_output( + cmd='set editor', shell_conf_path=self.tempconfpath) + final_editor_val = final_editor_val.split(': ')[1] + final_editor_val = final_editor_val.replace("'", "", 2) + + self.assertEqual(self.editor_val, final_editor_val) + + def test_setting_opt_after_reading_conf(self): + self.write_tempconf("[cephfs-shell]\neditor = ???") + + # output of following command - + # editor - was: vim + # now: vim + # editor: vim + final_editor_val = self.get_cephfs_shell_cmd_output( + cmd='set editor %s, set editor' % self.editor_val, + shell_conf_path=self.tempconfpath) + final_editor_val = final_editor_val.split('\n')[2] + final_editor_val = final_editor_val.split(': ')[1] + final_editor_val = final_editor_val.replace("'", "", 2) + + self.assertEqual(self.editor_val, final_editor_val) diff --git a/qa/tasks/cephfs/test_client_limits.py b/qa/tasks/cephfs/test_client_limits.py new file mode 100644 index 000000000..c4215df33 --- /dev/null +++ b/qa/tasks/cephfs/test_client_limits.py @@ -0,0 +1,397 @@ + +""" +Exercise the MDS's behaviour when clients and the MDCache reach or +exceed the limits of how many caps/inodes they should hold. +""" + +import logging +from textwrap import dedent +from tasks.ceph_test_case import TestTimeoutError +from tasks.cephfs.cephfs_test_case import CephFSTestCase, needs_trimming +from tasks.cephfs.fuse_mount import FuseMount +from teuthology.exceptions import CommandFailedError +import os +from io import StringIO + + +log = logging.getLogger(__name__) + + +# Arbitrary timeouts for operations involving restarting +# an MDS or waiting for it to come up +MDS_RESTART_GRACE = 60 + +# Hardcoded values from Server::recall_client_state +CAP_RECALL_RATIO = 0.8 +CAP_RECALL_MIN = 100 + + +class TestClientLimits(CephFSTestCase): + CLIENTS_REQUIRED = 2 + + def _test_client_pin(self, use_subdir, open_files): + """ + When a client pins an inode in its cache, for example because the file is held open, + it should reject requests from the MDS to trim these caps. The MDS should complain + to the user that it is unable to enforce its cache size limits because of this + objectionable client. 
+ + :param use_subdir: whether to put test files in a subdir or use root + """ + + # Set MDS cache memory limit to a low value that will make the MDS to + # ask the client to trim the caps. + cache_memory_limit = "1K" + + self.config_set('mds', 'mds_cache_memory_limit', cache_memory_limit) + self.config_set('mds', 'mds_recall_max_caps', int(open_files/2)) + self.config_set('mds', 'mds_recall_warning_threshold', open_files) + + mds_min_caps_per_client = int(self.config_get('mds', "mds_min_caps_per_client")) + self.config_set('mds', 'mds_min_caps_working_set', mds_min_caps_per_client) + mds_max_caps_per_client = int(self.config_get('mds', "mds_max_caps_per_client")) + mds_recall_warning_decay_rate = float(self.config_get('mds', "mds_recall_warning_decay_rate")) + self.assertGreaterEqual(open_files, mds_min_caps_per_client) + + mount_a_client_id = self.mount_a.get_global_id() + path = "subdir" if use_subdir else "." + open_proc = self.mount_a.open_n_background(path, open_files) + + # Client should now hold: + # `open_files` caps for the open files + # 1 cap for root + # 1 cap for subdir + self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'], + open_files + (2 if use_subdir else 1), + timeout=600, + reject_fn=lambda x: x > open_files + 2) + + # MDS should not be happy about that, as the client is failing to comply + # with the SESSION_RECALL messages it is being sent + self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2) + + # We can also test that the MDS health warning for oversized + # cache is functioning as intended. + self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2) + + # When the client closes the files, it should retain only as many caps as allowed + # under the SESSION_RECALL policy + log.info("Terminating process holding files open") + self.mount_a._kill_background(open_proc) + + # The remaining caps should comply with the numbers sent from MDS in SESSION_RECALL message, + # which depend on the caps outstanding, cache size and overall ratio + def expected_caps(): + num_caps = self.get_session(mount_a_client_id)['num_caps'] + if num_caps <= mds_min_caps_per_client: + return True + elif num_caps <= mds_max_caps_per_client: + return True + else: + return False + + self.wait_until_true(expected_caps, timeout=60) + + @needs_trimming + def test_client_pin_root(self): + self._test_client_pin(False, 400) + + @needs_trimming + def test_client_pin(self): + self._test_client_pin(True, 800) + + @needs_trimming + def test_client_pin_mincaps(self): + self._test_client_pin(True, 200) + + def test_client_min_caps_working_set(self): + """ + When a client has inodes pinned in its cache (open files), that the MDS + will not warn about the client not responding to cache pressure when + the number of caps is below mds_min_caps_working_set. + """ + + # Set MDS cache memory limit to a low value that will make the MDS to + # ask the client to trim the caps. 
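+ # With the values chosen below (open_files = 400), recall is made
+ # aggressive (mds_recall_max_caps = 200, mds_recall_warning_threshold =
+ # 400) while mds_min_caps_working_set is raised to 800, so the client's
+ # ~402 caps stay inside the working set and no MDS_CLIENT_RECALL
+ # warning is expected.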
+ cache_memory_limit = "1K" + open_files = 400 + + self.config_set('mds', 'mds_cache_memory_limit', cache_memory_limit) + self.config_set('mds', 'mds_recall_max_caps', int(open_files/2)) + self.config_set('mds', 'mds_recall_warning_threshold', open_files) + self.config_set('mds', 'mds_min_caps_working_set', open_files*2) + + mds_min_caps_per_client = int(self.config_get('mds', "mds_min_caps_per_client")) + mds_recall_warning_decay_rate = float(self.config_get('mds', "mds_recall_warning_decay_rate")) + self.assertGreaterEqual(open_files, mds_min_caps_per_client) + + mount_a_client_id = self.mount_a.get_global_id() + self.mount_a.open_n_background("subdir", open_files) + + # Client should now hold: + # `open_files` caps for the open files + # 1 cap for root + # 1 cap for subdir + self.wait_until_equal(lambda: self.get_session(mount_a_client_id)['num_caps'], + open_files + 2, + timeout=600, + reject_fn=lambda x: x > open_files + 2) + + # We can also test that the MDS health warning for oversized + # cache is functioning as intended. + self.wait_for_health("MDS_CACHE_OVERSIZED", mds_recall_warning_decay_rate*2) + + try: + # MDS should not be happy about that but it's not sending + # MDS_CLIENT_RECALL warnings because the client's caps are below + # mds_min_caps_working_set. + self.wait_for_health("MDS_CLIENT_RECALL", mds_recall_warning_decay_rate*2) + except TestTimeoutError: + pass + else: + raise RuntimeError("expected no client recall warning") + + def test_cap_acquisition_throttle_readdir(self): + """ + Mostly readdir acquires caps faster than the mds recalls, so the cap + acquisition via readdir is throttled by retrying the readdir after + a fraction of second (0.5) by default when throttling condition is met. + """ + + subdir_count = 4 + files_per_dir = 25 + + # throttle in a way so that two dir reads are already hitting it. + throttle_value = (files_per_dir * 3) // 2 + + # activate throttling logic by setting max per client to a low value + self.config_set('mds', 'mds_max_caps_per_client', 1) + self.config_set('mds', 'mds_session_cap_acquisition_throttle', throttle_value) + + # Create files split across {subdir_count} directories, {per_dir_count} in each dir + for i in range(1, subdir_count+1): + self.mount_a.create_n_files("dir{0}/file".format(i), files_per_dir, sync=True) + + mount_a_client_id = self.mount_a.get_global_id() + + # recursive readdir. macOs wants an explicit directory for `find`. + proc = self.mount_a.run_shell_payload("find . | wc", stderr=StringIO()) + # return code may be None if the command got interrupted + self.assertTrue(proc.returncode is None or proc.returncode == 0, proc.stderr.getvalue()) + + # validate the throttle condition to be hit atleast once + cap_acquisition_throttle_hit_count = self.perf_dump()['mds_server']['cap_acquisition_throttle'] + self.assertGreaterEqual(cap_acquisition_throttle_hit_count, 1) + + # validate cap_acquisition decay counter after readdir to NOT exceed the throttle value + # plus one batch that could have been taken immediately before querying + # assuming the batch is equal to the per dir file count. 
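+ # With files_per_dir = 25 the throttle works out to (25 * 3) // 2 = 37,
+ # so the decay counter checked below must stay at or below 25 + 37 = 62.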
+ cap_acquisition_value = self.get_session(mount_a_client_id)['cap_acquisition']['value'] + self.assertLessEqual(cap_acquisition_value, files_per_dir + throttle_value) + + # make sure that the throttle was reported in the events + def historic_ops_have_event(expected_event): + ops_dump = self.fs.rank_tell(['dump_historic_ops']) + # reverse the events and the ops assuming that later ops would be throttled + for op in reversed(ops_dump['ops']): + for ev in reversed(op.get('type_data', {}).get('events', [])): + if ev['event'] == expected_event: + return True + return False + + self.assertTrue(historic_ops_have_event('cap_acquisition_throttle')) + + def test_client_release_bug(self): + """ + When a client has a bug (which we will simulate) preventing it from releasing caps, + the MDS should notice that releases are not being sent promptly, and generate a health + metric to that effect. + """ + + # The debug hook to inject the failure only exists in the fuse client + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to inject client release failure") + + self.set_conf('client.{0}'.format(self.mount_a.client_id), 'client inject release failure', 'true') + self.mount_a.teardown() + self.mount_a.mount_wait() + mount_a_client_id = self.mount_a.get_global_id() + + # Client A creates a file. He will hold the write caps on the file, and later (simulated bug) fail + # to comply with the MDSs request to release that cap + self.mount_a.run_shell(["touch", "file1"]) + + # Client B tries to stat the file that client A created + rproc = self.mount_b.write_background("file1") + + # After session_timeout, we should see a health warning (extra lag from + # MDS beacon period) + session_timeout = self.fs.get_var("session_timeout") + self.wait_for_health("MDS_CLIENT_LATE_RELEASE", session_timeout + 10) + + # Client B should still be stuck + self.assertFalse(rproc.finished) + + # Kill client A + self.mount_a.kill() + self.mount_a.kill_cleanup() + + # Client B should complete + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + rproc.wait() + + def test_client_blocklisted_oldest_tid(self): + """ + that a client is blocklisted when its encoded session metadata exceeds the + configured threshold (due to ever growing `completed_requests` caused due + to an unidentified bug (in the client or the MDS)). + """ + + # num of requests client issues + max_requests = 10000 + + # The debug hook to inject the failure only exists in the fuse client + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to inject client release failure") + + self.config_set('client', 'client inject fixed oldest tid', 'true') + self.mount_a.teardown() + self.mount_a.mount_wait() + + self.config_set('mds', 'mds_max_completed_requests', max_requests); + + # Create lots of files + self.mount_a.create_n_files("testdir/file1", max_requests + 100) + + # Create a few files synchronously. This makes sure previous requests are completed + self.mount_a.create_n_files("testdir/file2", 5, True) + + # Wait for the health warnings. Assume mds can handle 10 request per second at least + self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10, check_in_detail=str(self.mount_a.client_id)) + + # set the threshold low so that it has a high probability of + # hitting. + self.config_set('mds', 'mds_session_metadata_threshold', 5000); + + # Create lot many files synchronously. This would hit the session metadata threshold + # causing the client to get blocklisted. 
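+ # Because 'client inject fixed oldest tid' keeps completed_requests
+ # growing, the encoded session metadata eventually exceeds the
+ # mds_session_metadata_threshold of 5000 set above; the MDS then
+ # blocklists the client, so the create helper fails with
+ # CommandFailedError.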
+ with self.assertRaises(CommandFailedError): + self.mount_a.create_n_files("testdir/file2", 100000, True) + + self.mds_cluster.is_addr_blocklisted(self.mount_a.get_global_addr()) + # the mds should bump up the relevant perf counter + pd = self.perf_dump() + self.assertGreater(pd['mds_sessions']['mdthresh_evicted'], 0) + + # reset the config + self.config_set('client', 'client inject fixed oldest tid', 'false') + + self.mount_a.kill_cleanup() + self.mount_a.mount_wait() + + def test_client_oldest_tid(self): + """ + When a client does not advance its oldest tid, the MDS should notice that + and generate health warnings. + """ + + # num of requests client issues + max_requests = 1000 + + # The debug hook to inject the failure only exists in the fuse client + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to inject client release failure") + + self.set_conf('client', 'client inject fixed oldest tid', 'true') + self.mount_a.teardown() + self.mount_a.mount_wait() + + self.fs.mds_asok(['config', 'set', 'mds_max_completed_requests', '{0}'.format(max_requests)]) + + # Create lots of files + self.mount_a.create_n_files("testdir/file1", max_requests + 100) + + # Create a few files synchronously. This makes sure previous requests are completed + self.mount_a.create_n_files("testdir/file2", 5, True) + + # Wait for the health warnings. Assume mds can handle 10 request per second at least + self.wait_for_health("MDS_CLIENT_OLDEST_TID", max_requests // 10) + + def _test_client_cache_size(self, mount_subdir): + """ + check if client invalidate kernel dcache according to its cache size config + """ + + # The debug hook to inject the failure only exists in the fuse client + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to inject client release failure") + + if mount_subdir: + # fuse assigns a fix inode number (1) to root inode. But in mounting into + # subdir case, the actual inode number of root is not 1. This mismatch + # confuses fuse_lowlevel_notify_inval_entry() when invalidating dentries + # in root directory. + self.mount_a.run_shell(["mkdir", "subdir"]) + self.mount_a.umount_wait() + self.set_conf('client', 'client mountpoint', '/subdir') + self.mount_a.mount_wait() + root_ino = self.mount_a.path_to_ino(".") + self.assertEqual(root_ino, 1); + + dir_path = os.path.join(self.mount_a.mountpoint, "testdir") + + mkdir_script = dedent(""" + import os + os.mkdir("{path}") + for n in range(0, {num_dirs}): + os.mkdir("{path}/dir{{0}}".format(n)) + """) + + num_dirs = 1000 + self.mount_a.run_python(mkdir_script.format(path=dir_path, num_dirs=num_dirs)) + self.mount_a.run_shell(["sync"]) + + dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count() + self.assertGreaterEqual(dentry_count, num_dirs) + self.assertGreaterEqual(dentry_pinned_count, num_dirs) + + cache_size = num_dirs // 10 + self.mount_a.set_cache_size(cache_size) + + def trimmed(): + dentry_count, dentry_pinned_count = self.mount_a.get_dentry_count() + log.info("waiting, dentry_count, dentry_pinned_count: {0}, {1}".format( + dentry_count, dentry_pinned_count + )) + if dentry_count > cache_size or dentry_pinned_count > cache_size: + return False + + return True + + self.wait_until_true(trimmed, 30) + + @needs_trimming + def test_client_cache_size(self): + self._test_client_cache_size(False) + self._test_client_cache_size(True) + + def test_client_max_caps(self): + """ + That the MDS will not let a client sit above mds_max_caps_per_client caps. 
+ """ + + mds_min_caps_per_client = int(self.config_get('mds', "mds_min_caps_per_client")) + mds_max_caps_per_client = 2*mds_min_caps_per_client + self.config_set('mds', 'mds_max_caps_per_client', mds_max_caps_per_client) + + self.mount_a.create_n_files("foo/", 3*mds_max_caps_per_client, sync=True) + + mount_a_client_id = self.mount_a.get_global_id() + def expected_caps(): + num_caps = self.get_session(mount_a_client_id)['num_caps'] + if num_caps <= mds_max_caps_per_client: + return True + else: + return False + + self.wait_until_true(expected_caps, timeout=60) diff --git a/qa/tasks/cephfs/test_client_recovery.py b/qa/tasks/cephfs/test_client_recovery.py new file mode 100644 index 000000000..1bd6884a9 --- /dev/null +++ b/qa/tasks/cephfs/test_client_recovery.py @@ -0,0 +1,757 @@ + +""" +Teuthology task for exercising CephFS client recovery +""" + +import logging +from textwrap import dedent +import time +import distutils.version as version +import random +import re +import string +import os + +from teuthology.orchestra import run +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.packaging import get_package_version + +log = logging.getLogger(__name__) + + +# Arbitrary timeouts for operations involving restarting +# an MDS or waiting for it to come up +MDS_RESTART_GRACE = 60 + + +class TestClientNetworkRecovery(CephFSTestCase): + REQUIRE_ONE_CLIENT_REMOTE = True + CLIENTS_REQUIRED = 2 + + LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"] + + # Environment references + mds_reconnect_timeout = None + ms_max_backoff = None + + def test_network_death(self): + """ + Simulate software freeze or temporary network failure. + + Check that the client blocks I/O during failure, and completes + I/O after failure. 
+ """ + + session_timeout = self.fs.get_var("session_timeout") + self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false']) + + # We only need one client + self.mount_b.umount_wait() + + # Initially our one client session should be visible + client_id = self.mount_a.get_global_id() + ls_data = self._session_list() + self.assert_session_count(1, ls_data) + self.assertEqual(ls_data[0]['id'], client_id) + self.assert_session_state(client_id, "open") + + # ...and capable of doing I/O without blocking + self.mount_a.create_files() + + # ...but if we turn off the network + self.fs.set_clients_block(True) + + # ...and try and start an I/O + write_blocked = self.mount_a.write_background() + + # ...then it should block + self.assertFalse(write_blocked.finished) + self.assert_session_state(client_id, "open") + time.sleep(session_timeout * 1.5) # Long enough for MDS to consider session stale + self.assertFalse(write_blocked.finished) + self.assert_session_state(client_id, "stale") + + # ...until we re-enable I/O + self.fs.set_clients_block(False) + + # ...when it should complete promptly + a = time.time() + self.wait_until_true(lambda: write_blocked.finished, self.ms_max_backoff * 2) + write_blocked.wait() # Already know we're finished, wait() to raise exception on errors + recovery_time = time.time() - a + log.info("recovery time: {0}".format(recovery_time)) + self.assert_session_state(client_id, "open") + + +class TestClientRecovery(CephFSTestCase): + CLIENTS_REQUIRED = 2 + + LOAD_SETTINGS = ["mds_reconnect_timeout", "ms_max_backoff"] + + # Environment references + mds_reconnect_timeout = None + ms_max_backoff = None + + def test_basic(self): + # Check that two clients come up healthy and see each others' files + # ===================================================== + self.mount_a.create_files() + self.mount_a.check_files() + self.mount_a.umount_wait() + + self.mount_b.check_files() + + self.mount_a.mount_wait() + + # Check that the admin socket interface is correctly reporting + # two sessions + # ===================================================== + ls_data = self._session_list() + self.assert_session_count(2, ls_data) + + self.assertSetEqual( + set([l['id'] for l in ls_data]), + {self.mount_a.get_global_id(), self.mount_b.get_global_id()} + ) + + def test_restart(self): + # Check that after an MDS restart both clients reconnect and continue + # to handle I/O + # ===================================================== + self.fs.mds_fail_restart() + self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) + + self.mount_a.create_destroy() + self.mount_b.create_destroy() + + def _session_num_caps(self, client_id): + ls_data = self.fs.mds_asok(['session', 'ls']) + return int(self._session_by_id(ls_data).get(client_id, {'num_caps': None})['num_caps']) + + def test_reconnect_timeout(self): + # Reconnect timeout + # ================= + # Check that if I stop an MDS and a client goes away, the MDS waits + # for the reconnect period + + mount_a_client_id = self.mount_a.get_global_id() + + self.fs.fail() + + self.mount_a.umount_wait(force=True) + + self.fs.set_joinable() + + self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE) + # Check that the MDS locally reports its state correctly + status = self.fs.mds_asok(['status']) + self.assertIn("reconnect_status", status) + + ls_data = self._session_list() + self.assert_session_count(2, ls_data) + + # The session for the dead client should have the 'reconnect' flag set + 
self.assertTrue(self.get_session(mount_a_client_id)['reconnecting']) + + # Wait for the reconnect state to clear, this should take the + # reconnect timeout period. + in_reconnect_for = self.fs.wait_for_state('up:active', timeout=self.mds_reconnect_timeout * 2) + # Check that the period we waited to enter active is within a factor + # of two of the reconnect timeout. + self.assertGreater(in_reconnect_for, self.mds_reconnect_timeout // 2, + "Should have been in reconnect phase for {0} but only took {1}".format( + self.mds_reconnect_timeout, in_reconnect_for + )) + + self.assert_session_count(1) + + # Check that the client that timed out during reconnect can + # mount again and do I/O + self.mount_a.mount_wait() + self.mount_a.create_destroy() + + self.assert_session_count(2) + + def test_reconnect_eviction(self): + # Eviction during reconnect + # ========================= + mount_a_client_id = self.mount_a.get_global_id() + + self.fs.fail() + + # The mount goes away while the MDS is offline + self.mount_a.kill() + + # wait for it to die + time.sleep(5) + + self.fs.set_joinable() + + # Enter reconnect phase + self.fs.wait_for_state('up:reconnect', reject='up:active', timeout=MDS_RESTART_GRACE) + self.assert_session_count(2) + + # Evict the stuck client + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + self.assert_session_count(1) + + # Observe that we proceed to active phase without waiting full reconnect timeout + evict_til_active = self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) + # Once we evict the troublemaker, the reconnect phase should complete + # in well under the reconnect timeout. + self.assertLess(evict_til_active, self.mds_reconnect_timeout * 0.5, + "reconnect did not complete soon enough after eviction, took {0}".format( + evict_til_active + )) + + # We killed earlier so must clean up before trying to use again + self.mount_a.kill_cleanup() + + # Bring the client back + self.mount_a.mount_wait() + self.mount_a.create_destroy() + + def _test_stale_caps(self, write): + session_timeout = self.fs.get_var("session_timeout") + + # Capability release from stale session + # ===================================== + if write: + content = ''.join(random.choices(string.ascii_uppercase + string.digits, k=16)) + cap_holder = self.mount_a.open_background(content=content) + else: + content = '' + self.mount_a.run_shell(["touch", "background_file"]) + self.mount_a.umount_wait() + self.mount_a.mount_wait() + cap_holder = self.mount_a.open_background(write=False) + + self.assert_session_count(2) + mount_a_gid = self.mount_a.get_global_id() + + # Wait for the file to be visible from another client, indicating + # that mount_a has completed its network ops + self.mount_b.wait_for_visible(size=len(content)) + + # Simulate client death + self.mount_a.suspend_netns() + + # wait for it to die so it doesn't voluntarily release buffer cap + time.sleep(5) + + try: + # Now, after session_timeout seconds, the waiter should + # complete their operation when the MDS marks the holder's + # session stale. 
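+ # The elapsed handover time is sanity-checked below against
+ # [session_timeout / 2, session_timeout * 2], e.g. roughly 30-120
+ # seconds with a 60 second session_timeout.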
+ cap_waiter = self.mount_b.write_background() + a = time.time() + cap_waiter.wait() + b = time.time() + + # Should have succeeded + self.assertEqual(cap_waiter.exitstatus, 0) + + if write: + self.assert_session_count(1) + else: + self.assert_session_state(mount_a_gid, "stale") + + cap_waited = b - a + log.info("cap_waiter waited {0}s".format(cap_waited)) + self.assertTrue(session_timeout / 2.0 <= cap_waited <= session_timeout * 2.0, + "Capability handover took {0}, expected approx {1}".format( + cap_waited, session_timeout + )) + finally: + self.mount_a.resume_netns() # allow the mount to recover otherwise background proc is unkillable + self.mount_a._kill_background(cap_holder) + + def test_stale_read_caps(self): + self._test_stale_caps(False) + + def test_stale_write_caps(self): + self._test_stale_caps(True) + + def test_evicted_caps(self): + # Eviction while holding a capability + # =================================== + + session_timeout = self.fs.get_var("session_timeout") + + # Take out a write capability on a file on client A, + # and then immediately kill it. + cap_holder = self.mount_a.open_background() + mount_a_client_id = self.mount_a.get_global_id() + + # Wait for the file to be visible from another client, indicating + # that mount_a has completed its network ops + self.mount_b.wait_for_visible() + + # Simulate client death + self.mount_a.suspend_netns() + + # wait for it to die so it doesn't voluntarily release buffer cap + time.sleep(5) + + try: + # The waiter should get stuck waiting for the capability + # held on the MDS by the now-dead client A + cap_waiter = self.mount_b.write_background() + time.sleep(5) + self.assertFalse(cap_waiter.finished) + + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + # Now, because I evicted the old holder of the capability, it should + # immediately get handed over to the waiter + a = time.time() + cap_waiter.wait() + b = time.time() + cap_waited = b - a + log.info("cap_waiter waited {0}s".format(cap_waited)) + # This is the check that it happened 'now' rather than waiting + # for the session timeout + self.assertLess(cap_waited, session_timeout / 2.0, + "Capability handover took {0}, expected less than {1}".format( + cap_waited, session_timeout / 2.0 + )) + + finally: + self.mount_a.resume_netns() # allow the mount to recover otherwise background proc is unkillable + self.mount_a._kill_background(cap_holder) + + def test_trim_caps(self): + # Trim capability when reconnecting MDS + # =================================== + + count = 500 + # Create lots of files + for i in range(count): + self.mount_a.run_shell(["touch", "f{0}".format(i)]) + + # Populate mount_b's cache + self.mount_b.run_shell(["ls", "-l"]) + + client_id = self.mount_b.get_global_id() + num_caps = self._session_num_caps(client_id) + self.assertGreaterEqual(num_caps, count) + + # Restart MDS. 
client should trim its cache when reconnecting to the MDS + self.fs.mds_fail_restart() + self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) + + num_caps = self._session_num_caps(client_id) + self.assertLess(num_caps, count, + "should have less than {0} capabilities, have {1}".format( + count, num_caps + )) + + def _is_flockable(self): + a_version_str = get_package_version(self.mount_a.client_remote, "fuse") + b_version_str = get_package_version(self.mount_b.client_remote, "fuse") + flock_version_str = "2.9" + + version_regex = re.compile(r"[0-9\.]+") + a_result = version_regex.match(a_version_str) + self.assertTrue(a_result) + b_result = version_regex.match(b_version_str) + self.assertTrue(b_result) + a_version = version.StrictVersion(a_result.group()) + b_version = version.StrictVersion(b_result.group()) + flock_version=version.StrictVersion(flock_version_str) + + if (a_version >= flock_version and b_version >= flock_version): + log.info("flock locks are available") + return True + else: + log.info("not testing flock locks, machines have versions {av} and {bv}".format( + av=a_version_str,bv=b_version_str)) + return False + + def test_filelock(self): + """ + Check that file lock doesn't get lost after an MDS restart + """ + + flockable = self._is_flockable() + lock_holder = self.mount_a.lock_background(do_flock=flockable) + + self.mount_b.wait_for_visible("background_file-2") + self.mount_b.check_filelock(do_flock=flockable) + + self.fs.mds_fail_restart() + self.fs.wait_for_state('up:active', timeout=MDS_RESTART_GRACE) + + self.mount_b.check_filelock(do_flock=flockable) + + self.mount_a._kill_background(lock_holder) + + def test_filelock_eviction(self): + """ + Check that file lock held by evicted client is given to + waiting client. + """ + if not self._is_flockable(): + self.skipTest("flock is not available") + + lock_holder = self.mount_a.lock_background() + self.mount_b.wait_for_visible("background_file-2") + self.mount_b.check_filelock() + + lock_taker = self.mount_b.lock_and_release() + # Check the taker is waiting (doesn't get it immediately) + time.sleep(2) + self.assertFalse(lock_holder.finished) + self.assertFalse(lock_taker.finished) + + try: + mount_a_client_id = self.mount_a.get_global_id() + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + + # Evicting mount_a should let mount_b's attempt to take the lock + # succeed + self.wait_until_true(lambda: lock_taker.finished, timeout=10) + finally: + self.mount_a._kill_background(lock_holder) + + # teardown() doesn't quite handle this case cleanly, so help it out + self.mount_a.kill() + self.mount_a.kill_cleanup() + + # Bring the client back + self.mount_a.mount_wait() + + def test_dir_fsync(self): + self._test_fsync(True); + + def test_create_fsync(self): + self._test_fsync(False); + + def _test_fsync(self, dirfsync): + """ + That calls to fsync guarantee visibility of metadata to another + client immediately after the fsyncing client dies. 
+ """ + + # Leave this guy out until he's needed + self.mount_b.umount_wait() + + # Create dir + child dentry on client A, and fsync the dir + path = os.path.join(self.mount_a.mountpoint, "subdir") + self.mount_a.run_python( + dedent(""" + import os + import time + + path = "{path}" + + print("Starting creation...") + start = time.time() + + os.mkdir(path) + dfd = os.open(path, os.O_DIRECTORY) + + fd = open(os.path.join(path, "childfile"), "w") + print("Finished creation in {{0}}s".format(time.time() - start)) + + print("Starting fsync...") + start = time.time() + if {dirfsync}: + os.fsync(dfd) + else: + os.fsync(fd) + print("Finished fsync in {{0}}s".format(time.time() - start)) + """.format(path=path,dirfsync=str(dirfsync))) + ) + + # Immediately kill the MDS and then client A + self.fs.fail() + self.mount_a.kill() + self.mount_a.kill_cleanup() + + # Restart the MDS. Wait for it to come up, it'll have to time out in clientreplay + self.fs.set_joinable() + log.info("Waiting for reconnect...") + self.fs.wait_for_state("up:reconnect") + log.info("Waiting for active...") + self.fs.wait_for_state("up:active", timeout=MDS_RESTART_GRACE + self.mds_reconnect_timeout) + log.info("Reached active...") + + # Is the child dentry visible from mount B? + self.mount_b.mount_wait() + self.mount_b.run_shell(["ls", "subdir/childfile"]) + + def test_unmount_for_evicted_client(self): + """Test if client hangs on unmount after evicting the client.""" + mount_a_client_id = self.mount_a.get_global_id() + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + + self.mount_a.umount_wait(require_clean=True, timeout=30) + + def test_mount_after_evicted_client(self): + """Test if a new mount of same fs works after client eviction.""" + + # trash this : we need it to use same remote as mount_a + self.mount_b.umount_wait() + + cl = self.mount_a.__class__ + + # create a new instance of mount_a's class with most of the + # same settings, but mounted on mount_b's mountpoint. 
+ m = cl(ctx=self.mount_a.ctx, + client_config=self.mount_a.client_config, + test_dir=self.mount_a.test_dir, + client_id=self.mount_a.client_id, + client_remote=self.mount_a.client_remote, + client_keyring_path=self.mount_a.client_keyring_path, + cephfs_name=self.mount_a.cephfs_name, + cephfs_mntpt= self.mount_a.cephfs_mntpt, + hostfs_mntpt=self.mount_b.hostfs_mntpt, + brxnet=self.mount_a.ceph_brx_net) + + # evict mount_a + mount_a_client_id = self.mount_a.get_global_id() + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + + m.mount_wait() + m.create_files() + m.check_files() + m.umount_wait(require_clean=True) + + def test_stale_renew(self): + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to handle signal STOP/CONT") + + session_timeout = self.fs.get_var("session_timeout") + + self.mount_a.run_shell(["mkdir", "testdir"]) + self.mount_a.run_shell(["touch", "testdir/file1"]) + # populate readdir cache + self.mount_a.run_shell(["ls", "testdir"]) + self.mount_b.run_shell(["ls", "testdir"]) + + # check if readdir cache is effective + initial_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency']) + self.mount_b.run_shell(["ls", "testdir"]) + current_readdirs = self.fs.mds_asok(['perf', 'dump', 'mds_server', 'req_readdir_latency']) + self.assertEqual(current_readdirs, initial_readdirs); + + mount_b_gid = self.mount_b.get_global_id() + # stop ceph-fuse process of mount_b + self.mount_b.suspend_netns() + + self.assert_session_state(mount_b_gid, "open") + time.sleep(session_timeout * 1.5) # Long enough for MDS to consider session stale + + self.mount_a.run_shell(["touch", "testdir/file2"]) + self.assert_session_state(mount_b_gid, "stale") + + # resume ceph-fuse process of mount_b + self.mount_b.resume_netns() + # Is the new file visible from mount_b? (caps become invalid after session stale) + self.mount_b.run_shell(["ls", "testdir/file2"]) + + def test_abort_conn(self): + """ + Check that abort_conn() skips closing mds sessions. + """ + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Testing libcephfs function") + + self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false']) + session_timeout = self.fs.get_var("session_timeout") + + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + gid_str = self.mount_a.run_python(dedent(""" + import cephfs as libcephfs + cephfs = libcephfs.LibCephFS(conffile='') + cephfs.mount() + client_id = cephfs.get_instance_id() + cephfs.abort_conn() + print(client_id) + """) + ) + gid = int(gid_str); + + self.assert_session_state(gid, "open") + time.sleep(session_timeout * 1.5) # Long enough for MDS to consider session stale + self.assert_session_state(gid, "stale") + + def test_dont_mark_unresponsive_client_stale(self): + """ + Test that an unresponsive client holding caps is not marked stale or + evicted unless another clients wants its caps. + """ + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client to handle signal STOP/CONT") + + # XXX: To conduct this test we need at least two clients since a + # single client is never evcited by MDS. 
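+ # Timeline exercised here: with session_timeout = 30 and
+ # session_autoclose = 50, the SIGSTOPped client must still be reported
+ # "open" after the timeout expires, and only loses its session once
+ # mount_b actually asks for the caps it holds, before autoclose.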
+ SESSION_TIMEOUT = 30 + SESSION_AUTOCLOSE = 50 + time_at_beg = time.time() + mount_a_gid = self.mount_a.get_global_id() + _ = self.mount_a.client_pid + self.fs.set_var('session_timeout', SESSION_TIMEOUT) + self.fs.set_var('session_autoclose', SESSION_AUTOCLOSE) + self.assert_session_count(2, self.fs.mds_asok(['session', 'ls'])) + + # test that client holding cap not required by any other client is not + # marked stale when it becomes unresponsive. + self.mount_a.run_shell(['mkdir', 'dir']) + self.mount_a.send_signal('sigstop') + time.sleep(SESSION_TIMEOUT + 2) + self.assert_session_state(mount_a_gid, "open") + + # test that other clients have to wait to get the caps from + # unresponsive client until session_autoclose. + self.mount_b.run_shell(['stat', 'dir']) + self.assert_session_count(1, self.fs.mds_asok(['session', 'ls'])) + self.assertLess(time.time(), time_at_beg + SESSION_AUTOCLOSE) + + self.mount_a.send_signal('sigcont') + + def test_config_session_timeout(self): + self.fs.mds_asok(['config', 'set', 'mds_defer_session_stale', 'false']) + session_timeout = self.fs.get_var("session_timeout") + mount_a_gid = self.mount_a.get_global_id() + + self.fs.mds_asok(['session', 'config', '%s' % mount_a_gid, 'timeout', '%s' % (session_timeout * 2)]) + + self.mount_a.kill(); + + self.assert_session_count(2) + + time.sleep(session_timeout * 1.5) + self.assert_session_state(mount_a_gid, "open") + + time.sleep(session_timeout) + self.assert_session_count(1) + + self.mount_a.kill_cleanup() + + def test_reconnect_after_blocklisted(self): + """ + Test reconnect after blocklisted. + - writing to a fd that was opened before blocklist should return -EBADF + - reading/writing to a file with lost file locks should return -EIO + - readonly fd should continue to work + """ + + self.mount_a.umount_wait() + + if isinstance(self.mount_a, FuseMount): + self.mount_a.mount_wait(mntargs=['--client_reconnect_stale=1', '--fuse_disable_pagecache=1']) + else: + try: + self.mount_a.mount_wait(mntopts=['recover_session=clean']) + except CommandFailedError: + self.mount_a.kill_cleanup() + self.skipTest("Not implemented in current kernel") + + self.mount_a.wait_until_mounted() + + path = os.path.join(self.mount_a.mountpoint, 'testfile_reconnect_after_blocklisted') + pyscript = dedent(""" + import os + import sys + import fcntl + import errno + import time + + fd1 = os.open("{path}.1", os.O_RDWR | os.O_CREAT, 0O666) + fd2 = os.open("{path}.1", os.O_RDONLY) + fd3 = os.open("{path}.2", os.O_RDWR | os.O_CREAT, 0O666) + fd4 = os.open("{path}.2", os.O_RDONLY) + + os.write(fd1, b'content') + os.read(fd2, 1); + + os.write(fd3, b'content') + os.read(fd4, 1); + fcntl.flock(fd4, fcntl.LOCK_SH | fcntl.LOCK_NB) + + print("blocklist") + sys.stdout.flush() + + sys.stdin.readline() + + # wait for mds to close session + time.sleep(10); + + # trigger 'open session' message. 
kclient relies on 'session reject' message + # to detect if itself is blocklisted + try: + os.stat("{path}.1") + except: + pass + + # wait for auto reconnect + time.sleep(10); + + try: + os.write(fd1, b'content') + except OSError as e: + if e.errno != errno.EBADF: + raise + else: + raise RuntimeError("write() failed to raise error") + + os.read(fd2, 1); + + try: + os.read(fd4, 1) + except OSError as e: + if e.errno != errno.EIO: + raise + else: + raise RuntimeError("read() failed to raise error") + """).format(path=path) + rproc = self.mount_a.client_remote.run( + args=['python3', '-c', pyscript], + wait=False, stdin=run.PIPE, stdout=run.PIPE) + + rproc.stdout.readline() + + mount_a_client_id = self.mount_a.get_global_id() + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + + rproc.stdin.writelines(['done\n']) + rproc.stdin.flush() + + rproc.wait() + self.assertEqual(rproc.exitstatus, 0) + + def test_refuse_client_session(self): + """ + Test that client cannot start session when file system flag + refuse_client_session is set + """ + + self.mount_a.umount_wait() + self.fs.set_refuse_client_session(True) + with self.assertRaises(CommandFailedError): + self.mount_a.mount_wait() + + def test_refuse_client_session_on_reconnect(self): + """ + Test that client cannot reconnect when filesystem comes online and + file system flag refuse_client_session is set + """ + + self.mount_a.create_files() + self.mount_a.check_files() + + self.fs.fail() + self.fs.set_refuse_client_session(True) + self.fs.set_joinable() + with self.assert_cluster_log('client could not reconnect as' + ' file system flag' + ' refuse_client_session is set'): + time.sleep(self.fs.get_var("session_timeout") * 1.5) + self.assertEqual(len(self.fs.mds_tell(["session", "ls"])), 0) + self.mount_a.umount_wait(force=True) + diff --git a/qa/tasks/cephfs/test_damage.py b/qa/tasks/cephfs/test_damage.py new file mode 100644 index 000000000..bfaa23453 --- /dev/null +++ b/qa/tasks/cephfs/test_damage.py @@ -0,0 +1,663 @@ +from io import BytesIO, StringIO +import json +import logging +import errno +import re +import time +from teuthology.contextutil import MaxWhileTries +from teuthology.exceptions import CommandFailedError +from teuthology.orchestra.run import wait +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology + +DAMAGED_ON_START = "damaged_on_start" +DAMAGED_ON_LS = "damaged_on_ls" +CRASHED = "server crashed" +NO_DAMAGE = "no damage" +READONLY = "readonly" +FAILED_CLIENT = "client failed" +FAILED_SERVER = "server failed" + +# An EIO in response to a stat from the client +EIO_ON_LS = "eio" + +# An EIO, but nothing in damage table (not ever what we expect) +EIO_NO_DAMAGE = "eio without damage entry" + + +log = logging.getLogger(__name__) + + +class TestDamage(CephFSTestCase): + def _simple_workload_write(self): + self.mount_a.run_shell(["mkdir", "subdir"]) + self.mount_a.write_n_mb("subdir/sixmegs", 6) + return self.mount_a.stat("subdir/sixmegs") + + def is_marked_damaged(self, rank): + mds_map = self.fs.get_mds_map() + return rank in mds_map['damaged'] + + @for_teuthology #459s + def test_object_deletion(self): + """ + That the MDS has a clean 'damaged' response to loss of any single metadata object + """ + + self._simple_workload_write() + + # Hmm, actually it would be nice to permute whether the metadata pool + # state contains sessions or not, but for the moment close this session + # to avoid waiting through reconnect on every MDS start. 
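+ # The steps below snapshot a known-good metadata pool: flush the MDS
+ # journals, take the file system down and 'rados export' the pool, so
+ # that each mutation case can 'rados import' a pristine copy before
+ # injecting its particular piece of damage.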
+ self.mount_a.umount_wait() + for mds_name in self.fs.get_active_names(): + self.fs.mds_asok(["flush", "journal"], mds_name) + + self.fs.fail() + + serialized = self.fs.radosmo(['export', '-']) + + def is_ignored(obj_id, dentry=None): + """ + A filter to avoid redundantly mutating many similar objects (e.g. + stray dirfrags) or similar dentries (e.g. stray dir dentries) + """ + if re.match("60.\.00000000", obj_id) and obj_id != "600.00000000": + return True + + if dentry and obj_id == "100.00000000": + if re.match("stray.+_head", dentry) and dentry != "stray0_head": + return True + + return False + + def get_path(obj_id, dentry=None): + """ + What filesystem path does this object or dentry correspond to? i.e. + what should I poke to see EIO after damaging it? + """ + + if obj_id == "1.00000000" and dentry == "subdir_head": + return "./subdir" + elif obj_id == "10000000000.00000000" and dentry == "sixmegs_head": + return "./subdir/sixmegs" + + # None means ls will do an "ls -R" in hope of seeing some errors + return None + + objects = self.fs.radosmo(["ls"], stdout=StringIO()).strip().split("\n") + objects = [o for o in objects if not is_ignored(o)] + + # Find all objects with an OMAP header + omap_header_objs = [] + for o in objects: + header = self.fs.radosmo(["getomapheader", o], stdout=StringIO()) + # The rados CLI wraps the header output in a hex-printed style + header_bytes = int(re.match("header \((.+) bytes\)", header).group(1)) + if header_bytes > 0: + omap_header_objs.append(o) + + # Find all OMAP key/vals + omap_keys = [] + for o in objects: + keys_str = self.fs.radosmo(["listomapkeys", o], stdout=StringIO()) + if keys_str: + for key in keys_str.strip().split("\n"): + if not is_ignored(o, key): + omap_keys.append((o, key)) + + # Find objects that have data in their bodies + data_objects = [] + for obj_id in objects: + stat_out = self.fs.radosmo(["stat", obj_id], stdout=StringIO()) + size = int(re.match(".+, size (.+)$", stat_out).group(1)) + if size > 0: + data_objects.append(obj_id) + + # Define the various forms of damage we will inflict + class MetadataMutation(object): + def __init__(self, obj_id_, desc_, mutate_fn_, expectation_, ls_path=None): + self.obj_id = obj_id_ + self.desc = desc_ + self.mutate_fn = mutate_fn_ + self.expectation = expectation_ + if ls_path is None: + self.ls_path = "." 
+ else: + self.ls_path = ls_path + + def __eq__(self, other): + return self.desc == other.desc + + def __hash__(self): + return hash(self.desc) + + junk = "deadbeef" * 10 + mutations = [] + + # Removals + for o in objects: + if o in [ + # JournalPointers are auto-replaced if missing (same path as upgrade) + "400.00000000", + # Missing dirfrags for non-system dirs result in empty directory + "10000000000.00000000", + # PurgeQueue is auto-created if not found on startup + "500.00000000", + # open file table is auto-created if not found on startup + "mds0_openfiles.0" + ]: + expectation = NO_DAMAGE + else: + expectation = DAMAGED_ON_START + + log.info("Expectation on rm '{0}' will be '{1}'".format( + o, expectation + )) + + mutations.append(MetadataMutation( + o, + "Delete {0}".format(o), + lambda o=o: self.fs.radosm(["rm", o]), + expectation + )) + + # Blatant corruptions + for obj_id in data_objects: + if obj_id == "500.00000000": + # purge queue corruption results in read-only FS + mutations.append(MetadataMutation( + obj_id, + "Corrupt {0}".format(obj_id), + lambda o=obj_id: self.fs.radosm(["put", o, "-"], stdin=StringIO(junk)), + READONLY + )) + else: + mutations.append(MetadataMutation( + obj_id, + "Corrupt {0}".format(obj_id), + lambda o=obj_id: self.fs.radosm(["put", o, "-"], stdin=StringIO(junk)), + DAMAGED_ON_START + )) + + # Truncations + for o in data_objects: + if o == "500.00000000": + # The PurgeQueue is allowed to be empty: Journaler interprets + # an empty header object as an empty journal. + expectation = NO_DAMAGE + else: + expectation = DAMAGED_ON_START + + mutations.append( + MetadataMutation( + o, + "Truncate {0}".format(o), + lambda o=o: self.fs.radosm(["truncate", o, "0"]), + expectation + )) + + # OMAP value corruptions + for o, k in omap_keys: + if o.startswith("100."): + # Anything in rank 0's 'mydir' + expectation = DAMAGED_ON_START + else: + expectation = EIO_ON_LS + + mutations.append( + MetadataMutation( + o, + "Corrupt omap key {0}:{1}".format(o, k), + lambda o=o,k=k: self.fs.radosm(["setomapval", o, k, junk]), + expectation, + get_path(o, k) + ) + ) + + # OMAP header corruptions + for o in omap_header_objs: + if re.match("60.\.00000000", o) \ + or o in ["1.00000000", "100.00000000", "mds0_sessionmap"]: + expectation = DAMAGED_ON_START + else: + expectation = NO_DAMAGE + + log.info("Expectation on corrupt header '{0}' will be '{1}'".format( + o, expectation + )) + + mutations.append( + MetadataMutation( + o, + "Corrupt omap header on {0}".format(o), + lambda o=o: self.fs.radosm(["setomapheader", o, junk]), + expectation + ) + ) + + results = {} + + for mutation in mutations: + log.info("Applying mutation '{0}'".format(mutation.desc)) + + # Reset MDS state + self.mount_a.umount_wait(force=True) + self.fs.fail() + self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0') + + # Reset RADOS pool state + self.fs.radosm(['import', '-'], stdin=BytesIO(serialized)) + + # Inject the mutation + mutation.mutate_fn() + + # Try starting the MDS + self.fs.set_joinable() + + # How long we'll wait between starting a daemon and expecting + # it to make it through startup, and potentially declare itself + # damaged to the mon cluster. 
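+ # A 60s timeout is a heuristic: long enough for replay/reconnect/rejoin
+ # in QA, short enough that a wedged daemon fails the test promptly.
+ # wait_until_true() raises RuntimeError on timeout, which the except
+ # blocks below use to classify the outcome (CRASHED, FAILED_SERVER, ...).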
+ startup_timeout = 60 + + if mutation.expectation not in (EIO_ON_LS, DAMAGED_ON_LS, NO_DAMAGE): + if mutation.expectation == DAMAGED_ON_START: + # The MDS may pass through active before making it to damaged + try: + self.wait_until_true(lambda: self.is_marked_damaged(0), startup_timeout) + except RuntimeError: + pass + + # Wait for MDS to either come up or go into damaged state + try: + self.wait_until_true(lambda: self.is_marked_damaged(0) or self.fs.are_daemons_healthy(), startup_timeout) + except RuntimeError: + crashed = False + # Didn't make it to healthy or damaged, did it crash? + for daemon_id, daemon in self.fs.mds_daemons.items(): + if daemon.proc and daemon.proc.finished: + crashed = True + log.error("Daemon {0} crashed!".format(daemon_id)) + daemon.proc = None # So that subsequent stop() doesn't raise error + if not crashed: + # Didn't go health, didn't go damaged, didn't crash, so what? + raise + else: + log.info("Result: Mutation '{0}' led to crash".format(mutation.desc)) + results[mutation] = CRASHED + continue + if self.is_marked_damaged(0): + log.info("Result: Mutation '{0}' led to DAMAGED state".format(mutation.desc)) + results[mutation] = DAMAGED_ON_START + continue + else: + log.info("Mutation '{0}' did not prevent MDS startup, attempting ls...".format(mutation.desc)) + else: + try: + self.wait_until_true(self.fs.are_daemons_healthy, 60) + except RuntimeError: + log.info("Result: Mutation '{0}' should have left us healthy, actually not.".format(mutation.desc)) + if self.is_marked_damaged(0): + results[mutation] = DAMAGED_ON_START + else: + results[mutation] = FAILED_SERVER + continue + log.info("Daemons came up after mutation '{0}', proceeding to ls".format(mutation.desc)) + + # MDS is up, should go damaged on ls or client mount + self.mount_a.mount_wait() + if mutation.ls_path == ".": + proc = self.mount_a.run_shell(["ls", "-R", mutation.ls_path], wait=False) + else: + proc = self.mount_a.stat(mutation.ls_path, wait=False) + + if mutation.expectation == DAMAGED_ON_LS: + try: + self.wait_until_true(lambda: self.is_marked_damaged(0), 60) + log.info("Result: Mutation '{0}' led to DAMAGED state after ls".format(mutation.desc)) + results[mutation] = DAMAGED_ON_LS + except RuntimeError: + if self.fs.are_daemons_healthy(): + log.error("Result: Failed to go damaged on mutation '{0}', actually went active".format( + mutation.desc)) + results[mutation] = NO_DAMAGE + else: + log.error("Result: Failed to go damaged on mutation '{0}'".format(mutation.desc)) + results[mutation] = FAILED_SERVER + elif mutation.expectation == READONLY: + proc = self.mount_a.run_shell(["mkdir", "foo"], wait=False) + try: + proc.wait() + except CommandFailedError: + stderr = proc.stderr.getvalue() + log.info(stderr) + if "Read-only file system".lower() in stderr.lower(): + pass + else: + raise + else: + try: + wait([proc], 20) + log.info("Result: Mutation '{0}' did not caused DAMAGED state".format(mutation.desc)) + results[mutation] = NO_DAMAGE + except MaxWhileTries: + log.info("Result: Failed to complete client IO on mutation '{0}'".format(mutation.desc)) + results[mutation] = FAILED_CLIENT + except CommandFailedError as e: + if e.exitstatus == errno.EIO: + log.info("Result: EIO on client") + results[mutation] = EIO_ON_LS + else: + log.info("Result: unexpected error {0} on client".format(e)) + results[mutation] = FAILED_CLIENT + + if mutation.expectation == EIO_ON_LS: + # EIOs mean something handled by DamageTable: assert that it has + # been populated + damage = json.loads( + 
self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), "damage", "ls", '--format=json-pretty')) + if len(damage) == 0: + results[mutation] = EIO_NO_DAMAGE + + failures = [(mutation, result) for (mutation, result) in results.items() if mutation.expectation != result] + if failures: + log.error("{0} mutations had unexpected outcomes:".format(len(failures))) + for mutation, result in failures: + log.error(" Expected '{0}' actually '{1}' from '{2}'".format( + mutation.expectation, result, mutation.desc + )) + raise RuntimeError("{0} mutations had unexpected outcomes".format(len(failures))) + else: + log.info("All {0} mutations had expected outcomes".format(len(mutations))) + + def test_damaged_dentry(self): + # Damage to dentrys is interesting because it leaves the + # directory's `complete` flag in a subtle state where + # we have marked the dir complete in order that folks + # can access it, but in actual fact there is a dentry + # missing + self.mount_a.run_shell(["mkdir", "subdir/"]) + + self.mount_a.run_shell(["touch", "subdir/file_undamaged"]) + self.mount_a.run_shell(["touch", "subdir/file_to_be_damaged"]) + + subdir_ino = self.mount_a.path_to_ino("subdir") + + self.mount_a.umount_wait() + for mds_name in self.fs.get_active_names(): + self.fs.mds_asok(["flush", "journal"], mds_name) + + self.fs.fail() + + # Corrupt a dentry + junk = "deadbeef" * 10 + dirfrag_obj = "{0:x}.00000000".format(subdir_ino) + self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk]) + + # Start up and try to list it + self.fs.set_joinable() + self.fs.wait_for_daemons() + + self.mount_a.mount_wait() + dentries = self.mount_a.ls("subdir/") + + # The damaged guy should have disappeared + self.assertEqual(dentries, ["file_undamaged"]) + + # I should get ENOENT if I try and read it normally, because + # the dir is considered complete + try: + self.mount_a.stat("subdir/file_to_be_damaged", wait=True) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + raise AssertionError("Expected ENOENT") + + # The fact that there is damaged should have bee recorded + damage = json.loads( + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "ls", '--format=json-pretty')) + self.assertEqual(len(damage), 1) + damage_id = damage[0]['id'] + + # If I try to create a dentry with the same name as the damaged guy + # then that should be forbidden + try: + self.mount_a.touch("subdir/file_to_be_damaged") + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EIO) + else: + raise AssertionError("Expected EIO") + + # Attempting that touch will clear the client's complete flag, now + # when I stat it I'll get EIO instead of ENOENT + try: + self.mount_a.stat("subdir/file_to_be_damaged", wait=True) + except CommandFailedError as e: + if isinstance(self.mount_a, FuseMount): + self.assertEqual(e.exitstatus, errno.EIO) + else: + # Old kernel client handles this case differently + self.assertIn(e.exitstatus, [errno.ENOENT, errno.EIO]) + else: + raise AssertionError("Expected EIO") + + nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files") + self.assertEqual(nfiles, "2") + + self.mount_a.umount_wait() + + # Now repair the stats + scrub_json = self.fs.run_scrub(["start", "/subdir", "repair"]) + log.info(json.dumps(scrub_json, indent=2)) + + self.assertNotEqual(scrub_json, None) + self.assertEqual(scrub_json["return_code"], 0) + 
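+ # "scrub start" returns immediately; the scrub_tag in its JSON output is
+ # what wait_until_scrub_complete() polls on until the background scrub
+ # (and its repairs) have finished.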
self.assertEqual(self.fs.wait_until_scrub_complete(tag=scrub_json["scrub_tag"]), True) + + # Check that the file count is now correct + self.mount_a.mount_wait() + nfiles = self.mount_a.getfattr("./subdir", "ceph.dir.files") + self.assertEqual(nfiles, "1") + + # Clean up the omap object + self.fs.radosm(["setomapval", dirfrag_obj, "file_to_be_damaged_head", junk]) + + # Clean up the damagetable entry + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "rm", "{did}".format(did=damage_id)) + + # Now I should be able to create a file with the same name as the + # damaged guy if I want. + self.mount_a.touch("subdir/file_to_be_damaged") + + def test_open_ino_errors(self): + """ + That errors encountered during opening inos are properly propagated + """ + + self.mount_a.run_shell(["mkdir", "dir1"]) + self.mount_a.run_shell(["touch", "dir1/file1"]) + self.mount_a.run_shell(["mkdir", "dir2"]) + self.mount_a.run_shell(["touch", "dir2/file2"]) + self.mount_a.run_shell(["mkdir", "testdir"]) + self.mount_a.run_shell(["ln", "dir1/file1", "testdir/hardlink1"]) + self.mount_a.run_shell(["ln", "dir2/file2", "testdir/hardlink2"]) + + file1_ino = self.mount_a.path_to_ino("dir1/file1") + file2_ino = self.mount_a.path_to_ino("dir2/file2") + dir2_ino = self.mount_a.path_to_ino("dir2") + + # Ensure everything is written to backing store + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"]) + + # Drop everything from the MDS cache + self.fs.fail() + self.fs.journal_tool(['journal', 'reset'], 0) + self.fs.set_joinable() + self.fs.wait_for_daemons() + + self.mount_a.mount_wait() + + # Case 1: un-decodeable backtrace + + # Validate that the backtrace is present and decodable + self.fs.read_backtrace(file1_ino) + # Go corrupt the backtrace of alpha/target (used for resolving + # bravo/hardlink). 
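+ # Backtraces are stored in the "parent" xattr of the inode's first data
+ # object, so overwriting that xattr with junk (below) makes the backtrace
+ # undecodable without touching the file's actual data.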
+ self.fs._write_data_xattr(file1_ino, "parent", "rhubarb") + + # Check that touching the hardlink gives EIO + ran = self.mount_a.run_shell(["stat", "testdir/hardlink1"], wait=False) + try: + ran.wait() + except CommandFailedError: + self.assertTrue("Input/output error" in ran.stderr.getvalue()) + + # Check that an entry is created in the damage table + damage = json.loads( + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "ls", '--format=json-pretty')) + self.assertEqual(len(damage), 1) + self.assertEqual(damage[0]['damage_type'], "backtrace") + self.assertEqual(damage[0]['ino'], file1_ino) + + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "rm", str(damage[0]['id'])) + + + # Case 2: missing dirfrag for the target inode + + self.fs.radosm(["rm", "{0:x}.00000000".format(dir2_ino)]) + + # Check that touching the hardlink gives EIO + ran = self.mount_a.run_shell(["stat", "testdir/hardlink2"], wait=False) + try: + ran.wait() + except CommandFailedError: + self.assertTrue("Input/output error" in ran.stderr.getvalue()) + + # Check that an entry is created in the damage table + damage = json.loads( + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "ls", '--format=json-pretty')) + self.assertEqual(len(damage), 2) + if damage[0]['damage_type'] == "backtrace" : + self.assertEqual(damage[0]['ino'], file2_ino) + self.assertEqual(damage[1]['damage_type'], "dir_frag") + self.assertEqual(damage[1]['ino'], dir2_ino) + else: + self.assertEqual(damage[0]['damage_type'], "dir_frag") + self.assertEqual(damage[0]['ino'], dir2_ino) + self.assertEqual(damage[1]['damage_type'], "backtrace") + self.assertEqual(damage[1]['ino'], file2_ino) + + for entry in damage: + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[0]), + "damage", "rm", str(entry['id'])) + + def test_dentry_first_existing(self): + """ + That the MDS won't abort when the dentry is already known to be damaged. + """ + + def verify_corrupt(): + info = self.fs.read_cache("/a", 0) + log.debug('%s', info) + self.assertEqual(len(info), 1) + dirfrags = info[0]['dirfrags'] + self.assertEqual(len(dirfrags), 1) + dentries = dirfrags[0]['dentries'] + self.assertEqual([dn['path'] for dn in dentries if dn['is_primary']], ['a/c']) + self.assertEqual(dentries[0]['snap_first'], 18446744073709551606) # SNAP_HEAD + + self.mount_a.run_shell_payload("mkdir -p a/b") + self.fs.flush() + self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False) + self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0") + time.sleep(5) # for conf to percolate + self.mount_a.run_shell_payload("mv a/b a/c; sync .") + self.mount_a.umount() + verify_corrupt() + self.fs.fail() + self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first") + self.config_set("mds", "mds_abort_on_newly_corrupt_dentry", False) + self.fs.set_joinable() + status = self.fs.status() + self.fs.flush() + self.assertFalse(self.fs.status().hadfailover(status)) + verify_corrupt() + + def test_dentry_first_preflush(self): + """ + That the MDS won't write a dentry with new damage to CDentry::first + to the journal. 
+ """ + + rank0 = self.fs.get_rank() + self.fs.rank_freeze(True, rank=0) + self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d") + self.fs.flush() + self.config_set("mds", "mds_inject_rename_corrupt_dentry_first", "1.0") + time.sleep(5) # for conf to percolate + with self.assert_cluster_log("MDS abort because newly corrupt dentry"): + p = self.mount_a.run_shell_payload("timeout 60 mv a/b a/z", wait=False) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout) + self.config_rm("mds", "mds_inject_rename_corrupt_dentry_first") + self.fs.rank_freeze(False, rank=0) + self.delete_mds_coredump(rank0['name']) + self.fs.mds_restart(rank0['name']) + self.fs.wait_for_daemons() + p.wait() + self.mount_a.run_shell_payload("stat a/ && find a/") + self.fs.flush() + + def test_dentry_first_precommit(self): + """ + That the MDS won't write a dentry with new damage to CDentry::first + to the directory object. + """ + + fscid = self.fs.id + self.mount_a.run_shell_payload("mkdir -p a/{b,c}/d; sync .") + self.mount_a.umount() # allow immediate scatter write back + self.fs.flush() + # now just twiddle some inode metadata on a regular file + self.mount_a.mount_wait() + self.mount_a.run_shell_payload("chmod 711 a/b/d; sync .") + self.mount_a.umount() # avoid journaling session related things + # okay, now cause the dentry to get damaged after loading from the journal + self.fs.fail() + self.config_set("mds", "mds_inject_journal_corrupt_dentry_first", "1.0") + time.sleep(5) # for conf to percolate + self.fs.set_joinable() + self.fs.wait_for_daemons() + rank0 = self.fs.get_rank() + self.fs.rank_freeze(True, rank=0) + # so now we want to trigger commit but this will crash, so: + with self.assert_cluster_log("MDS abort because newly corrupt dentry"): + c = ['--connect-timeout=60', 'tell', f"mds.{fscid}:0", "flush", "journal"] + p = self.ceph_cluster.mon_manager.run_cluster_cmd(args=c, wait=False, timeoutcmd=30) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(), timeout=self.fs.beacon_timeout) + self.config_rm("mds", "mds_inject_journal_corrupt_dentry_first") + self.fs.rank_freeze(False, rank=0) + self.delete_mds_coredump(rank0['name']) + self.fs.mds_restart(rank0['name']) + self.fs.wait_for_daemons() + try: + p.wait() + except CommandFailedError as e: + print(e) + else: + self.fail("flush journal should fail!") + self.mount_a.mount_wait() + self.mount_a.run_shell_payload("stat a/ && find a/") + self.fs.flush() diff --git a/qa/tasks/cephfs/test_data_scan.py b/qa/tasks/cephfs/test_data_scan.py new file mode 100644 index 000000000..9a93bd622 --- /dev/null +++ b/qa/tasks/cephfs/test_data_scan.py @@ -0,0 +1,796 @@ + +""" +Test our tools for recovering metadata from the data pool +""" +import json + +import logging +import os +import time +import traceback +import stat + +from io import BytesIO, StringIO +from collections import namedtuple, defaultdict +from textwrap import dedent + +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology + +log = logging.getLogger(__name__) + + +ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) + + +class Workload(object): + def __init__(self, filesystem, mount): + self._mount = mount + self._filesystem = filesystem + self._initial_state = None + + # Accumulate backtraces for every failed validation, and return them. 
Backtraces + # are rather verbose, but we only see them when something breaks, and they + # let us see which check failed without having to decorate each check with + # a string + self._errors = [] + + def assert_equal(self, a, b): + try: + if a != b: + raise AssertionError("{0} != {1}".format(a, b)) + except AssertionError as e: + self._errors.append( + ValidationError(e, traceback.format_exc(3)) + ) + + def assert_not_equal(self, a, b): + try: + if a == b: + raise AssertionError("{0} == {1}".format(a, b)) + except AssertionError as e: + self._errors.append( + ValidationError(e, traceback.format_exc(3)) + ) + + def assert_true(self, a): + try: + if not a: + raise AssertionError("{0} is not true".format(a)) + except AssertionError as e: + self._errors.append( + ValidationError(e, traceback.format_exc(3)) + ) + + def write(self): + """ + Write the workload files to the mount + """ + raise NotImplementedError() + + def validate(self): + """ + Read from the mount and validate that the workload files are present (i.e. have + survived or been reconstructed from the test scenario) + """ + raise NotImplementedError() + + def damage(self): + """ + Damage the filesystem pools in ways that will be interesting to recover from. By + default just wipe everything in the metadata pool + """ + # Delete every object in the metadata pool + pool = self._filesystem.get_metadata_pool_name() + self._filesystem.rados(["purge", pool, '--yes-i-really-really-mean-it']) + + def flush(self): + """ + Called after client unmount, after write: flush whatever you want + """ + self._filesystem.mds_asok(["flush", "journal"]) + + def scrub(self): + """ + Called as a final step post recovery before verification. Right now, this + doesn't bother if errors are found in scrub - just that the MDS doesn't + crash and burn during scrub. 
+ """ + out_json = self._filesystem.run_scrub(["start", "/", "repair,recursive"]) + self.assert_not_equal(out_json, None) + self.assert_equal(out_json["return_code"], 0) + self.assert_equal(self._filesystem.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + +class SimpleWorkload(Workload): + """ + Single file, single directory, check that it gets recovered and so does its size + """ + def write(self): + self._mount.run_shell(["mkdir", "subdir"]) + self._mount.write_n_mb("subdir/sixmegs", 6) + self._initial_state = self._mount.stat("subdir/sixmegs") + + def validate(self): + self._mount.run_shell(["sudo", "ls", "subdir"], omit_sudo=False) + st = self._mount.stat("subdir/sixmegs", sudo=True) + self.assert_equal(st['st_size'], self._initial_state['st_size']) + return self._errors + + +class SymlinkWorkload(Workload): + """ + Symlink file, check that it gets recovered as symlink + """ + def write(self): + self._mount.run_shell(["mkdir", "symdir"]) + self._mount.write_n_mb("symdir/onemegs", 1) + self._mount.run_shell(["ln", "-s", "onemegs", "symdir/symlink_onemegs"]) + self._mount.run_shell(["ln", "-s", "symdir/onemegs", "symlink1_onemegs"]) + + def validate(self): + self._mount.run_shell(["sudo", "ls", "symdir"], omit_sudo=False) + st = self._mount.lstat("symdir/symlink_onemegs") + self.assert_true(stat.S_ISLNK(st['st_mode'])) + target = self._mount.readlink("symdir/symlink_onemegs") + self.assert_equal(target, "onemegs") + + st = self._mount.lstat("symlink1_onemegs") + self.assert_true(stat.S_ISLNK(st['st_mode'])) + target = self._mount.readlink("symlink1_onemegs") + self.assert_equal(target, "symdir/onemegs") + return self._errors + + +class MovedFile(Workload): + def write(self): + # Create a file whose backtrace disagrees with his eventual position + # in the metadata. We will see that he gets reconstructed in his + # original position according to his backtrace. 
+ self._mount.run_shell(["mkdir", "subdir_alpha"]) + self._mount.run_shell(["mkdir", "subdir_bravo"]) + self._mount.write_n_mb("subdir_alpha/sixmegs", 6) + self._filesystem.mds_asok(["flush", "journal"]) + self._mount.run_shell(["mv", "subdir_alpha/sixmegs", "subdir_bravo/sixmegs"]) + self._initial_state = self._mount.stat("subdir_bravo/sixmegs") + + def flush(self): + pass + + def validate(self): + self.assert_equal(self._mount.ls(sudo=True), ["subdir_alpha"]) + st = self._mount.stat("subdir_alpha/sixmegs", sudo=True) + self.assert_equal(st['st_size'], self._initial_state['st_size']) + return self._errors + + +class BacktracelessFile(Workload): + def write(self): + self._mount.run_shell(["mkdir", "subdir"]) + self._mount.write_n_mb("subdir/sixmegs", 6) + self._initial_state = self._mount.stat("subdir/sixmegs") + + def flush(self): + # Never flush metadata, so backtrace won't be written + pass + + def validate(self): + ino_name = "%x" % self._initial_state["st_ino"] + + # The inode should be linked into lost+found because we had no path for it + self.assert_equal(self._mount.ls(sudo=True), ["lost+found"]) + self.assert_equal(self._mount.ls("lost+found", sudo=True), [ino_name]) + st = self._mount.stat(f"lost+found/{ino_name}", sudo=True) + + # We might not have got the name or path, but we should still get the size + self.assert_equal(st['st_size'], self._initial_state['st_size']) + + # remove the entry from lost+found directory + self._mount.run_shell(["sudo", "rm", "-f", f'lost+found/{ino_name}'], omit_sudo=False) + self.assert_equal(self._mount.ls("lost+found", sudo=True), []) + + return self._errors + + +class StripedStashedLayout(Workload): + def __init__(self, fs, m, pool=None): + super(StripedStashedLayout, self).__init__(fs, m) + + # Nice small stripes so we can quickly do our writes+validates + self.sc = 4 + self.ss = 65536 + self.os = 262144 + self.pool = pool and pool or self._filesystem.get_data_pool_name() + + self.interesting_sizes = [ + # Exactly stripe_count objects will exist + self.os * self.sc, + # Fewer than stripe_count objects will exist + self.os * self.sc // 2, + self.os * (self.sc - 1) + self.os // 2, + self.os * (self.sc - 1) + self.os // 2 - 1, + self.os * (self.sc + 1) + self.os // 2, + self.os * (self.sc + 1) + self.os // 2 + 1, + # More than stripe_count objects will exist + self.os * self.sc + self.os * self.sc // 2 + ] + + def write(self): + # Create a dir with a striped layout set on it + self._mount.run_shell(["mkdir", "stripey"]) + + self._mount.setfattr("./stripey", "ceph.dir.layout", + "stripe_unit={ss} stripe_count={sc} object_size={os} pool={pool}".format( + ss=self.ss, os=self.os, sc=self.sc, pool=self.pool + )) + + # Write files, then flush metadata so that its layout gets written into an xattr + for i, n_bytes in enumerate(self.interesting_sizes): + self._mount.write_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes) + # This is really just validating the validator + self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes) + self._filesystem.mds_asok(["flush", "journal"]) + + # Write another file in the same way, but this time don't flush the metadata, + # so that it won't have the layout xattr + self._mount.write_test_pattern("stripey/unflushed_file", 1024 * 512) + self._mount.validate_test_pattern("stripey/unflushed_file", 1024 * 512) + + self._initial_state = { + "unflushed_ino": self._mount.path_to_ino("stripey/unflushed_file") + } + + def flush(self): + # Pass because we already selectively flushed during write 
+ pass + + def validate(self): + # The first files should have been recovered into its original location + # with the correct layout: read back correct data + for i, n_bytes in enumerate(self.interesting_sizes): + try: + self._mount.validate_test_pattern("stripey/flushed_file_{0}".format(i), n_bytes) + except CommandFailedError as e: + self._errors.append( + ValidationError("File {0} (size {1}): {2}".format(i, n_bytes, e), traceback.format_exc(3)) + ) + + # The unflushed file should have been recovered into lost+found without + # the correct layout: read back junk + ino_name = "%x" % self._initial_state["unflushed_ino"] + self.assert_equal(self._mount.ls("lost+found", sudo=True), [ino_name]) + try: + self._mount.validate_test_pattern(os.path.join("lost+found", ino_name), 1024 * 512) + except CommandFailedError: + pass + else: + self._errors.append( + ValidationError("Unexpectedly valid data in unflushed striped file", "") + ) + + return self._errors + + +class ManyFilesWorkload(Workload): + def __init__(self, filesystem, mount, file_count): + super(ManyFilesWorkload, self).__init__(filesystem, mount) + self.file_count = file_count + + def write(self): + self._mount.run_shell(["mkdir", "subdir"]) + for n in range(0, self.file_count): + self._mount.write_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024) + + def validate(self): + for n in range(0, self.file_count): + try: + self._mount.validate_test_pattern("subdir/{0}".format(n), 6 * 1024 * 1024) + except CommandFailedError as e: + self._errors.append( + ValidationError("File {0}: {1}".format(n, e), traceback.format_exc(3)) + ) + + return self._errors + + +class MovedDir(Workload): + def write(self): + # Create a nested dir that we will then move. Two files with two different + # backtraces referring to the moved dir, claiming two different locations for + # it. We will see that only one backtrace wins and the dir ends up with + # single linkage. 
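+ # The two journal flushes below persist the two conflicting backtraces:
+ # one while "parent" still lives under grandmother, one after it has moved
+ # under grandfather. validate() accepts either directory as the winner; it
+ # only insists that exactly one of them survives.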
+ self._mount.run_shell(["mkdir", "-p", "grandmother/parent"]) + self._mount.write_n_mb("grandmother/parent/orig_pos_file", 1) + self._filesystem.mds_asok(["flush", "journal"]) + self._mount.run_shell(["mkdir", "grandfather"]) + self._mount.run_shell(["mv", "grandmother/parent", "grandfather"]) + self._mount.write_n_mb("grandfather/parent/new_pos_file", 2) + self._filesystem.mds_asok(["flush", "journal"]) + + self._initial_state = ( + self._mount.stat("grandfather/parent/orig_pos_file"), + self._mount.stat("grandfather/parent/new_pos_file") + ) + + def validate(self): + root_files = self._mount.ls() + self.assert_equal(len(root_files), 1) + self.assert_equal(root_files[0] in ["grandfather", "grandmother"], True) + winner = root_files[0] + st_opf = self._mount.stat(f"{winner}/parent/orig_pos_file", sudo=True) + st_npf = self._mount.stat(f"{winner}/parent/new_pos_file", sudo=True) + + self.assert_equal(st_opf['st_size'], self._initial_state[0]['st_size']) + self.assert_equal(st_npf['st_size'], self._initial_state[1]['st_size']) + + +class MissingZerothObject(Workload): + def write(self): + self._mount.run_shell(["mkdir", "subdir"]) + self._mount.write_n_mb("subdir/sixmegs", 6) + self._initial_state = self._mount.stat("subdir/sixmegs") + + def damage(self): + super(MissingZerothObject, self).damage() + zeroth_id = "{0:x}.00000000".format(self._initial_state['st_ino']) + self._filesystem.rados(["rm", zeroth_id], pool=self._filesystem.get_data_pool_name()) + + def validate(self): + ino = self._initial_state['st_ino'] + st = self._mount.stat(f"lost+found/{ino:x}", sudo=True) + self.assert_equal(st['st_size'], self._initial_state['st_size']) + + +class NonDefaultLayout(Workload): + """ + Check that the reconstruction copes with files that have a different + object size in their layout + """ + def write(self): + self._mount.run_shell(["touch", "datafile"]) + self._mount.setfattr("./datafile", "ceph.file.layout.object_size", "8388608") + self._mount.run_shell(["dd", "if=/dev/urandom", "of=./datafile", "bs=1M", "count=32"]) + self._initial_state = self._mount.stat("datafile") + + def validate(self): + # Check we got the layout reconstructed properly + object_size = int(self._mount.getfattr("./datafile", "ceph.file.layout.object_size", sudo=True)) + self.assert_equal(object_size, 8388608) + + # Check we got the file size reconstructed properly + st = self._mount.stat("datafile", sudo=True) + self.assert_equal(st['st_size'], self._initial_state['st_size']) + + +class TestDataScan(CephFSTestCase): + MDSS_REQUIRED = 2 + + def is_marked_damaged(self, rank): + mds_map = self.fs.get_mds_map() + return rank in mds_map['damaged'] + + def _rebuild_metadata(self, workload, workers=1): + """ + That when all objects in metadata pool are removed, we can rebuild a metadata pool + based on the contents of a data pool, and a client can see and read our files. 
+ """ + + # First, inject some files + + workload.write() + + # Unmount the client and flush the journal: the tool should also cope with + # situations where there is dirty metadata, but we'll test that separately + self.mount_a.umount_wait() + workload.flush() + + # Stop the MDS + self.fs.fail() + + # After recovery, we need the MDS to not be strict about stats (in production these options + # are off by default, but in QA we need to explicitly disable them) + self.fs.set_ceph_conf('mds', 'mds verify scatter', False) + self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) + + # Apply any data damage the workload wants + workload.damage() + + # Reset the MDS map in case multiple ranks were in play: recovery procedure + # only understands how to rebuild metadata under rank 0 + self.fs.reset() + + self.fs.set_joinable() # redundant with reset + + def get_state(mds_id): + info = self.mds_cluster.get_mds_info(mds_id) + return info['state'] if info is not None else None + + self.wait_until_true(lambda: self.is_marked_damaged(0), 60) + for mds_id in self.fs.mds_ids: + self.wait_until_equal( + lambda: get_state(mds_id), + "up:standby", + timeout=60) + + self.fs.table_tool([self.fs.name + ":0", "reset", "session"]) + self.fs.table_tool([self.fs.name + ":0", "reset", "snap"]) + self.fs.table_tool([self.fs.name + ":0", "reset", "inode"]) + + # Run the recovery procedure + if False: + with self.assertRaises(CommandFailedError): + # Normal reset should fail when no objects are present, we'll use --force instead + self.fs.journal_tool(["journal", "reset"], 0) + + self.fs.journal_tool(["journal", "reset", "--force"], 0) + self.fs.data_scan(["init"]) + self.fs.data_scan(["scan_extents"], worker_count=workers) + self.fs.data_scan(["scan_inodes"], worker_count=workers) + self.fs.data_scan(["scan_links"]) + + # Mark the MDS repaired + self.fs.mon_manager.raw_cluster_cmd('mds', 'repaired', '0') + + # Start the MDS + self.fs.mds_restart() + self.fs.wait_for_daemons() + log.info(str(self.mds_cluster.status())) + + # Mount a client + self.mount_a.mount_wait() + + # run scrub as it is recommended post recovery for most + # (if not all) recovery mechanisms. 
+ workload.scrub() + + # See that the files are present and correct + errors = workload.validate() + if errors: + log.error("Validation errors found: {0}".format(len(errors))) + for e in errors: + log.error(e.exception) + log.error(e.backtrace) + raise AssertionError("Validation failed, first error: {0}\n{1}".format( + errors[0].exception, errors[0].backtrace + )) + + def test_rebuild_simple(self): + self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a)) + + def test_rebuild_symlink(self): + self._rebuild_metadata(SymlinkWorkload(self.fs, self.mount_a)) + + def test_rebuild_moved_file(self): + self._rebuild_metadata(MovedFile(self.fs, self.mount_a)) + + def test_rebuild_backtraceless(self): + self._rebuild_metadata(BacktracelessFile(self.fs, self.mount_a)) + + def test_rebuild_moved_dir(self): + self._rebuild_metadata(MovedDir(self.fs, self.mount_a)) + + def test_rebuild_missing_zeroth(self): + self._rebuild_metadata(MissingZerothObject(self.fs, self.mount_a)) + + def test_rebuild_nondefault_layout(self): + self._rebuild_metadata(NonDefaultLayout(self.fs, self.mount_a)) + + def test_stashed_layout(self): + self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a)) + + def _dirfrag_keys(self, object_id): + keys_str = self.fs.radosmo(["listomapkeys", object_id], stdout=StringIO()) + if keys_str: + return keys_str.strip().split("\n") + else: + return [] + + def test_fragmented_injection(self): + """ + That when injecting a dentry into a fragmented directory, we put it in the right fragment. + """ + + file_count = 100 + file_names = ["%s" % n for n in range(0, file_count)] + + # Make sure and disable dirfrag auto merging and splitting + self.fs.set_ceph_conf('mds', 'mds bal merge size', 0) + self.fs.set_ceph_conf('mds', 'mds bal split size', 100 * file_count) + + # Create a directory of `file_count` files, each named after its + # decimal number and containing the string of its decimal number + self.mount_a.run_python(dedent(""" + import os + path = os.path.join("{path}", "subdir") + os.mkdir(path) + for n in range(0, {file_count}): + open(os.path.join(path, "%s" % n), 'w').write("%s" % n) + """.format( + path=self.mount_a.mountpoint, + file_count=file_count + ))) + + dir_ino = self.mount_a.path_to_ino("subdir") + + # Only one MDS should be active! 
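+ # The dirfrag split below is issued over the admin socket of one named
+ # MDS, so the test relies on a single active rank being authoritative
+ # for /subdir.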
+ self.assertEqual(len(self.fs.get_active_names()), 1) + + # Ensure that one directory is fragmented + mds_id = self.fs.get_active_names()[0] + self.fs.mds_asok(["dirfrag", "split", "/subdir", "0/0", "1"], mds_id) + + # Flush journal and stop MDS + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"], mds_id) + self.fs.fail() + + # Pick a dentry and wipe out its key + # Because I did a 1 bit split, I know one frag will be named <inode>.01000000 + frag_obj_id = "{0:x}.01000000".format(dir_ino) + keys = self._dirfrag_keys(frag_obj_id) + victim_key = keys[7] # arbitrary choice + log.info("victim_key={0}".format(victim_key)) + victim_dentry = victim_key.split("_head")[0] + self.fs.radosm(["rmomapkey", frag_obj_id, victim_key]) + + # Start filesystem back up, observe that the file appears to be gone in an `ls` + self.fs.set_joinable() + self.fs.wait_for_daemons() + self.mount_a.mount_wait() + files = self.mount_a.run_shell(["ls", "subdir/"]).stdout.getvalue().strip().split("\n") + self.assertListEqual(sorted(files), sorted(list(set(file_names) - set([victim_dentry])))) + + # Stop the filesystem + self.mount_a.umount_wait() + self.fs.fail() + + # Run data-scan, observe that it inserts our dentry back into the correct fragment + # by checking the omap now has the dentry's key again + self.fs.data_scan(["scan_extents"]) + self.fs.data_scan(["scan_inodes"]) + self.fs.data_scan(["scan_links"]) + self.assertIn(victim_key, self._dirfrag_keys(frag_obj_id)) + + # Start the filesystem and check that the dentry we deleted is now once again visible + # and points to the correct file data. + self.fs.set_joinable() + self.fs.wait_for_daemons() + self.mount_a.mount_wait() + self.mount_a.run_shell(["ls", "-l", "subdir/"]) # debugging + # Use sudo because cephfs-data-scan will reinsert the dentry with root ownership, it can't know the real owner. 
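+ # Each file was created containing its own name ("%s" % n above), so
+ # cat-ing the recovered dentry also checks that it points at the right
+ # data, not merely that the name reappeared.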
+ out = self.mount_a.run_shell_payload(f"sudo cat subdir/{victim_dentry}", omit_sudo=False).stdout.getvalue().strip() + self.assertEqual(out, victim_dentry) + + # Finally, close the loop by checking our injected dentry survives a merge + mds_id = self.fs.get_active_names()[0] + self.mount_a.ls("subdir") # Do an ls to ensure both frags are in cache so the merge will work + self.fs.mds_asok(["dirfrag", "merge", "/subdir", "0/0"], mds_id) + self.fs.mds_asok(["flush", "journal"], mds_id) + frag_obj_id = "{0:x}.00000000".format(dir_ino) + keys = self._dirfrag_keys(frag_obj_id) + self.assertListEqual(sorted(keys), sorted(["%s_head" % f for f in file_names])) + + # run scrub to update and make sure rstat.rbytes info in subdir inode and dirfrag + # are matched + out_json = self.fs.run_scrub(["start", "/subdir", "repair,recursive"]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + # Remove the whole 'sudbdir' directory + self.mount_a.run_shell(["rm", "-rf", "subdir/"]) + + @for_teuthology + def test_parallel_execution(self): + self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7) + + def test_pg_files(self): + """ + That the pg files command tells us which files are associated with + a particular PG + """ + file_count = 20 + self.mount_a.run_shell(["mkdir", "mydir"]) + self.mount_a.create_n_files("mydir/myfile", file_count) + + # Some files elsewhere in the system that we will ignore + # to check that the tool is filtering properly + self.mount_a.run_shell(["mkdir", "otherdir"]) + self.mount_a.create_n_files("otherdir/otherfile", file_count) + + pgs_to_files = defaultdict(list) + # Rough (slow) reimplementation of the logic + for i in range(0, file_count): + file_path = "mydir/myfile_{0}".format(i) + ino = self.mount_a.path_to_ino(file_path) + obj = "{0:x}.{1:08x}".format(ino, 0) + pgid = json.loads(self.fs.mon_manager.raw_cluster_cmd( + "osd", "map", self.fs.get_data_pool_name(), obj, + "--format=json-pretty" + ))['pgid'] + pgs_to_files[pgid].append(file_path) + log.info("{0}: {1}".format(file_path, pgid)) + + pg_count = self.fs.get_pool_pg_num(self.fs.get_data_pool_name()) + for pg_n in range(0, pg_count): + pg_str = "{0}.{1:x}".format(self.fs.get_data_pool_id(), pg_n) + out = self.fs.data_scan(["pg_files", "mydir", pg_str]) + lines = [l for l in out.split("\n") if l] + log.info("{0}: {1}".format(pg_str, lines)) + self.assertSetEqual(set(lines), set(pgs_to_files[pg_str])) + + def test_rebuild_linkage(self): + """ + The scan_links command fixes linkage errors + """ + self.mount_a.run_shell(["mkdir", "testdir1"]) + self.mount_a.run_shell(["mkdir", "testdir2"]) + dir1_ino = self.mount_a.path_to_ino("testdir1") + dir2_ino = self.mount_a.path_to_ino("testdir2") + dirfrag1_oid = "{0:x}.00000000".format(dir1_ino) + dirfrag2_oid = "{0:x}.00000000".format(dir2_ino) + + self.mount_a.run_shell(["touch", "testdir1/file1"]) + self.mount_a.run_shell(["ln", "testdir1/file1", "testdir1/link1"]) + self.mount_a.run_shell(["ln", "testdir1/file1", "testdir2/link2"]) + + mds_id = self.fs.get_active_names()[0] + self.fs.mds_asok(["flush", "journal"], mds_id) + + dirfrag1_keys = self._dirfrag_keys(dirfrag1_oid) + + # introduce duplicated primary link + file1_key = "file1_head" + self.assertIn(file1_key, dirfrag1_keys) + file1_omap_data = self.fs.radosmo(["getomapval", dirfrag1_oid, file1_key, '-']) + self.fs.radosm(["setomapval", dirfrag2_oid, file1_key], 
stdin=BytesIO(file1_omap_data)) + self.assertIn(file1_key, self._dirfrag_keys(dirfrag2_oid)) + + # remove a remote link, make inode link count incorrect + link1_key = 'link1_head' + self.assertIn(link1_key, dirfrag1_keys) + self.fs.radosm(["rmomapkey", dirfrag1_oid, link1_key]) + + # increase good primary link's version + self.mount_a.run_shell(["touch", "testdir1/file1"]) + self.mount_a.umount_wait() + + self.fs.mds_asok(["flush", "journal"], mds_id) + self.fs.fail() + + # repair linkage errors + self.fs.data_scan(["scan_links"]) + + # primary link in testdir2 was deleted? + self.assertNotIn(file1_key, self._dirfrag_keys(dirfrag2_oid)) + + self.fs.set_joinable() + self.fs.wait_for_daemons() + + self.mount_a.mount_wait() + + # link count was adjusted? + file1_nlink = self.mount_a.path_to_nlink("testdir1/file1") + self.assertEqual(file1_nlink, 2) + + out_json = self.fs.run_scrub(["start", "/testdir1", "repair,recursive"]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + def test_rebuild_inotable(self): + """ + The scan_links command repair inotables + """ + self.fs.set_max_mds(2) + self.fs.wait_for_daemons() + + active_mds_names = self.fs.get_active_names() + mds0_id = active_mds_names[0] + mds1_id = active_mds_names[1] + + self.mount_a.run_shell(["mkdir", "dir1"]) + dir_ino = self.mount_a.path_to_ino("dir1") + self.mount_a.setfattr("dir1", "ceph.dir.pin", "1") + # wait for subtree migration + + file_ino = 0; + while True: + time.sleep(1) + # allocate an inode from mds.1 + self.mount_a.run_shell(["touch", "dir1/file1"]) + file_ino = self.mount_a.path_to_ino("dir1/file1") + if file_ino >= (2 << 40): + break + self.mount_a.run_shell(["rm", "-f", "dir1/file1"]) + + self.mount_a.umount_wait() + + self.fs.mds_asok(["flush", "journal"], mds0_id) + self.fs.mds_asok(["flush", "journal"], mds1_id) + self.fs.fail() + + self.fs.radosm(["rm", "mds0_inotable"]) + self.fs.radosm(["rm", "mds1_inotable"]) + + self.fs.data_scan(["scan_links", "--filesystem", self.fs.name]) + + mds0_inotable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "inode"])) + self.assertGreaterEqual( + mds0_inotable['0']['data']['inotable']['free'][0]['start'], dir_ino) + + mds1_inotable = json.loads(self.fs.table_tool([self.fs.name + ":1", "show", "inode"])) + self.assertGreaterEqual( + mds1_inotable['1']['data']['inotable']['free'][0]['start'], file_ino) + + self.fs.set_joinable() + self.fs.wait_for_daemons() + + out_json = self.fs.run_scrub(["start", "/dir1", "repair,recursive"]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + def test_rebuild_snaptable(self): + """ + The scan_links command repair snaptable + """ + self.fs.set_allow_new_snaps(True) + + self.mount_a.run_shell(["mkdir", "dir1"]) + self.mount_a.run_shell(["mkdir", "dir1/.snap/s1"]) + self.mount_a.run_shell(["mkdir", "dir1/.snap/s2"]) + self.mount_a.run_shell(["rmdir", "dir1/.snap/s2"]) + + self.mount_a.umount_wait() + + mds0_id = self.fs.get_active_names()[0] + self.fs.mds_asok(["flush", "journal"], mds0_id) + + # wait for mds to update removed snaps + time.sleep(10) + + old_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"])) + # stamps may have minor difference + for item in old_snaptable['snapserver']['snaps']: + del item['stamp'] + + self.fs.radosm(["rm", 
"mds_snaptable"]) + self.fs.data_scan(["scan_links", "--filesystem", self.fs.name]) + + new_snaptable = json.loads(self.fs.table_tool([self.fs.name + ":0", "show", "snap"])) + for item in new_snaptable['snapserver']['snaps']: + del item['stamp'] + self.assertGreaterEqual( + new_snaptable['snapserver']['last_snap'], old_snaptable['snapserver']['last_snap']) + self.assertEqual( + new_snaptable['snapserver']['snaps'], old_snaptable['snapserver']['snaps']) + + out_json = self.fs.run_scrub(["start", "/dir1", "repair,recursive"]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + def _prepare_extra_data_pool(self, set_root_layout=True): + extra_data_pool_name = self.fs.get_data_pool_name() + '_extra' + self.fs.add_data_pool(extra_data_pool_name) + if set_root_layout: + self.mount_a.setfattr(".", "ceph.dir.layout.pool", + extra_data_pool_name) + return extra_data_pool_name + + def test_extra_data_pool_rebuild_simple(self): + self._prepare_extra_data_pool() + self._rebuild_metadata(SimpleWorkload(self.fs, self.mount_a)) + + def test_extra_data_pool_rebuild_few_files(self): + self._prepare_extra_data_pool() + self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 5), workers=1) + + @for_teuthology + def test_extra_data_pool_rebuild_many_files_many_workers(self): + self._prepare_extra_data_pool() + self._rebuild_metadata(ManyFilesWorkload(self.fs, self.mount_a, 25), workers=7) + + def test_extra_data_pool_stashed_layout(self): + pool_name = self._prepare_extra_data_pool(False) + self._rebuild_metadata(StripedStashedLayout(self.fs, self.mount_a, pool_name)) diff --git a/qa/tasks/cephfs/test_dump_tree.py b/qa/tasks/cephfs/test_dump_tree.py new file mode 100644 index 000000000..48a2c6f00 --- /dev/null +++ b/qa/tasks/cephfs/test_dump_tree.py @@ -0,0 +1,66 @@ +from tasks.cephfs.cephfs_test_case import CephFSTestCase +import random +import os + +class TestDumpTree(CephFSTestCase): + def get_paths_to_ino(self): + inos = {} + p = self.mount_a.run_shell(["find", "./"]) + paths = p.stdout.getvalue().strip().split() + for path in paths: + inos[path] = self.mount_a.path_to_ino(path, False) + + return inos + + def populate(self): + self.mount_a.run_shell(["git", "clone", + "https://github.com/ceph/ceph-qa-suite"]) + + def test_basic(self): + self.mount_a.run_shell(["mkdir", "parent"]) + self.mount_a.run_shell(["mkdir", "parent/child"]) + self.mount_a.run_shell(["touch", "parent/child/file"]) + self.mount_a.run_shell(["mkdir", "parent/child/grandchild"]) + self.mount_a.run_shell(["touch", "parent/child/grandchild/file"]) + + inos = self.get_paths_to_ino() + tree = self.fs.mds_asok(["dump", "tree", "/parent/child", "1"]) + + target_inos = [inos["./parent/child"], inos["./parent/child/file"], + inos["./parent/child/grandchild"]] + + for ino in tree: + del target_inos[target_inos.index(ino['ino'])] # don't catch! + + assert(len(target_inos) == 0) + + def test_random(self): + random.seed(0) + + self.populate() + inos = self.get_paths_to_ino() + target = random.sample(inos.keys(), 1)[0] + + if target != "./": + target = os.path.dirname(target) + + subtree = [path for path in inos.keys() if path.startswith(target)] + target_inos = [inos[path] for path in subtree] + tree = self.fs.mds_asok(["dump", "tree", target[1:]]) + + for ino in tree: + del target_inos[target_inos.index(ino['ino'])] # don't catch! 
+ + assert(len(target_inos) == 0) + + target_depth = target.count('/') + maxdepth = max([path.count('/') for path in subtree]) - target_depth + depth = random.randint(0, maxdepth) + target_inos = [inos[path] for path in subtree \ + if path.count('/') <= depth + target_depth] + tree = self.fs.mds_asok(["dump", "tree", target[1:], str(depth)]) + + for ino in tree: + del target_inos[target_inos.index(ino['ino'])] # don't catch! + + assert(len(target_inos) == 0) diff --git a/qa/tasks/cephfs/test_exports.py b/qa/tasks/cephfs/test_exports.py new file mode 100644 index 000000000..4b7e884ec --- /dev/null +++ b/qa/tasks/cephfs/test_exports.py @@ -0,0 +1,582 @@ +import logging +import random +import time +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + +class TestExports(CephFSTestCase): + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 2 + + def test_session_race(self): + """ + Test session creation race. + + See: https://tracker.ceph.com/issues/24072#change-113056 + """ + + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + + rank1 = self.fs.get_rank(rank=1, status=status) + + # Create a directory that is pre-exported to rank 1 + self.mount_a.run_shell(["mkdir", "-p", "a/aa"]) + self.mount_a.setfattr("a", "ceph.dir.pin", "1") + self._wait_subtrees([('/a', 1)], status=status, rank=1) + + # Now set the mds config to allow the race + self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "true"], rank=1) + + # Now create another directory and try to export it + self.mount_b.run_shell(["mkdir", "-p", "b/bb"]) + self.mount_b.setfattr("b", "ceph.dir.pin", "1") + + time.sleep(5) + + # Now turn off the race so that it doesn't wait again + self.fs.rank_asok(["config", "set", "mds_inject_migrator_session_race", "false"], rank=1) + + # Now try to create a session with rank 1 by accessing a dir known to + # be there, if buggy, this should cause the rank 1 to crash: + self.mount_b.run_shell(["ls", "a"]) + + # Check if rank1 changed (standby tookover?) 
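+ # Comparing GIDs is sufficient: had the injected race crashed rank 1, a
+ # standby would have taken over the rank under a new global id.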
+ new_rank1 = self.fs.get_rank(rank=1) + self.assertEqual(rank1['gid'], new_rank1['gid']) + +class TestExportPin(CephFSTestCase): + MDSS_REQUIRED = 3 + CLIENTS_REQUIRED = 1 + + def setUp(self): + CephFSTestCase.setUp(self) + + self.fs.set_max_mds(3) + self.status = self.fs.wait_for_daemons() + + self.mount_a.run_shell_payload("mkdir -p 1/2/3/4") + + def test_noop(self): + self.mount_a.setfattr("1", "ceph.dir.pin", "-1") + time.sleep(30) # for something to not happen + self._wait_subtrees([], status=self.status) + + def test_negative(self): + self.mount_a.setfattr("1", "ceph.dir.pin", "-2341") + time.sleep(30) # for something to not happen + self._wait_subtrees([], status=self.status) + + def test_empty_pin(self): + self.mount_a.setfattr("1/2/3/4", "ceph.dir.pin", "1") + time.sleep(30) # for something to not happen + self._wait_subtrees([], status=self.status) + + def test_trivial(self): + self.mount_a.setfattr("1", "ceph.dir.pin", "1") + self._wait_subtrees([('/1', 1)], status=self.status, rank=1) + + def test_export_targets(self): + self.mount_a.setfattr("1", "ceph.dir.pin", "1") + self._wait_subtrees([('/1', 1)], status=self.status, rank=1) + self.status = self.fs.status() + r0 = self.status.get_rank(self.fs.id, 0) + self.assertTrue(sorted(r0['export_targets']) == [1]) + + def test_redundant(self): + # redundant pin /1/2 to rank 1 + self.mount_a.setfattr("1", "ceph.dir.pin", "1") + self._wait_subtrees([('/1', 1)], status=self.status, rank=1) + self.mount_a.setfattr("1/2", "ceph.dir.pin", "1") + self._wait_subtrees([('/1', 1), ('/1/2', 1)], status=self.status, rank=1) + + def test_reassignment(self): + self.mount_a.setfattr("1/2", "ceph.dir.pin", "1") + self._wait_subtrees([('/1/2', 1)], status=self.status, rank=1) + self.mount_a.setfattr("1/2", "ceph.dir.pin", "0") + self._wait_subtrees([('/1/2', 0)], status=self.status, rank=0) + + def test_phantom_rank(self): + self.mount_a.setfattr("1", "ceph.dir.pin", "0") + self.mount_a.setfattr("1/2", "ceph.dir.pin", "10") + time.sleep(30) # wait for nothing weird to happen + self._wait_subtrees([('/1', 0)], status=self.status) + + def test_nested(self): + self.mount_a.setfattr("1", "ceph.dir.pin", "1") + self.mount_a.setfattr("1/2", "ceph.dir.pin", "0") + self.mount_a.setfattr("1/2/3", "ceph.dir.pin", "2") + self._wait_subtrees([('/1', 1), ('/1/2', 0), ('/1/2/3', 2)], status=self.status, rank=2) + + def test_nested_unset(self): + self.mount_a.setfattr("1", "ceph.dir.pin", "1") + self.mount_a.setfattr("1/2", "ceph.dir.pin", "2") + self._wait_subtrees([('/1', 1), ('/1/2', 2)], status=self.status, rank=1) + self.mount_a.setfattr("1/2", "ceph.dir.pin", "-1") + self._wait_subtrees([('/1', 1)], status=self.status, rank=1) + + def test_rename(self): + self.mount_a.setfattr("1", "ceph.dir.pin", "1") + self.mount_a.run_shell_payload("mkdir -p 9/8/7") + self.mount_a.setfattr("9/8", "ceph.dir.pin", "0") + self._wait_subtrees([('/1', 1), ("/9/8", 0)], status=self.status, rank=0) + self.mount_a.run_shell_payload("mv 9/8 1/2") + self._wait_subtrees([('/1', 1), ("/1/2/8", 0)], status=self.status, rank=0) + + def test_getfattr(self): + # pin /1 to rank 0 + self.mount_a.setfattr("1", "ceph.dir.pin", "1") + self.mount_a.setfattr("1/2", "ceph.dir.pin", "0") + self._wait_subtrees([('/1', 1), ('/1/2', 0)], status=self.status, rank=1) + + if not isinstance(self.mount_a, FuseMount): + p = self.mount_a.client_remote.sh('uname -r', wait=True) + dir_pin = self.mount_a.getfattr("1", "ceph.dir.pin") + log.debug("mount.getfattr('1','ceph.dir.pin'): %s " % dir_pin) + if str(p) 
< "5" and not(dir_pin): + self.skipTest("Kernel does not support getting the extended attribute ceph.dir.pin") + self.assertEqual(self.mount_a.getfattr("1", "ceph.dir.pin"), '1') + self.assertEqual(self.mount_a.getfattr("1/2", "ceph.dir.pin"), '0') + + def test_export_pin_cache_drop(self): + """ + That the export pin does not prevent empty (nothing in cache) subtree merging. + """ + + self.mount_a.setfattr("1", "ceph.dir.pin", "0") + self.mount_a.setfattr("1/2", "ceph.dir.pin", "1") + self._wait_subtrees([('/1', 0), ('/1/2', 1)], status=self.status) + self.mount_a.umount_wait() # release all caps + def _drop(): + self.fs.ranks_tell(["cache", "drop"], status=self.status) + # drop cache multiple times to clear replica pins + self._wait_subtrees([], status=self.status, action=_drop) + + def test_open_file(self): + """ + Test opening a file via a hard link that is not in the same mds as the inode. + + See https://tracker.ceph.com/issues/58411 + """ + + self.mount_a.run_shell_payload("mkdir -p target link") + self.mount_a.touch("target/test.txt") + self.mount_a.run_shell_payload("ln target/test.txt link/test.txt") + self.mount_a.setfattr("target", "ceph.dir.pin", "0") + self.mount_a.setfattr("link", "ceph.dir.pin", "1") + self._wait_subtrees([("/target", 0), ("/link", 1)], status=self.status) + + # Release client cache, otherwise the bug may not be triggered even if buggy. + self.mount_a.remount() + + # Open the file with access mode(O_CREAT|O_WRONLY|O_TRUNC), + # this should cause the rank 1 to crash if buggy. + # It's OK to use 'truncate -s 0 link/test.txt' here, + # its access mode is (O_CREAT|O_WRONLY), it can also trigger this bug. + log.info("test open mode (O_CREAT|O_WRONLY|O_TRUNC)") + proc = self.mount_a.open_for_writing("link/test.txt") + time.sleep(1) + success = proc.finished and self.fs.rank_is_running(rank=1) + + # Test other write modes too. + if success: + self.mount_a.remount() + log.info("test open mode (O_WRONLY|O_TRUNC)") + proc = self.mount_a.open_for_writing("link/test.txt", creat=False) + time.sleep(1) + success = proc.finished and self.fs.rank_is_running(rank=1) + if success: + self.mount_a.remount() + log.info("test open mode (O_CREAT|O_WRONLY)") + proc = self.mount_a.open_for_writing("link/test.txt", trunc=False) + time.sleep(1) + success = proc.finished and self.fs.rank_is_running(rank=1) + + # Test open modes too. + if success: + self.mount_a.remount() + log.info("test open mode (O_RDONLY)") + proc = self.mount_a.open_for_reading("link/test.txt") + time.sleep(1) + success = proc.finished and self.fs.rank_is_running(rank=1) + + if success: + # All tests done, rank 1 didn't crash. + return + + if not proc.finished: + log.warning("open operation is blocked, kill it") + proc.kill() + + if not self.fs.rank_is_running(rank=1): + log.warning("rank 1 crashed") + + self.mount_a.umount_wait(force=True) + + self.assertTrue(success, "open operation failed") + +class TestEphemeralPins(CephFSTestCase): + MDSS_REQUIRED = 3 + CLIENTS_REQUIRED = 1 + + def setUp(self): + CephFSTestCase.setUp(self) + + self.config_set('mds', 'mds_export_ephemeral_random', True) + self.config_set('mds', 'mds_export_ephemeral_distributed', True) + self.config_set('mds', 'mds_export_ephemeral_random_max', 1.0) + + self.mount_a.run_shell_payload(""" +set -e + +# Use up a random number of inode numbers so the ephemeral pinning is not the same every test. 
+mkdir .inode_number_thrash +count=$((RANDOM % 1024)) +for ((i = 0; i < count; i++)); do touch .inode_number_thrash/$i; done +rm -rf .inode_number_thrash +""") + + self.fs.set_max_mds(3) + self.status = self.fs.wait_for_daemons() + + def _setup_tree(self, path="tree", export=-1, distributed=False, random=0.0, count=100, wait=True): + return self.mount_a.run_shell_payload(f""" +set -ex +mkdir -p {path} +{f"setfattr -n ceph.dir.pin -v {export} {path}" if export >= 0 else ""} +{f"setfattr -n ceph.dir.pin.distributed -v 1 {path}" if distributed else ""} +{f"setfattr -n ceph.dir.pin.random -v {random} {path}" if random > 0.0 else ""} +for ((i = 0; i < {count}; i++)); do + mkdir -p "{path}/$i" + echo file > "{path}/$i/file" +done +""", wait=wait) + + def test_ephemeral_pin_dist_override(self): + """ + That an ephemeral distributed pin overrides a normal export pin. + """ + + self._setup_tree(distributed=True) + subtrees = self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all") + for s in subtrees: + path = s['dir']['path'] + if path == '/tree': + self.assertTrue(s['distributed_ephemeral_pin']) + + def test_ephemeral_pin_dist_override_pin(self): + """ + That an export pin overrides an ephemerally pinned directory. + """ + + self._setup_tree(distributed=True) + subtrees = self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all") + self.mount_a.setfattr("tree", "ceph.dir.pin", "0") + time.sleep(15) + subtrees = self._get_subtrees(status=self.status, rank=0) + for s in subtrees: + path = s['dir']['path'] + if path == '/tree': + self.assertEqual(s['auth_first'], 0) + self.assertFalse(s['distributed_ephemeral_pin']) + # it has been merged into /tree + + def test_ephemeral_pin_dist_off(self): + """ + That turning off ephemeral distributed pin merges subtrees. + """ + + self._setup_tree(distributed=True) + self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all") + self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "0") + time.sleep(15) + subtrees = self._get_subtrees(status=self.status, rank=0) + for s in subtrees: + path = s['dir']['path'] + if path == '/tree': + self.assertFalse(s['distributed_ephemeral_pin']) + + + def test_ephemeral_pin_dist_conf_off(self): + """ + That turning off ephemeral distributed pin config prevents distribution. + """ + + self._setup_tree() + self.config_set('mds', 'mds_export_ephemeral_distributed', False) + self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "1") + time.sleep(15) + subtrees = self._get_subtrees(status=self.status, rank=0) + for s in subtrees: + path = s['dir']['path'] + if path == '/tree': + self.assertFalse(s['distributed_ephemeral_pin']) + + def _test_ephemeral_pin_dist_conf_off_merge(self): + """ + That turning off ephemeral distributed pin config merges subtrees. + FIXME: who triggers the merge? + """ + + self._setup_tree(distributed=True) + self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all") + self.config_set('mds', 'mds_export_ephemeral_distributed', False) + self._wait_subtrees([('/tree', 0)], timeout=60, status=self.status) + + def test_ephemeral_pin_dist_override_before(self): + """ + That a conventional export pin overrides the distributed policy _before_ distributed policy is set. 
+ """ + + count = 10 + self._setup_tree(count=count) + test = [] + for i in range(count): + path = f"tree/{i}" + self.mount_a.setfattr(path, "ceph.dir.pin", "1") + test.append(("/"+path, 1)) + self.mount_a.setfattr("tree", "ceph.dir.pin.distributed", "1") + time.sleep(15) # for something to not happen... + self._wait_subtrees(test, timeout=60, status=self.status, rank="all", path="/tree/") + + def test_ephemeral_pin_dist_override_after(self): + """ + That a conventional export pin overrides the distributed policy _after_ distributed policy is set. + """ + + self._setup_tree(distributed=True) + self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all") + test = [] + for i in range(10): + path = f"tree/{i}" + self.mount_a.setfattr(path, "ceph.dir.pin", "1") + test.append(("/"+path, 1)) + self._wait_subtrees(test, timeout=60, status=self.status, rank="all", path="/tree/") + + def test_ephemeral_pin_dist_failover(self): + """ + That MDS failover does not cause unnecessary migrations. + """ + + # pin /tree so it does not export during failover + self._setup_tree(distributed=True) + self._wait_distributed_subtrees(3 * 2, status=self.status, rank="all") + #test = [(s['dir']['path'], s['auth_first']) for s in subtrees] + before = self.fs.ranks_perf(lambda p: p['mds']['exported']) + log.info(f"export stats: {before}") + self.fs.rank_fail(rank=1) + self.status = self.fs.wait_for_daemons() + time.sleep(10) # waiting for something to not happen + after = self.fs.ranks_perf(lambda p: p['mds']['exported']) + log.info(f"export stats: {after}") + self.assertEqual(before, after) + + def test_ephemeral_pin_distribution(self): + """ + That ephemerally pinned subtrees are somewhat evenly distributed. + """ + + max_mds = 3 + frags = 128 + + self.fs.set_max_mds(max_mds) + self.status = self.fs.wait_for_daemons() + + self.config_set('mds', 'mds_export_ephemeral_distributed_factor', (frags-1) / max_mds) + self._setup_tree(count=1000, distributed=True) + + subtrees = self._wait_distributed_subtrees(frags, status=self.status, rank="all") + nsubtrees = len(subtrees) + + # Check if distribution is uniform + rank0 = list(filter(lambda x: x['auth_first'] == 0, subtrees)) + rank1 = list(filter(lambda x: x['auth_first'] == 1, subtrees)) + rank2 = list(filter(lambda x: x['auth_first'] == 2, subtrees)) + self.assertGreaterEqual(len(rank0)/nsubtrees, 0.15) + self.assertGreaterEqual(len(rank1)/nsubtrees, 0.15) + self.assertGreaterEqual(len(rank2)/nsubtrees, 0.15) + + + def test_ephemeral_random(self): + """ + That 100% randomness causes all children to be pinned. + """ + self._setup_tree(random=1.0) + self._wait_random_subtrees(100, status=self.status, rank="all") + + def test_ephemeral_random_max(self): + """ + That the config mds_export_ephemeral_random_max is not exceeded. + """ + + r = 0.5 + count = 1000 + self._setup_tree(count=count, random=r) + subtrees = self._wait_random_subtrees(int(r*count*.75), status=self.status, rank="all") + self.config_set('mds', 'mds_export_ephemeral_random_max', 0.01) + self._setup_tree(path="tree/new", count=count) + time.sleep(30) # for something not to happen... + subtrees = self._get_subtrees(status=self.status, rank="all", path="tree/new/") + self.assertLessEqual(len(subtrees), int(.01*count*1.25)) + + def test_ephemeral_random_max_config(self): + """ + That the config mds_export_ephemeral_random_max config rejects new OOB policies. 
+ """ + + self.config_set('mds', 'mds_export_ephemeral_random_max', 0.01) + try: + p = self._setup_tree(count=1, random=0.02, wait=False) + p.wait() + except CommandFailedError as e: + log.info(f"{e}") + self.assertIn("Invalid", p.stderr.getvalue()) + else: + raise RuntimeError("mds_export_ephemeral_random_max ignored!") + + def test_ephemeral_random_dist(self): + """ + That ephemeral distributed pin overrides ephemeral random pin + """ + + self._setup_tree(random=1.0, distributed=True) + self._wait_distributed_subtrees(3 * 2, status=self.status) + + time.sleep(15) + subtrees = self._get_subtrees(status=self.status, rank=0) + for s in subtrees: + path = s['dir']['path'] + if path.startswith('/tree'): + self.assertFalse(s['random_ephemeral_pin']) + + def test_ephemeral_random_pin_override_before(self): + """ + That a conventional export pin overrides the random policy before creating new directories. + """ + + self._setup_tree(count=0, random=1.0) + self._setup_tree(path="tree/pin", count=10, export=1) + self._wait_subtrees([("/tree/pin", 1)], status=self.status, rank=1, path="/tree/pin") + + def test_ephemeral_random_pin_override_after(self): + """ + That a conventional export pin overrides the random policy after creating new directories. + """ + + count = 10 + self._setup_tree(count=0, random=1.0) + self._setup_tree(path="tree/pin", count=count) + self._wait_random_subtrees(count+1, status=self.status, rank="all") + self.mount_a.setfattr("tree/pin", "ceph.dir.pin", "1") + self._wait_subtrees([("/tree/pin", 1)], status=self.status, rank=1, path="/tree/pin") + + def test_ephemeral_randomness(self): + """ + That the randomness is reasonable. + """ + + r = random.uniform(0.25, 0.75) # ratios don't work for small r! + count = 1000 + self._setup_tree(count=count, random=r) + subtrees = self._wait_random_subtrees(int(r*count*.50), status=self.status, rank="all") + time.sleep(30) # for max to not be exceeded + subtrees = self._wait_random_subtrees(int(r*count*.50), status=self.status, rank="all") + self.assertLessEqual(len(subtrees), int(r*count*1.50)) + + def test_ephemeral_random_cache_drop(self): + """ + That the random ephemeral pin does not prevent empty (nothing in cache) subtree merging. + """ + + count = 100 + self._setup_tree(count=count, random=1.0) + self._wait_random_subtrees(count, status=self.status, rank="all") + self.mount_a.umount_wait() # release all caps + def _drop(): + self.fs.ranks_tell(["cache", "drop"], status=self.status) + self._wait_subtrees([], status=self.status, action=_drop) + + def test_ephemeral_random_failover(self): + """ + That the random ephemeral pins stay pinned across MDS failover. + """ + + count = 100 + r = 0.5 + self._setup_tree(count=count, random=r) + # wait for all random subtrees to be created, not a specific count + time.sleep(30) + subtrees = self._wait_random_subtrees(1, status=self.status, rank=1) + before = [(s['dir']['path'], s['auth_first']) for s in subtrees] + before.sort(); + + self.fs.rank_fail(rank=1) + self.status = self.fs.wait_for_daemons() + + time.sleep(30) # waiting for something to not happen + subtrees = self._wait_random_subtrees(1, status=self.status, rank=1) + after = [(s['dir']['path'], s['auth_first']) for s in subtrees] + after.sort(); + log.info(f"subtrees before: {before}") + log.info(f"subtrees after: {after}") + + self.assertEqual(before, after) + + def test_ephemeral_pin_grow_mds(self): + """ + That consistent hashing works to reduce the number of migrations. 
+ """ + + self.fs.set_max_mds(2) + self.status = self.fs.wait_for_daemons() + + self._setup_tree(random=1.0) + subtrees_old = self._wait_random_subtrees(100, status=self.status, rank="all") + + self.fs.set_max_mds(3) + self.status = self.fs.wait_for_daemons() + + # Sleeping for a while to allow the ephemeral pin migrations to complete + time.sleep(30) + + subtrees_new = self._wait_random_subtrees(100, status=self.status, rank="all") + count = 0 + for old_subtree in subtrees_old: + for new_subtree in subtrees_new: + if (old_subtree['dir']['path'] == new_subtree['dir']['path']) and (old_subtree['auth_first'] != new_subtree['auth_first']): + count = count + 1 + break + + log.info("{0} migrations have occured due to the cluster resizing".format(count)) + # ~50% of subtrees from the two rank will migrate to another rank + self.assertLessEqual((count/len(subtrees_old)), (0.5)*1.25) # with 25% overbudget + + def test_ephemeral_pin_shrink_mds(self): + """ + That consistent hashing works to reduce the number of migrations. + """ + + self.fs.set_max_mds(3) + self.status = self.fs.wait_for_daemons() + + self._setup_tree(random=1.0) + subtrees_old = self._wait_random_subtrees(100, status=self.status, rank="all") + + self.fs.set_max_mds(2) + self.status = self.fs.wait_for_daemons() + time.sleep(30) + + subtrees_new = self._wait_random_subtrees(100, status=self.status, rank="all") + count = 0 + for old_subtree in subtrees_old: + for new_subtree in subtrees_new: + if (old_subtree['dir']['path'] == new_subtree['dir']['path']) and (old_subtree['auth_first'] != new_subtree['auth_first']): + count = count + 1 + break + + log.info("{0} migrations have occured due to the cluster resizing".format(count)) + # rebalancing from 3 -> 2 may cause half of rank 0/1 to move and all of rank 2 + self.assertLessEqual((count/len(subtrees_old)), (1.0/3.0/2.0 + 1.0/3.0/2.0 + 1.0/3.0)*1.25) # aka .66 with 25% overbudget diff --git a/qa/tasks/cephfs/test_failover.py b/qa/tasks/cephfs/test_failover.py new file mode 100644 index 000000000..ddcc58ccc --- /dev/null +++ b/qa/tasks/cephfs/test_failover.py @@ -0,0 +1,819 @@ +import time +import signal +import logging +import operator +from random import randint, choice + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.fuse_mount import FuseMount + +log = logging.getLogger(__name__) + +class TestClusterAffinity(CephFSTestCase): + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 4 + + def _verify_join_fs(self, target, status=None, fs=None): + fs_select = fs + if fs_select is None: + fs_select = self.fs + if status is None: + status = fs_select.wait_for_daemons(timeout=30) + log.debug("%s", status) + target = sorted(target, key=operator.itemgetter('name')) + log.info("target = %s", target) + current = list(status.get_all()) + current = sorted(current, key=operator.itemgetter('name')) + log.info("current = %s", current) + self.assertEqual(len(current), len(target)) + for i in range(len(current)): + for attr in target[i]: + self.assertIn(attr, current[i]) + self.assertEqual(target[i][attr], current[i][attr]) + + def _change_target_state(self, state, name, changes): + for entity in state: + if entity['name'] == name: + for k, v in changes.items(): + entity[k] = v + return + self.fail("no entity") + + def _verify_init(self, fs=None): + fs_select = fs + if fs_select is None: + fs_select = self.fs + status = fs_select.status() + log.info("status = {0}".format(status)) + target = [{'join_fscid': -1, 'name': info['name']} 
for info in status.get_all()] + self._verify_join_fs(target, status=status, fs=fs_select) + return (status, target) + + def _reach_target(self, target): + def takeover(): + try: + self._verify_join_fs(target) + return True + except AssertionError as e: + log.debug("%s", e) + return False + self.wait_until_true(takeover, 30) + + def test_join_fs_runtime(self): + """ + That setting mds_join_fs at runtime affects the cluster layout. + """ + status, target = self._verify_init() + standbys = list(status.get_standbys()) + self.config_set('mds.'+standbys[0]['name'], 'mds_join_fs', 'cephfs') + self._change_target_state(target, standbys[0]['name'], {'join_fscid': self.fs.id, 'state': 'up:active'}) + self._reach_target(target) + + def test_join_fs_unset(self): + """ + That unsetting mds_join_fs will cause failover if another high-affinity standby exists. + """ + status, target = self._verify_init() + standbys = list(status.get_standbys()) + names = (standbys[0]['name'], standbys[1]['name']) + self.config_set('mds.'+names[0], 'mds_join_fs', 'cephfs') + self.config_set('mds.'+names[1], 'mds_join_fs', 'cephfs') + self._change_target_state(target, names[0], {'join_fscid': self.fs.id}) + self._change_target_state(target, names[1], {'join_fscid': self.fs.id}) + self._reach_target(target) + time.sleep(5) # MDSMonitor tick + status = self.fs.wait_for_daemons() + active = self.fs.get_active_names(status=status)[0] + self.assertIn(active, names) + self.config_rm('mds.'+active, 'mds_join_fs') + self._change_target_state(target, active, {'join_fscid': -1}) + new_active = (set(names) - set((active,))).pop() + self._change_target_state(target, new_active, {'state': 'up:active'}) + self._reach_target(target) + + def test_join_fs_drop(self): + """ + That unsetting mds_join_fs will not cause failover if no high-affinity standby exists. + """ + status, target = self._verify_init() + standbys = list(status.get_standbys()) + active = standbys[0]['name'] + self.config_set('mds.'+active, 'mds_join_fs', 'cephfs') + self._change_target_state(target, active, {'join_fscid': self.fs.id, 'state': 'up:active'}) + self._reach_target(target) + self.config_rm('mds.'+active, 'mds_join_fs') + self._change_target_state(target, active, {'join_fscid': -1}) + self._reach_target(target) + + def test_join_fs_vanilla(self): + """ + That a vanilla standby is preferred over others with mds_join_fs set to another fs. + """ + fs2 = self.mds_cluster.newfs(name="cephfs2") + status, target = self._verify_init() + active = self.fs.get_active_names(status=status)[0] + status2, _ = self._verify_init(fs=fs2) + active2 = fs2.get_active_names(status=status2)[0] + standbys = [info['name'] for info in status.get_standbys()] + victim = standbys.pop() + # Set a bogus fs on the others + for mds in standbys: + self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2') + self._change_target_state(target, mds, {'join_fscid': fs2.id}) + # The active MDS for cephfs2 will be replaced by the MDS for which + # file system affinity has been set. Also, set the affinity for + # the earlier active MDS so that it is not chosen by the monitors + # as an active MDS for the existing file system. 
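The mds_join_fs affinity exercised in these tests is set per daemon through the config subsystem. A minimal sketch of doing the same outside the test harness, assuming a reachable cluster and an illustrative daemon name "a":

import subprocess

def set_mds_affinity(mds_name, fs_name):
    # Prefer this daemon when filling ranks of the named file system.
    subprocess.run(["ceph", "config", "set", f"mds.{mds_name}",
                    "mds_join_fs", fs_name], check=True)

def clear_mds_affinity(mds_name):
    # Back to a "vanilla" standby with no file system preference.
    subprocess.run(["ceph", "config", "rm", f"mds.{mds_name}",
                    "mds_join_fs"], check=True)

# e.g. set_mds_affinity("a", "cephfs2"); clear_mds_affinity("a")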
+ log.info(f'assigning affinity to cephfs2 for active mds (mds.{active2})') + self.config_set(f'mds.{active2}', 'mds_join_fs', 'cephfs2') + self._change_target_state(target, active2, {'join_fscid': fs2.id}) + self.fs.rank_fail() + self._change_target_state(target, victim, {'state': 'up:active'}) + self._reach_target(target) + status = self.fs.status() + active = self.fs.get_active_names(status=status)[0] + self.assertEqual(active, victim) + + def test_join_fs_last_resort(self): + """ + That a standby with mds_join_fs set to another fs is still used if necessary. + """ + status, target = self._verify_init() + standbys = [info['name'] for info in status.get_standbys()] + for mds in standbys: + self.config_set('mds.'+mds, 'mds_join_fs', 'cephfs2') + fs2 = self.mds_cluster.newfs(name="cephfs2") + for mds in standbys: + self._change_target_state(target, mds, {'join_fscid': fs2.id}) + self.fs.rank_fail() + status = self.fs.status() + ranks = list(self.fs.get_ranks(status=status)) + self.assertEqual(len(ranks), 1) + self.assertIn(ranks[0]['name'], standbys) + # Note that we would expect the former active to reclaim its spot, but + # we're not testing that here. + + def test_join_fs_steady(self): + """ + That a sole MDS with mds_join_fs set will come back as active eventually even after failover. + """ + status, target = self._verify_init() + active = self.fs.get_active_names(status=status)[0] + self.config_set('mds.'+active, 'mds_join_fs', 'cephfs') + self._change_target_state(target, active, {'join_fscid': self.fs.id}) + self._reach_target(target) + self.fs.rank_fail() + self._reach_target(target) + + def test_join_fs_standby_replay(self): + """ + That a standby-replay daemon with weak affinity is replaced by a stronger one. + """ + status, target = self._verify_init() + standbys = [info['name'] for info in status.get_standbys()] + self.config_set('mds.'+standbys[0], 'mds_join_fs', 'cephfs') + self._change_target_state(target, standbys[0], {'join_fscid': self.fs.id, 'state': 'up:active'}) + self._reach_target(target) + self.fs.set_allow_standby_replay(True) + status = self.fs.status() + standbys = [info['name'] for info in status.get_standbys()] + self.config_set('mds.'+standbys[0], 'mds_join_fs', 'cephfs') + self._change_target_state(target, standbys[0], {'join_fscid': self.fs.id, 'state': 'up:standby-replay'}) + self._reach_target(target) + +class TestClusterResize(CephFSTestCase): + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 3 + + def test_grow(self): + """ + That the MDS cluster grows after increasing max_mds. + """ + + # Need all my standbys up as well as the active daemons + # self.wait_for_daemon_start() necessary? + + self.fs.grow(2) + self.fs.grow(3) + + + def test_shrink(self): + """ + That the MDS cluster shrinks automatically after decreasing max_mds. + """ + + self.fs.grow(3) + self.fs.shrink(1) + + def test_up_less_than_max(self): + """ + That a health warning is generated when max_mds is greater than active count. 
+ """ + + status = self.fs.status() + mdss = [info['gid'] for info in status.get_all()] + self.fs.set_max_mds(len(mdss)+1) + self.wait_for_health("MDS_UP_LESS_THAN_MAX", 30) + self.fs.shrink(2) + self.wait_for_health_clear(30) + + def test_down_health(self): + """ + That marking a FS down does not generate a health warning + """ + + self.fs.set_down() + try: + self.wait_for_health("", 30) + raise RuntimeError("got health warning?") + except RuntimeError as e: + if "Timed out after" in str(e): + pass + else: + raise + + def test_down_twice(self): + """ + That marking a FS down twice does not wipe old_max_mds. + """ + + self.fs.grow(2) + self.fs.set_down() + self.fs.wait_for_daemons() + self.fs.set_down(False) + self.assertEqual(self.fs.get_var("max_mds"), 2) + self.fs.wait_for_daemons(timeout=60) + + def test_down_grow(self): + """ + That setting max_mds undoes down. + """ + + self.fs.set_down() + self.fs.wait_for_daemons() + self.fs.grow(2) + self.fs.wait_for_daemons() + + def test_down(self): + """ + That down setting toggles and sets max_mds appropriately. + """ + + self.fs.set_down() + self.fs.wait_for_daemons() + self.assertEqual(self.fs.get_var("max_mds"), 0) + self.fs.set_down(False) + self.assertEqual(self.fs.get_var("max_mds"), 1) + self.fs.wait_for_daemons() + self.assertEqual(self.fs.get_var("max_mds"), 1) + + def test_hole(self): + """ + Test that a hole cannot be created in the FS ranks. + """ + + fscid = self.fs.id + + self.fs.grow(2) + + # Now add a delay which should slow down how quickly rank 1 stops + self.config_set('mds', 'ms_inject_delay_max', '5.0') + self.config_set('mds', 'ms_inject_delay_probability', '1.0') + self.fs.set_max_mds(1) + log.info("status = {0}".format(self.fs.status())) + + # Don't wait for rank 1 to stop + self.fs.set_max_mds(3) + log.info("status = {0}".format(self.fs.status())) + + # Now check that the mons didn't try to promote a standby to rank 2 + self.fs.set_max_mds(2) + status = self.fs.status() + try: + status = self.fs.wait_for_daemons(timeout=90) + ranks = set([info['rank'] for info in status.get_ranks(fscid)]) + self.assertEqual(ranks, set([0, 1])) + finally: + log.info("status = {0}".format(status)) + + def test_thrash(self): + """ + Test that thrashing max_mds does not fail. + """ + + max_mds = 2 + for i in range(0, 100): + self.fs.set_max_mds(max_mds) + max_mds = (max_mds+1)%3+1 + + self.fs.wait_for_daemons(timeout=90) + +class TestFailover(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 2 + + def test_repeated_boot(self): + """ + That multiple boot messages do not result in the MDS getting evicted. + """ + + interval = 10 + self.config_set("mon", "paxos_propose_interval", interval) + + mds = choice(list(self.fs.status().get_all())) + + with self.assert_cluster_log(f"daemon mds.{mds['name']} restarted", present=False): + # Avoid a beacon to the monitors with down:dne by restarting: + self.fs.mds_fail(mds_id=mds['name']) + # `ceph mds fail` won't return until the FSMap is committed, double-check: + self.assertIsNone(self.fs.status().get_mds_gid(mds['gid'])) + time.sleep(2) # for mds to restart and accept asok commands + status1 = self.fs.mds_asok(['status'], mds_id=mds['name']) + time.sleep(interval*1.5) + status2 = self.fs.mds_asok(['status'], mds_id=mds['name']) + self.assertEqual(status1['id'], status2['id']) + + def test_simple(self): + """ + That when the active MDS is killed, a standby MDS is promoted into + its rank after the grace period. 
+ + This is just a simple unit test, the harder cases are covered + in thrashing tests. + """ + + (original_active, ) = self.fs.get_active_names() + original_standbys = self.mds_cluster.get_standby_daemons() + + # Kill the rank 0 daemon's physical process + self.fs.mds_stop(original_active) + + # Wait until the monitor promotes his replacement + def promoted(): + ranks = list(self.fs.get_ranks()) + return len(ranks) > 0 and ranks[0]['name'] in original_standbys + + log.info("Waiting for promotion of one of the original standbys {0}".format( + original_standbys)) + self.wait_until_true(promoted, timeout=self.fs.beacon_timeout) + + # Start the original rank 0 daemon up again, see that he becomes a standby + self.fs.mds_restart(original_active) + self.wait_until_true( + lambda: original_active in self.mds_cluster.get_standby_daemons(), + timeout=60 # Approximately long enough for MDS to start and mon to notice + ) + + def test_client_abort(self): + """ + That a client will respect fuse_require_active_mds and error out + when the cluster appears to be unavailable. + """ + + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Requires FUSE client to inject client metadata") + + require_active = self.fs.get_config("fuse_require_active_mds", service_type="mon").lower() == "true" + if not require_active: + self.skipTest("fuse_require_active_mds is not set") + + # Check it's not laggy to begin with + (original_active, ) = self.fs.get_active_names() + self.assertNotIn("laggy_since", self.fs.status().get_mds(original_active)) + + self.mounts[0].umount_wait() + + # Control: that we can mount and unmount usually, while the cluster is healthy + self.mounts[0].mount_wait() + self.mounts[0].umount_wait() + + # Stop the daemon processes + self.fs.mds_stop() + + # Wait for everyone to go laggy + def laggy(): + mdsmap = self.fs.get_mds_map() + for info in mdsmap['info'].values(): + if "laggy_since" not in info: + return False + + return True + + self.wait_until_true(laggy, self.fs.beacon_timeout) + with self.assertRaises(CommandFailedError): + self.mounts[0].mount_wait() + + def test_standby_count_wanted(self): + """ + That cluster health warnings are generated by insufficient standbys available. 
+ """ + + # Need all my standbys up as well as the active daemons + self.wait_for_daemon_start() + + standbys = self.mds_cluster.get_standby_daemons() + self.assertGreaterEqual(len(standbys), 1) + self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys))) + + # Kill a standby and check for warning + victim = standbys.pop() + self.fs.mds_stop(victim) + self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout) + + # restart the standby, see that he becomes a standby, check health clears + self.fs.mds_restart(victim) + self.wait_until_true( + lambda: victim in self.mds_cluster.get_standby_daemons(), + timeout=60 # Approximately long enough for MDS to start and mon to notice + ) + self.wait_for_health_clear(timeout=30) + + # Set it one greater than standbys ever seen + standbys = self.mds_cluster.get_standby_daemons() + self.assertGreaterEqual(len(standbys), 1) + self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', str(len(standbys)+1)) + self.wait_for_health("MDS_INSUFFICIENT_STANDBY", self.fs.beacon_timeout) + + # Set it to 0 + self.fs.mon_manager.raw_cluster_cmd('fs', 'set', self.fs.name, 'standby_count_wanted', '0') + self.wait_for_health_clear(timeout=30) + + def test_discontinuous_mdsmap(self): + """ + That discontinuous mdsmap does not affect failover. + See http://tracker.ceph.com/issues/24856. + """ + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + + self.mount_a.umount_wait() + + monc_timeout = float(self.fs.get_config("mon_client_ping_timeout", service_type="mds")) + + mds_0 = self.fs.get_rank(rank=0, status=status) + self.fs.rank_freeze(True, rank=0) # prevent failover + self.fs.rank_signal(signal.SIGSTOP, rank=0, status=status) + self.wait_until_true( + lambda: "laggy_since" in self.fs.get_rank(), + timeout=self.fs.beacon_timeout + ) + + self.fs.rank_fail(rank=1) + self.fs.wait_for_state('up:resolve', rank=1, timeout=30) + + # Make sure of mds_0's monitor connection gets reset + time.sleep(monc_timeout * 2) + + # Continue rank 0, it will get discontinuous mdsmap + self.fs.rank_signal(signal.SIGCONT, rank=0) + self.wait_until_true( + lambda: "laggy_since" not in self.fs.get_rank(rank=0), + timeout=self.fs.beacon_timeout + ) + + # mds.b will be stuck at 'reconnect' state if snapserver gets confused + # by discontinuous mdsmap + self.fs.wait_for_state('up:active', rank=1, timeout=30) + self.assertEqual(mds_0['gid'], self.fs.get_rank(rank=0)['gid']) + self.fs.rank_freeze(False, rank=0) + + def test_connect_bootstrapping(self): + self.config_set("mds", "mds_sleep_rank_change", 10000000.0) + self.config_set("mds", "mds_connect_bootstrapping", True) + self.fs.set_max_mds(2) + self.fs.wait_for_daemons() + self.fs.rank_fail(rank=0) + # rank 0 will get stuck in up:resolve, see https://tracker.ceph.com/issues/53194 + self.fs.wait_for_daemons() + + +class TestStandbyReplay(CephFSTestCase): + CLIENTS_REQUIRED = 0 + MDSS_REQUIRED = 4 + + def _confirm_no_replay(self): + status = self.fs.status() + _ = len(list(status.get_standbys())) + self.assertEqual(0, len(list(self.fs.get_replays(status=status)))) + return status + + def _confirm_single_replay(self, full=True, status=None, retries=3): + status = self.fs.wait_for_daemons(status=status) + ranks = sorted(self.fs.get_mds_map(status=status)['in']) + replays = list(self.fs.get_replays(status=status)) + checked_replays = set() + for rank in ranks: + has_replay = False + for replay in replays: + if replay['rank'] == rank: + 
self.assertFalse(has_replay) + has_replay = True + checked_replays.add(replay['gid']) + if full and not has_replay: + if retries <= 0: + raise RuntimeError("rank "+str(rank)+" has no standby-replay follower") + else: + retries = retries-1 + time.sleep(2) + self.assertEqual(checked_replays, set(info['gid'] for info in replays)) + return status + + def _check_replay_takeover(self, status, rank=0): + replay = self.fs.get_replay(rank=rank, status=status) + new_status = self.fs.wait_for_daemons() + new_active = self.fs.get_rank(rank=rank, status=new_status) + if replay: + self.assertEqual(replay['gid'], new_active['gid']) + else: + # double check takeover came from a standby (or some new daemon via restart) + found = False + for info in status.get_standbys(): + if info['gid'] == new_active['gid']: + found = True + break + if not found: + for info in status.get_all(): + self.assertNotEqual(info['gid'], new_active['gid']) + return new_status + + def test_standby_replay_singleton(self): + """ + That only one MDS becomes standby-replay. + """ + + self._confirm_no_replay() + self.fs.set_allow_standby_replay(True) + time.sleep(30) + self._confirm_single_replay() + + def test_standby_replay_damaged(self): + """ + That a standby-replay daemon can cause the rank to go damaged correctly. + """ + + self._confirm_no_replay() + self.config_set("mds", "mds_standby_replay_damaged", True) + self.fs.set_allow_standby_replay(True) + self.wait_until_true( + lambda: len(self.fs.get_damaged()) > 0, + timeout=30 + ) + status = self.fs.status() + self.assertListEqual([], list(self.fs.get_ranks(status=status))) + self.assertListEqual([0], self.fs.get_damaged(status=status)) + + def test_standby_replay_disable(self): + """ + That turning off allow_standby_replay fails all standby-replay daemons. + """ + + self._confirm_no_replay() + self.fs.set_allow_standby_replay(True) + time.sleep(30) + self._confirm_single_replay() + self.fs.set_allow_standby_replay(False) + self._confirm_no_replay() + + def test_standby_replay_singleton_fail(self): + """ + That failures don't violate singleton constraint. + """ + + self._confirm_no_replay() + self.fs.set_allow_standby_replay(True) + status = self._confirm_single_replay() + + for i in range(10): + time.sleep(randint(1, 5)) + self.fs.rank_restart(status=status) + status = self._check_replay_takeover(status) + status = self._confirm_single_replay(status=status) + + for i in range(10): + time.sleep(randint(1, 5)) + self.fs.rank_fail() + status = self._check_replay_takeover(status) + status = self._confirm_single_replay(status=status) + + def test_standby_replay_singleton_fail_multimds(self): + """ + That failures don't violate singleton constraint with multiple actives. + """ + + status = self._confirm_no_replay() + new_max_mds = randint(2, len(list(status.get_standbys()))) + self.fs.set_max_mds(new_max_mds) + self.fs.wait_for_daemons() # wait for actives to come online! 
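The restart/fail loops in these standby-replay tests lean on the harness polling helpers (wait_until_true, wait_for_daemons). A self-contained sketch of such a poll loop, not the actual teuthology implementation, for readers following the control flow:

import time

def wait_until_true(predicate, timeout, period=5):
    # Poll predicate() until it returns True or timeout seconds elapse.
    elapsed = 0
    while not predicate():
        if elapsed >= timeout:
            raise RuntimeError(f"condition not met after {elapsed}s")
        time.sleep(period)
        elapsed += period

# e.g. wait_until_true(lambda: count_active_mds() == 2, timeout=60),
# where count_active_mds is a caller-supplied callable (hypothetical here).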
+ self.fs.set_allow_standby_replay(True) + status = self._confirm_single_replay(full=False) + + for i in range(10): + time.sleep(randint(1, 5)) + victim = randint(0, new_max_mds-1) + self.fs.rank_restart(rank=victim, status=status) + status = self._check_replay_takeover(status, rank=victim) + status = self._confirm_single_replay(status=status, full=False) + + for i in range(10): + time.sleep(randint(1, 5)) + victim = randint(0, new_max_mds-1) + self.fs.rank_fail(rank=victim) + status = self._check_replay_takeover(status, rank=victim) + status = self._confirm_single_replay(status=status, full=False) + + def test_standby_replay_failure(self): + """ + That the failure of a standby-replay daemon happens cleanly + and doesn't interrupt anything else. + """ + + status = self._confirm_no_replay() + self.fs.set_max_mds(1) + self.fs.set_allow_standby_replay(True) + status = self._confirm_single_replay() + + for i in range(10): + time.sleep(randint(1, 5)) + victim = self.fs.get_replay(status=status) + self.fs.mds_restart(mds_id=victim['name']) + status = self._confirm_single_replay(status=status) + + def test_standby_replay_prepare_beacon(self): + """ + That a MDSMonitor::prepare_beacon handles standby-replay daemons + correctly without removing the standby. (Note, usually a standby-replay + beacon will just be replied to by MDSMonitor::preprocess_beacon.) + """ + + status = self._confirm_no_replay() + self.fs.set_max_mds(1) + self.fs.set_allow_standby_replay(True) + status = self._confirm_single_replay() + replays = list(status.get_replays(self.fs.id)) + self.assertEqual(len(replays), 1) + self.config_set('mds.'+replays[0]['name'], 'mds_inject_health_dummy', True) + time.sleep(10) # for something not to happen... + status = self._confirm_single_replay() + replays2 = list(status.get_replays(self.fs.id)) + self.assertEqual(replays[0]['gid'], replays2[0]['gid']) + + def test_rank_stopped(self): + """ + That when a rank is STOPPED, standby replays for + that rank get torn down + """ + + status = self._confirm_no_replay() + standby_count = len(list(status.get_standbys())) + self.fs.set_max_mds(2) + self.fs.set_allow_standby_replay(True) + status = self._confirm_single_replay() + + self.fs.set_max_mds(1) # stop rank 1 + + status = self._confirm_single_replay() + self.assertTrue(standby_count, len(list(status.get_standbys()))) + + +class TestMultiFilesystems(CephFSTestCase): + CLIENTS_REQUIRED = 2 + MDSS_REQUIRED = 4 + + # We'll create our own filesystems and start our own daemons + REQUIRE_FILESYSTEM = False + + def setUp(self): + super(TestMultiFilesystems, self).setUp() + self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set", + "enable_multiple", "true", + "--yes-i-really-mean-it") + + def _setup_two(self): + fs_a = self.mds_cluster.newfs(name="alpha") + fs_b = self.mds_cluster.newfs(name="bravo") + + self.mds_cluster.mds_restart() + + # Wait for both filesystems to go healthy + fs_a.wait_for_daemons() + fs_b.wait_for_daemons() + + # Reconfigure client auth caps + for mount in self.mounts: + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(mount.client_id), + 'mds', 'allow', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + fs_a.get_data_pool_name(), fs_b.get_data_pool_name())) + + return fs_a, fs_b + + def test_clients(self): + fs_a, fs_b = self._setup_two() + + # Mount a client on fs_a + self.mount_a.mount_wait(cephfs_name=fs_a.name) + self.mount_a.write_n_mb("pad.bin", 1) + self.mount_a.write_n_mb("test.bin", 2) + 
a_created_ino = self.mount_a.path_to_ino("test.bin") + self.mount_a.create_files() + + # Mount a client on fs_b + self.mount_b.mount_wait(cephfs_name=fs_b.name) + self.mount_b.write_n_mb("test.bin", 1) + b_created_ino = self.mount_b.path_to_ino("test.bin") + self.mount_b.create_files() + + # Check that a non-default filesystem mount survives an MDS + # failover (i.e. that map subscription is continuous, not + # just the first time), reproduces #16022 + old_fs_b_mds = fs_b.get_active_names()[0] + self.mds_cluster.mds_stop(old_fs_b_mds) + self.mds_cluster.mds_fail(old_fs_b_mds) + fs_b.wait_for_daemons() + background = self.mount_b.write_background() + # Raise exception if the write doesn't finish (i.e. if client + # has not kept up with MDS failure) + try: + self.wait_until_true(lambda: background.finished, timeout=30) + except RuntimeError: + # The mount is stuck, we'll have to force it to fail cleanly + background.stdin.close() + self.mount_b.umount_wait(force=True) + raise + + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + # See that the client's files went into the correct pool + self.assertTrue(fs_a.data_objects_present(a_created_ino, 1024 * 1024)) + self.assertTrue(fs_b.data_objects_present(b_created_ino, 1024 * 1024)) + + def test_standby(self): + fs_a, fs_b = self._setup_two() + + # Assert that the remaining two MDS daemons are now standbys + a_daemons = fs_a.get_active_names() + b_daemons = fs_b.get_active_names() + self.assertEqual(len(a_daemons), 1) + self.assertEqual(len(b_daemons), 1) + original_a = a_daemons[0] + original_b = b_daemons[0] + expect_standby_daemons = set(self.mds_cluster.mds_ids) - (set(a_daemons) | set(b_daemons)) + + # Need all my standbys up as well as the active daemons + self.wait_for_daemon_start() + self.assertEqual(expect_standby_daemons, self.mds_cluster.get_standby_daemons()) + + # Kill fs_a's active MDS, see a standby take over + self.mds_cluster.mds_stop(original_a) + self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_a) + self.wait_until_equal(lambda: len(fs_a.get_active_names()), 1, 30, + reject_fn=lambda v: v > 1) + # Assert that it's a *different* daemon that has now appeared in the map for fs_a + self.assertNotEqual(fs_a.get_active_names()[0], original_a) + + # Kill fs_b's active MDS, see a standby take over + self.mds_cluster.mds_stop(original_b) + self.mds_cluster.mon_manager.raw_cluster_cmd("mds", "fail", original_b) + self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30, + reject_fn=lambda v: v > 1) + # Assert that it's a *different* daemon that has now appeared in the map for fs_a + self.assertNotEqual(fs_b.get_active_names()[0], original_b) + + # Both of the original active daemons should be gone, and all standbys used up + self.assertEqual(self.mds_cluster.get_standby_daemons(), set()) + + # Restart the ones I killed, see them reappear as standbys + self.mds_cluster.mds_restart(original_a) + self.mds_cluster.mds_restart(original_b) + self.wait_until_true( + lambda: {original_a, original_b} == self.mds_cluster.get_standby_daemons(), + timeout=30 + ) + + def test_grow_shrink(self): + # Usual setup... 
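The cap reconfiguration in _setup_two() above grants one client access to both data pools with a raw 'auth caps' call. On recent releases the per-file-system equivalent is usually 'ceph fs authorize'; a hedged sketch with illustrative client and file system names:

import subprocess

def authorize_client(fs_name, client_id, path="/", perms="rw"):
    # Asks the cluster to generate MDS/MON/OSD caps for one file system and
    # prints the resulting keyring entry on stdout.
    proc = subprocess.run(
        ["ceph", "fs", "authorize", fs_name, f"client.{client_id}", path, perms],
        check=True, capture_output=True, text=True)
    return proc.stdout

# e.g. authorize_client("alpha", "0") followed by authorize_client("bravo", "0")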
+ fs_a, fs_b = self._setup_two() + + # Increase max_mds on fs_b, see a standby take up the role + fs_b.set_max_mds(2) + self.wait_until_equal(lambda: len(fs_b.get_active_names()), 2, 30, + reject_fn=lambda v: v > 2 or v < 1) + + # Increase max_mds on fs_a, see a standby take up the role + fs_a.set_max_mds(2) + self.wait_until_equal(lambda: len(fs_a.get_active_names()), 2, 30, + reject_fn=lambda v: v > 2 or v < 1) + + # Shrink fs_b back to 1, see a daemon go back to standby + fs_b.set_max_mds(1) + self.wait_until_equal(lambda: len(fs_b.get_active_names()), 1, 30, + reject_fn=lambda v: v > 2 or v < 1) + + # Grow fs_a up to 3, see the former fs_b daemon join it. + fs_a.set_max_mds(3) + self.wait_until_equal(lambda: len(fs_a.get_active_names()), 3, 60, + reject_fn=lambda v: v > 3 or v < 2) diff --git a/qa/tasks/cephfs/test_flush.py b/qa/tasks/cephfs/test_flush.py new file mode 100644 index 000000000..17cb84970 --- /dev/null +++ b/qa/tasks/cephfs/test_flush.py @@ -0,0 +1,112 @@ + +from textwrap import dedent +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO + + +class TestFlush(CephFSTestCase): + def test_flush(self): + self.mount_a.run_shell(["mkdir", "mydir"]) + self.mount_a.run_shell(["touch", "mydir/alpha"]) + dir_ino = self.mount_a.path_to_ino("mydir") + file_ino = self.mount_a.path_to_ino("mydir/alpha") + + # Unmount the client so that it isn't still holding caps + self.mount_a.umount_wait() + + # Before flush, the dirfrag object does not exist + with self.assertRaises(ObjectNotFound): + self.fs.list_dirfrag(dir_ino) + + # Before flush, the file's backtrace has not been written + with self.assertRaises(ObjectNotFound): + self.fs.read_backtrace(file_ino) + + # Before flush, there are no dentries in the root + self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) + + # Execute flush + flush_data = self.fs.mds_asok(["flush", "journal"]) + self.assertEqual(flush_data['return_code'], 0) + + # After flush, the dirfrag object has been created + dir_list = self.fs.list_dirfrag(dir_ino) + self.assertEqual(dir_list, ["alpha_head"]) + + # And the 'mydir' dentry is in the root + self.assertEqual(self.fs.list_dirfrag(ROOT_INO), ['mydir_head']) + + # ...and the data object has its backtrace + backtrace = self.fs.read_backtrace(file_ino) + self.assertEqual(['alpha', 'mydir'], [a['dname'] for a in backtrace['ancestors']]) + self.assertEqual([dir_ino, 1], [a['dirino'] for a in backtrace['ancestors']]) + self.assertEqual(file_ino, backtrace['ino']) + + # ...and the journal is truncated to just a single subtreemap from the + # newly created segment + summary_output = self.fs.journal_tool(["event", "get", "summary"], 0) + try: + self.assertEqual(summary_output, + dedent( + """ + Events by type: + SUBTREEMAP: 1 + Errors: 0 + """ + ).strip()) + except AssertionError: + # In some states, flushing the journal will leave you + # an extra event from locks a client held. This is + # correct behaviour: the MDS is flushing the journal, + # it's just that new events are getting added too. + # In this case, we should nevertheless see a fully + # empty journal after a second flush. 
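The journal summary being checked here is produced by cephfs-journal-tool. A minimal sketch of collecting the same summary by hand against rank 0, with the file system name as a placeholder:

import subprocess

def journal_event_summary(fs_name="cephfs", rank=0):
    # Prints per-event-type counts (e.g. "SUBTREEMAP: 1") and an error count.
    proc = subprocess.run(
        ["cephfs-journal-tool", "--rank", f"{fs_name}:{rank}",
         "event", "get", "summary"],
        check=True, capture_output=True, text=True)
    return proc.stdout.strip()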
+ self.assertEqual(summary_output, + dedent( + """ + Events by type: + SUBTREEMAP: 1 + UPDATE: 1 + Errors: 0 + """ + ).strip()) + flush_data = self.fs.mds_asok(["flush", "journal"]) + self.assertEqual(flush_data['return_code'], 0) + self.assertEqual(self.fs.journal_tool(["event", "get", "summary"], 0), + dedent( + """ + Events by type: + SUBTREEMAP: 1 + Errors: 0 + """ + ).strip()) + + # Now for deletion! + # We will count the RADOS deletions and MDS file purges, to verify that + # the expected behaviour is happening as a result of the purge + initial_dels = self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete'] + initial_purges = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued'] + + # Use a client to delete a file + self.mount_a.mount_wait() + self.mount_a.run_shell(["rm", "-rf", "mydir"]) + + # Flush the journal so that the directory inode can be purged + flush_data = self.fs.mds_asok(["flush", "journal"]) + self.assertEqual(flush_data['return_code'], 0) + + # We expect to see a single file purge + self.wait_until_true( + lambda: self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['strays_enqueued'] - initial_purges >= 2, + 60) + + # We expect two deletions, one of the dirfrag and one of the backtrace + self.wait_until_true( + lambda: self.fs.mds_asok(['perf', 'dump', 'objecter'])['objecter']['osdop_delete'] - initial_dels >= 2, + 60) # timeout is fairly long to allow for tick+rados latencies + + with self.assertRaises(ObjectNotFound): + self.fs.list_dirfrag(dir_ino) + with self.assertRaises(ObjectNotFound): + self.fs.read_backtrace(file_ino) + self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) diff --git a/qa/tasks/cephfs/test_forward_scrub.py b/qa/tasks/cephfs/test_forward_scrub.py new file mode 100644 index 000000000..f3cec881b --- /dev/null +++ b/qa/tasks/cephfs/test_forward_scrub.py @@ -0,0 +1,307 @@ + +""" +Test that the forward scrub functionality can traverse metadata and apply +requested tags, on well formed metadata. + +This is *not* the real testing for forward scrub, which will need to test +how the functionality responds to damaged metadata. 
+ +""" +import logging +import json + +from collections import namedtuple +from io import BytesIO +from textwrap import dedent + +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +import struct + +log = logging.getLogger(__name__) + + +ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) + + +class TestForwardScrub(CephFSTestCase): + MDSS_REQUIRED = 1 + + def _read_str_xattr(self, pool, obj, attr): + """ + Read a ceph-encoded string from a rados xattr + """ + output = self.fs.mon_manager.do_rados(["getxattr", obj, attr], pool=pool, + stdout=BytesIO()).stdout.getvalue() + strlen = struct.unpack('i', output[0:4])[0] + return output[4:(4 + strlen)].decode(encoding='ascii') + + def _get_paths_to_ino(self): + inos = {} + p = self.mount_a.run_shell(["find", "./"]) + paths = p.stdout.getvalue().strip().split() + for path in paths: + inos[path] = self.mount_a.path_to_ino(path) + + return inos + + def test_apply_tag(self): + self.mount_a.run_shell(["mkdir", "parentdir"]) + self.mount_a.run_shell(["mkdir", "parentdir/childdir"]) + self.mount_a.run_shell(["touch", "rfile"]) + self.mount_a.run_shell(["touch", "parentdir/pfile"]) + self.mount_a.run_shell(["touch", "parentdir/childdir/cfile"]) + + # Build a structure mapping path to inode, as we will later want + # to check object by object and objects are named after ino number + inos = self._get_paths_to_ino() + + # Flush metadata: this is a friendly test of forward scrub so we're skipping + # the part where it's meant to cope with dirty metadata + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"]) + + tag = "mytag" + + # Execute tagging forward scrub + self.fs.mds_asok(["tag", "path", "/parentdir", tag]) + # Wait for completion + import time + time.sleep(10) + # FIXME watching clog isn't a nice mechanism for this, once we have a ScrubMap we'll + # watch that instead + + # Check that dirs were tagged + for dirpath in ["./parentdir", "./parentdir/childdir"]: + self.assertTagged(inos[dirpath], tag, self.fs.get_metadata_pool_name()) + + # Check that files were tagged + for filepath in ["./parentdir/pfile", "./parentdir/childdir/cfile"]: + self.assertTagged(inos[filepath], tag, self.fs.get_data_pool_name()) + + # This guy wasn't in the tag path, shouldn't have been tagged + self.assertUntagged(inos["./rfile"]) + + def assertUntagged(self, ino): + file_obj_name = "{0:x}.00000000".format(ino) + with self.assertRaises(CommandFailedError): + self._read_str_xattr( + self.fs.get_data_pool_name(), + file_obj_name, + "scrub_tag" + ) + + def assertTagged(self, ino, tag, pool): + file_obj_name = "{0:x}.00000000".format(ino) + wrote = self._read_str_xattr( + pool, + file_obj_name, + "scrub_tag" + ) + self.assertEqual(wrote, tag) + + def _validate_linkage(self, expected): + inos = self._get_paths_to_ino() + try: + self.assertDictEqual(inos, expected) + except AssertionError: + log.error("Expected: {0}".format(json.dumps(expected, indent=2))) + log.error("Actual: {0}".format(json.dumps(inos, indent=2))) + raise + + def test_orphan_scan(self): + # Create some files whose metadata we will flush + self.mount_a.run_python(dedent(""" + import os + mount_point = "{mount_point}" + parent = os.path.join(mount_point, "parent") + os.mkdir(parent) + flushed = os.path.join(parent, "flushed") + os.mkdir(flushed) + for f in ["alpha", "bravo", "charlie"]: + open(os.path.join(flushed, f), 'w').write(f) + """.format(mount_point=self.mount_a.mountpoint))) + + inos = 
self._get_paths_to_ino() + + # Flush journal + # Umount before flush to avoid cap releases putting + # things we don't want in the journal later. + self.mount_a.umount_wait() + self.fs.flush() + + # Create a new inode that's just in the log, i.e. would + # look orphaned to backward scan if backward scan wisnae + # respectin' tha scrub_tag xattr. + self.mount_a.mount_wait() + self.mount_a.run_shell(["mkdir", "parent/unflushed"]) + self.mount_a.run_shell(["dd", "if=/dev/urandom", + "of=./parent/unflushed/jfile", + "bs=1M", "count=8"]) + inos["./parent/unflushed"] = self.mount_a.path_to_ino("./parent/unflushed") + inos["./parent/unflushed/jfile"] = self.mount_a.path_to_ino("./parent/unflushed/jfile") + self.mount_a.umount_wait() + + # Orphan an inode by deleting its dentry + # Our victim will be.... bravo. + self.mount_a.umount_wait() + self.fs.fail() + self.fs.set_ceph_conf('mds', 'mds verify scatter', False) + self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) + frag_obj_id = "{0:x}.00000000".format(inos["./parent/flushed"]) + self.fs.radosm(["rmomapkey", frag_obj_id, "bravo_head"]) + + self.fs.set_joinable() + self.fs.wait_for_daemons() + + # See that the orphaned file is indeed missing from a client's POV + self.mount_a.mount_wait() + damaged_state = self._get_paths_to_ino() + self.assertNotIn("./parent/flushed/bravo", damaged_state) + self.mount_a.umount_wait() + + # Run a tagging forward scrub + tag = "mytag123" + self.fs.rank_asok(["tag", "path", "/parent", tag]) + + # See that the orphan wisnae tagged + self.assertUntagged(inos['./parent/flushed/bravo']) + + # See that the flushed-metadata-and-still-present files are tagged + self.assertTagged(inos['./parent/flushed/alpha'], tag, self.fs.get_data_pool_name()) + self.assertTagged(inos['./parent/flushed/charlie'], tag, self.fs.get_data_pool_name()) + + # See that journalled-but-not-flushed file *was* tagged + self.assertTagged(inos['./parent/unflushed/jfile'], tag, self.fs.get_data_pool_name()) + + # okay, now we are going to run cephfs-data-scan. It's necessary to + # have a clean journal otherwise replay will blowup on mismatched + # inotable versions (due to scan_links) + self.fs.flush() + self.fs.fail() + self.fs.journal_tool(["journal", "reset", "--force"], 0) + + # Run cephfs-data-scan targeting only orphans + self.fs.data_scan(["scan_extents", self.fs.get_data_pool_name()]) + self.fs.data_scan([ + "scan_inodes", + "--filter-tag", tag, + self.fs.get_data_pool_name() + ]) + self.fs.data_scan(["scan_links"]) + + # After in-place injection stats should be kosher again + self.fs.set_ceph_conf('mds', 'mds verify scatter', True) + self.fs.set_ceph_conf('mds', 'mds debug scatterstat', True) + + # And we should have all the same linkage we started with, + # and no lost+found, and no extra inodes! 
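The recovery steps above wrap the cephfs-data-scan phases. A hedged sketch of the same sequence as an operator would run it, with the MDS down and the data pool name as a placeholder; the optional tag mirrors the test's use of --filter-tag so that files already marked by a prior forward scrub are left alone and only orphans are injected:

import subprocess

def run_data_scan(data_pool, tag=None):
    # Phase 1: recover file sizes/layouts from the data objects.
    subprocess.run(["cephfs-data-scan", "scan_extents", data_pool], check=True)
    # Phase 2: inject inodes back into the metadata pool; with a tag, files
    # carrying that scrub tag are skipped.
    cmd = ["cephfs-data-scan", "scan_inodes"]
    if tag:
        cmd += ["--filter-tag", tag]
    subprocess.run(cmd + [data_pool], check=True)
    # Phase 3: repair dentry linkage and the inode tables.
    subprocess.run(["cephfs-data-scan", "scan_links"], check=True)

# e.g. run_data_scan("cephfs_data", tag="mytag123")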
+ self.fs.set_joinable() + self.fs.wait_for_daemons() + self.mount_a.mount_wait() + self._validate_linkage(inos) + + def _stash_inotable(self): + # Get all active ranks + ranks = self.fs.get_all_mds_rank() + + inotable_dict = {} + for rank in ranks: + inotable_oid = "mds{rank:d}_".format(rank=rank) + "inotable" + print("Trying to fetch inotable object: " + inotable_oid) + + #self.fs.get_metadata_object("InoTable", "mds0_inotable") + inotable_raw = self.fs.radosmo(['get', inotable_oid, '-']) + inotable_dict[inotable_oid] = inotable_raw + return inotable_dict + + def test_inotable_sync(self): + self.mount_a.write_n_mb("file1_sixmegs", 6) + + # Flush journal + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"]) + + inotable_copy = self._stash_inotable() + + self.mount_a.mount_wait() + + self.mount_a.write_n_mb("file2_sixmegs", 6) + self.mount_a.write_n_mb("file3_sixmegs", 6) + + inos = self._get_paths_to_ino() + + # Flush journal + self.mount_a.umount_wait() + self.fs.mds_asok(["flush", "journal"]) + + self.mount_a.umount_wait() + + with self.assert_cluster_log("inode table repaired", invert_match=True): + out_json = self.fs.run_scrub(["start", "/", "repair,recursive"]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + self.fs.fail() + + # Truncate the journal (to ensure the inotable on disk + # is all that will be in the InoTable in memory) + + self.fs.journal_tool(["event", "splice", + "--inode={0}".format(inos["./file2_sixmegs"]), "summary"], 0) + + self.fs.journal_tool(["event", "splice", + "--inode={0}".format(inos["./file3_sixmegs"]), "summary"], 0) + + # Revert to old inotable. + for key, value in inotable_copy.items(): + self.fs.radosm(["put", key, "-"], stdin=BytesIO(value)) + + self.fs.set_joinable() + self.fs.wait_for_daemons() + + with self.assert_cluster_log("inode table repaired"): + out_json = self.fs.run_scrub(["start", "/", "repair,recursive"]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + self.fs.fail() + table_text = self.fs.table_tool(["0", "show", "inode"]) + table = json.loads(table_text) + self.assertGreater( + table['0']['data']['inotable']['free'][0]['start'], + inos['./file3_sixmegs']) + + def test_backtrace_repair(self): + """ + That the MDS can repair an inodes backtrace in the data pool + if it is found to be damaged. 
+ """ + # Create a file for subsequent checks + self.mount_a.run_shell(["mkdir", "parent_a"]) + self.mount_a.run_shell(["touch", "parent_a/alpha"]) + file_ino = self.mount_a.path_to_ino("parent_a/alpha") + + # That backtrace and layout are written after initial flush + self.fs.mds_asok(["flush", "journal"]) + backtrace = self.fs.read_backtrace(file_ino) + self.assertEqual(['alpha', 'parent_a'], + [a['dname'] for a in backtrace['ancestors']]) + + # Go corrupt the backtrace + self.fs._write_data_xattr(file_ino, "parent", + "oh i'm sorry did i overwrite your xattr?") + + with self.assert_cluster_log("bad backtrace on inode"): + out_json = self.fs.run_scrub(["start", "/", "repair,recursive"]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + self.fs.mds_asok(["flush", "journal"]) + backtrace = self.fs.read_backtrace(file_ino) + self.assertEqual(['alpha', 'parent_a'], + [a['dname'] for a in backtrace['ancestors']]) diff --git a/qa/tasks/cephfs/test_fragment.py b/qa/tasks/cephfs/test_fragment.py new file mode 100644 index 000000000..7d35ec0df --- /dev/null +++ b/qa/tasks/cephfs/test_fragment.py @@ -0,0 +1,359 @@ +from io import StringIO + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.orchestra import run + +import os +import time +import logging +log = logging.getLogger(__name__) + + +class TestFragmentation(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def get_splits(self): + return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_split'] + + def get_merges(self): + return self.fs.mds_asok(['perf', 'dump', 'mds'])['mds']['dir_merge'] + + def get_dir_ino(self, path): + dir_cache = self.fs.read_cache(path, 0) + dir_ino = None + dir_inono = self.mount_a.path_to_ino(path.strip("/")) + for ino in dir_cache: + if ino['ino'] == dir_inono: + dir_ino = ino + break + self.assertIsNotNone(dir_ino) + return dir_ino + + def _configure(self, **kwargs): + """ + Apply kwargs as MDS configuration settings, enable dirfrags + and restart the MDSs. + """ + + for k, v in kwargs.items(): + self.ceph_cluster.set_ceph_conf("mds", k, v.__str__()) + + self.mds_cluster.mds_fail_restart() + self.fs.wait_for_daemons() + + def test_oversize(self): + """ + That a directory is split when it becomes too large. + """ + + split_size = 20 + merge_size = 5 + + self._configure( + mds_bal_split_size=split_size, + mds_bal_merge_size=merge_size, + mds_bal_split_bits=1 + ) + + self.assertEqual(self.get_splits(), 0) + + self.mount_a.create_n_files("splitdir/file", split_size + 1) + + self.wait_until_true( + lambda: self.get_splits() == 1, + timeout=30 + ) + + frags = self.get_dir_ino("/splitdir")['dirfrags'] + self.assertEqual(len(frags), 2) + self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*") + self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*") + self.assertEqual( + sum([len(f['dentries']) for f in frags]), + split_size + 1 + ) + + self.assertEqual(self.get_merges(), 0) + + self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")]) + + self.wait_until_true( + lambda: self.get_merges() == 1, + timeout=30 + ) + + self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 1) + + def test_rapid_creation(self): + """ + That the fast-splitting limit of 1.5x normal limit is + applied when creating dentries quickly. 
+ """ + + split_size = 100 + merge_size = 1 + + self._configure( + mds_bal_split_size=split_size, + mds_bal_merge_size=merge_size, + mds_bal_split_bits=3, + mds_bal_fragment_size_max=int(split_size * 1.5 + 2) + ) + + # We test this only at a single split level. If a client was sending + # IO so fast that it hit a second split before the first split + # was complete, it could violate mds_bal_fragment_size_max -- there + # is a window where the child dirfrags of a split are unfrozen + # (so they can grow), but still have STATE_FRAGMENTING (so they + # can't be split). + + # By writing 4x the split size when the split bits are set + # to 3 (i.e. 4-ways), I am reasonably sure to see precisely + # one split. The test is to check whether that split + # happens soon enough that the client doesn't exceed + # 2x the split_size (the "immediate" split mode should + # kick in at 1.5x the split size). + + self.assertEqual(self.get_splits(), 0) + self.mount_a.create_n_files("splitdir/file", split_size * 4) + self.wait_until_equal( + self.get_splits, + 1, + reject_fn=lambda s: s > 1, + timeout=30 + ) + + def test_deep_split(self): + """ + That when the directory grows many times larger than split size, + the fragments get split again. + """ + + split_size = 100 + merge_size = 1 # i.e. don't merge frag unless its empty + split_bits = 1 + + branch_factor = 2**split_bits + + # Arbitrary: how many levels shall we try fragmenting before + # ending the test? + max_depth = 5 + + self._configure( + mds_bal_split_size=split_size, + mds_bal_merge_size=merge_size, + mds_bal_split_bits=split_bits + ) + + # Each iteration we will create another level of fragments. The + # placement of dentries into fragments is by hashes (i.e. pseudo + # random), so we rely on statistics to get the behaviour that + # by writing about 1.5x as many dentries as the split_size times + # the number of frags, we will get them all to exceed their + # split size and trigger a split. + depth = 0 + files_written = 0 + splits_expected = 0 + while depth < max_depth: + log.info("Writing files for depth {0}".format(depth)) + target_files = branch_factor**depth * int(split_size * 1.5) + create_files = target_files - files_written + + self.ceph_cluster.mon_manager.raw_cluster_cmd("log", + "{0} Writing {1} files (depth={2})".format( + self.__class__.__name__, create_files, depth + )) + self.mount_a.create_n_files("splitdir/file_{0}".format(depth), + create_files) + self.ceph_cluster.mon_manager.raw_cluster_cmd("log", + "{0} Done".format(self.__class__.__name__)) + + files_written += create_files + log.info("Now have {0} files".format(files_written)) + + splits_expected += branch_factor**depth + log.info("Waiting to see {0} splits".format(splits_expected)) + try: + self.wait_until_equal( + self.get_splits, + splits_expected, + timeout=30, + reject_fn=lambda x: x > splits_expected + ) + + frags = self.get_dir_ino("/splitdir")['dirfrags'] + self.assertEqual(len(frags), branch_factor**(depth+1)) + self.assertEqual( + sum([len(f['dentries']) for f in frags]), + target_files + ) + except: + # On failures, log what fragmentation we actually ended + # up with. This block is just for logging, at the end + # we raise the exception again. 
+ frags = self.get_dir_ino("/splitdir")['dirfrags'] + log.info("depth={0} splits_expected={1} files_written={2}".format( + depth, splits_expected, files_written + )) + log.info("Dirfrags:") + for f in frags: + log.info("{0}: {1}".format( + f['dirfrag'], len(f['dentries']) + )) + raise + + depth += 1 + + # Remember the inode number because we will be checking for + # objects later. + dir_inode_no = self.mount_a.path_to_ino("splitdir") + + self.mount_a.run_shell(["rm", "-rf", "splitdir/"]) + self.mount_a.umount_wait() + + self.fs.mds_asok(['flush', 'journal']) + + def _check_pq_finished(): + num_strays = self.fs.mds_asok(['perf', 'dump', 'mds_cache'])['mds_cache']['num_strays'] + pq_ops = self.fs.mds_asok(['perf', 'dump', 'purge_queue'])['purge_queue']['pq_executing'] + return num_strays == 0 and pq_ops == 0 + + # Wait for all strays to purge + self.wait_until_true( + lambda: _check_pq_finished(), + timeout=1200 + ) + # Check that the metadata pool objects for all the myriad + # child fragments are gone + metadata_objs = self.fs.radosmo(["ls"], stdout=StringIO()).strip() + frag_objs = [] + for o in metadata_objs.split("\n"): + if o.startswith("{0:x}.".format(dir_inode_no)): + frag_objs.append(o) + self.assertListEqual(frag_objs, []) + + def test_split_straydir(self): + """ + That stray dir is split when it becomes too large. + """ + def _count_fragmented(): + mdsdir_cache = self.fs.read_cache("~mdsdir", 1) + num = 0 + for ino in mdsdir_cache: + if ino["ino"] == 0x100: + continue + if len(ino["dirfrags"]) > 1: + log.info("straydir 0x{:X} is fragmented".format(ino["ino"])) + num += 1; + return num + + split_size = 50 + merge_size = 5 + split_bits = 1 + + self._configure( + mds_bal_split_size=split_size, + mds_bal_merge_size=merge_size, + mds_bal_split_bits=split_bits, + mds_bal_fragment_size_max=(split_size * 100) + ) + + # manually split/merge + self.assertEqual(_count_fragmented(), 0) + self.fs.mds_asok(["dirfrag", "split", "~mdsdir/stray8", "0/0", "1"]) + self.fs.mds_asok(["dirfrag", "split", "~mdsdir/stray9", "0/0", "1"]) + self.wait_until_true( + lambda: _count_fragmented() == 2, + timeout=30 + ) + + time.sleep(30) + + self.fs.mds_asok(["dirfrag", "merge", "~mdsdir/stray8", "0/0"]) + self.wait_until_true( + lambda: _count_fragmented() == 1, + timeout=30 + ) + + time.sleep(30) + + # auto merge + + # merging stray dirs is driven by MDCache::advance_stray() + # advance stray dir 10 times + for _ in range(10): + self.fs.mds_asok(['flush', 'journal']) + + self.wait_until_true( + lambda: _count_fragmented() == 0, + timeout=30 + ) + + # auto split + + # there are 10 stray dirs. advance stray dir 20 times + self.mount_a.create_n_files("testdir1/file", split_size * 20) + self.mount_a.run_shell(["mkdir", "testdir2"]) + testdir1_path = os.path.join(self.mount_a.mountpoint, "testdir1") + for i in self.mount_a.ls(testdir1_path): + self.mount_a.run_shell(["ln", "testdir1/{0}".format(i), "testdir2/"]) + + self.mount_a.umount_wait() + self.mount_a.mount_wait() + self.mount_a.wait_until_mounted() + + # flush journal and restart mds. after restart, testdir2 is not in mds' cache + self.fs.mds_asok(['flush', 'journal']) + self.mds_cluster.mds_fail_restart() + self.fs.wait_for_daemons() + # splitting stray dirs is driven by MDCache::advance_stray() + # advance stray dir after unlink 'split_size' files. 
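An editorial aside: in test_deep_split above, leftover dirfrag objects are found by prefix-matching a metadata-pool listing against the directory's inode number rendered in hex. A minimal standalone sketch of that check; the object names in the usage comment are illustrative only.

    def leftover_dirfrag_objects(object_names, dir_inode_no):
        # Dirfrag objects are named "<inode-in-hex>.<frag-id>", e.g. "10000000000.00000000".
        prefix = "{0:x}.".format(dir_inode_no)
        return [name for name in object_names if name.startswith(prefix)]

    # leftover_dirfrag_objects(["10000000000.00000000", "mds0_inotable"], 0x10000000000)
    # -> ["10000000000.00000000"]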
+ self.fs.mds_asok(['config', 'set', 'mds_log_events_per_segment', str(split_size)]) + + self.assertEqual(_count_fragmented(), 0) + self.mount_a.run_shell(["rm", "-rf", "testdir1"]) + self.wait_until_true( + lambda: _count_fragmented() > 0, + timeout=30 + ) + + def test_dir_merge_with_snap_items(self): + """ + That directory remain fragmented when snapshot items are taken into account. + """ + split_size = 1000 + merge_size = 100 + self._configure( + mds_bal_split_size=split_size, + mds_bal_merge_size=merge_size, + mds_bal_split_bits=1 + ) + + # split the dir + create_files = split_size + 50 + self.mount_a.create_n_files("splitdir/file_", create_files) + + self.wait_until_true( + lambda: self.get_splits() == 1, + timeout=30 + ) + + frags = self.get_dir_ino("/splitdir")['dirfrags'] + self.assertEqual(len(frags), 2) + self.assertEqual(frags[0]['dirfrag'], "0x10000000000.0*") + self.assertEqual(frags[1]['dirfrag'], "0x10000000000.1*") + self.assertEqual( + sum([len(f['dentries']) for f in frags]), create_files + ) + + self.assertEqual(self.get_merges(), 0) + + self.mount_a.run_shell(["mkdir", "splitdir/.snap/snap_a"]) + self.mount_a.run_shell(["mkdir", "splitdir/.snap/snap_b"]) + self.mount_a.run_shell(["rm", "-f", run.Raw("splitdir/file*")]) + + time.sleep(30) + + self.assertEqual(self.get_merges(), 0) + self.assertEqual(len(self.get_dir_ino("/splitdir")["dirfrags"]), 2) diff --git a/qa/tasks/cephfs/test_fscrypt.py b/qa/tasks/cephfs/test_fscrypt.py new file mode 100644 index 000000000..11dd2038f --- /dev/null +++ b/qa/tasks/cephfs/test_fscrypt.py @@ -0,0 +1,77 @@ +from logging import getLogger + +from io import StringIO +from tasks.cephfs.xfstests_dev import XFSTestsDev + + +log = getLogger(__name__) + + +class TestFscrypt(XFSTestsDev): + + def setup_xfsprogs_devs(self): + self.install_xfsprogs = True + + def require_kernel_mount(self): + from tasks.cephfs.fuse_mount import FuseMount + from tasks.cephfs.kernel_mount import KernelMount + + # TODO: make xfstests-dev compatible with ceph-fuse. xfstests-dev + # remounts CephFS before running tests using kernel, so ceph-fuse + # mounts are never actually tested. + if isinstance(self.mount_a, FuseMount): + self.skipTest('Requires kernel client; xfstests-dev not '\ + 'compatible with ceph-fuse ATM.') + elif isinstance(self.mount_a, KernelMount): + log.info('client is kernel mounted') + + def test_fscrypt_encrypt(self): + self.require_kernel_mount() + + # XXX: check_status is set to False so that we can check for command's + # failure on our own (since this command doesn't set right error code + # and error message in some cases) and print custom log messages + # accordingly. + proc = self.mount_a.client_remote.run(args=['sudo', 'env', 'DIFF_LENGTH=0', + './check', '-g', 'encrypt'], cwd=self.xfstests_repo_path, stdout=StringIO(), + stderr=StringIO(), timeout=900, check_status=False, omit_sudo=False, + label='running tests for encrypt from xfstests-dev') + + if proc.returncode != 0: + log.info('Command failed.') + log.info(f'Command return value: {proc.returncode}') + stdout, stderr = proc.stdout.getvalue(), proc.stderr.getvalue() + log.info(f'Command stdout -\n{stdout}') + log.info(f'Command stderr -\n{stderr}') + + # Currently only the 395,396,397,421,429,435,440,580,593,595 and 598 + # of the 26 test cases will be actually ran, all the others will be + # skipped for now because of not supporting features in kernel or kceph. 
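An editorial aside: the fscrypt tests above deliberately pass check_status=False and inspect the return code themselves so stdout/stderr can be logged before asserting. A minimal local analogue of that pattern using only the standard library; the command and timeout in the usage comment are placeholders, not the suite's actual invocation.

    import subprocess

    def run_and_report(args, cwd=None, timeout=900):
        proc = subprocess.run(args, cwd=cwd, capture_output=True, text=True, timeout=timeout)
        if proc.returncode != 0:
            # Log everything before the caller decides whether the failure is fatal.
            print("command failed with exit status", proc.returncode)
            print("stdout:\n" + proc.stdout)
            print("stderr:\n" + proc.stderr)
        return proc.returncode, proc.stdout, proc.stderr

    # run_and_report(["./check", "-g", "encrypt"], cwd="/path/to/xfstests-dev")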
+ self.assertEqual(proc.returncode, 0) + self.assertIn('Passed all 26 tests', stdout) + + def test_fscrypt_dummy_encryption_with_quick_group(self): + self.require_kernel_mount() + + self.write_local_config('test_dummy_encryption') + + # XXX: check_status is set to False so that we can check for command's + # failure on our own (since this command doesn't set right error code + # and error message in some cases) and print custom log messages + # accordingly. This will take a long time and set the timeout to 3 hours. + proc = self.mount_a.client_remote.run(args=['sudo', 'env', 'DIFF_LENGTH=0', + './check', '-g', 'quick', '-E', './ceph.exclude'], cwd=self.xfstests_repo_path, + stdout=StringIO(), stderr=StringIO(), timeout=10800, check_status=False, + omit_sudo=False, label='running tests for dummy_encryption from xfstests-dev') + + if proc.returncode != 0: + log.info('Command failed.') + log.info(f'Command return value: {proc.returncode}') + stdout, stderr = proc.stdout.getvalue(), proc.stderr.getvalue() + log.info(f'Command stdout -\n{stdout}') + log.info(f'Command stderr -\n{stderr}') + + # Currently, many test cases will be skipped due to unsupported features, + # but still will be marked as successful. + self.assertEqual(proc.returncode, 0) + self.assertIn('Passed all ', stdout) diff --git a/qa/tasks/cephfs/test_fstop.py b/qa/tasks/cephfs/test_fstop.py new file mode 100644 index 000000000..ed76eaac2 --- /dev/null +++ b/qa/tasks/cephfs/test_fstop.py @@ -0,0 +1,114 @@ +import logging +import json + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.exceptions import CommandFailedError +from teuthology.contextutil import safe_while + +log = logging.getLogger(__name__) + + +class TestFSTop(CephFSTestCase): + CLIENTS_REQUIRED = 2 + + def setUp(self): + super(TestFSTop, self).setUp() + self._enable_mgr_stats_plugin() + + def tearDown(self): + self._disable_mgr_stats_plugin() + super(TestFSTop, self).tearDown() + + def _enable_mgr_stats_plugin(self): + return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", "stats") + + def _disable_mgr_stats_plugin(self): + return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", "stats") + + def _fstop_dump(self, *args): + return self.mount_a.run_shell(['cephfs-top', + '--id=admin', + *args]).stdout.getvalue() + + def _get_metrics(self, verifier_callback, trials, *args): + metrics = None + done = False + with safe_while(sleep=1, tries=trials, action='wait for metrics') as proceed: + while proceed(): + metrics = json.loads(self._fstop_dump(*args)) + done = verifier_callback(metrics) + if done: + break + return done, metrics + + # TESTS + def test_fstop_non_existent_cluster(self): + try: + self.mount_a.run_shell(['cephfs-top', + '--cluster=hpec', + '--id=admin', + '--selftest']) + except CommandFailedError: + pass + else: + raise RuntimeError('expected cephfs-top command to fail.') + + def test_fstop(self): + try: + self.mount_a.run_shell(['cephfs-top', + '--id=admin', + '--selftest']) + except CommandFailedError: + raise RuntimeError('cephfs-top --selftest failed') + + def test_dump(self): + """ + Tests 'cephfs-top --dump' output is valid + """ + def verify_fstop_metrics(metrics): + clients = metrics.get(self.fs.name, {}) + if str(self.mount_a.get_global_id()) in clients and \ + str(self.mount_b.get_global_id()) in clients: + return True + return False + + # validate + valid, metrics = self._get_metrics(verify_fstop_metrics, 30, '--dump') + log.debug("metrics={0}".format(metrics)) + 
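An editorial aside: _get_metrics() above is a poll-until-valid loop built on teuthology's safe_while. A rough standard-library equivalent, shown only to make the retry behaviour explicit; fetch_json would be something like the cephfs-top --dump wrapper and is_valid the verifier callback.

    import json
    import time

    def poll_for_metrics(fetch_json, is_valid, tries=30, sleep=1):
        metrics = None
        for _ in range(tries):
            metrics = json.loads(fetch_json())
            if is_valid(metrics):
                return True, metrics
            time.sleep(sleep)
        return False, metrics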
self.assertTrue(valid) + + def test_dumpfs(self): + """ + Tests 'cephfs-top --dumpfs' output is valid + """ + newfs_name = "cephfs_b" + + def verify_fstop_metrics(metrics): + clients = metrics.get(newfs_name, {}) + if self.fs.name not in metrics and \ + str(self.mount_b.get_global_id()) in clients: + return True + return False + + # umount mount_b, mount another filesystem on it and use --dumpfs filter + self.mount_b.umount_wait() + + self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set", "enable_multiple", "true", + "--yes-i-really-mean-it") + + # create a new filesystem + fs_b = self.mds_cluster.newfs(name=newfs_name) + + # mount cephfs_b on mount_b + self.mount_b.mount_wait(cephfs_name=fs_b.name) + + # validate + valid, metrics = self._get_metrics(verify_fstop_metrics, 30, + '--dumpfs={}'.format(newfs_name)) + log.debug("metrics={0}".format(metrics)) + + # restore mount_b + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.fs.name) + + self.assertTrue(valid) diff --git a/qa/tasks/cephfs/test_full.py b/qa/tasks/cephfs/test_full.py new file mode 100644 index 000000000..2b3a7d5f9 --- /dev/null +++ b/qa/tasks/cephfs/test_full.py @@ -0,0 +1,398 @@ +import json +import logging +import os +from textwrap import dedent +from typing import Optional +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase + + +log = logging.getLogger(__name__) + + +class FullnessTestCase(CephFSTestCase): + CLIENTS_REQUIRED = 2 + + # Subclasses define whether they're filling whole cluster or just data pool + data_only = False + + # Subclasses define how many bytes should be written to achieve fullness + pool_capacity: Optional[int] = None + fill_mb = None + + def is_full(self): + return self.fs.is_full() + + def setUp(self): + CephFSTestCase.setUp(self) + + mds_status = self.fs.rank_asok(["status"]) + + # Capture the initial OSD map epoch for later use + self.initial_osd_epoch = mds_status['osdmap_epoch_barrier'] + + def test_barrier(self): + """ + That when an OSD epoch barrier is set on an MDS, subsequently + issued capabilities cause clients to update their OSD map to that + epoch. + """ + + # script that sync up client with MDS OSD map barrier. The barrier should + # be updated by cap flush ack message. + pyscript = dedent(""" + import os + fd = os.open("{path}", os.O_CREAT | os.O_RDWR, 0O600) + os.fchmod(fd, 0O666) + os.fsync(fd) + os.close(fd) + """) + + # Sync up client with initial MDS OSD map barrier. + path = os.path.join(self.mount_a.mountpoint, "foo") + self.mount_a.run_python(pyscript.format(path=path)) + + # Grab mounts' initial OSD epochs: later we will check that + # it hasn't advanced beyond this point. 
+ mount_a_initial_epoch, mount_a_initial_barrier = self.mount_a.get_osd_epoch() + + # Freshly mounted at start of test, should be up to date with OSD map + self.assertGreaterEqual(mount_a_initial_epoch, self.initial_osd_epoch) + + # Set and unset a flag to cause OSD epoch to increment + self.fs.mon_manager.raw_cluster_cmd("osd", "set", "pause") + self.fs.mon_manager.raw_cluster_cmd("osd", "unset", "pause") + + out = self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json").strip() + new_epoch = json.loads(out)['epoch'] + self.assertNotEqual(self.initial_osd_epoch, new_epoch) + + # Do a metadata operation on clients, witness that they end up with + # the old OSD map from startup time (nothing has prompted client + # to update its map) + path = os.path.join(self.mount_a.mountpoint, "foo") + self.mount_a.run_python(pyscript.format(path=path)) + mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch() + self.assertEqual(mount_a_epoch, mount_a_initial_epoch) + self.assertEqual(mount_a_barrier, mount_a_initial_barrier) + + # Set a barrier on the MDS + self.fs.rank_asok(["osdmap", "barrier", new_epoch.__str__()]) + + # Sync up client with new MDS OSD map barrier + path = os.path.join(self.mount_a.mountpoint, "baz") + self.mount_a.run_python(pyscript.format(path=path)) + mount_a_epoch, mount_a_barrier = self.mount_a.get_osd_epoch() + self.assertEqual(mount_a_barrier, new_epoch) + + # Some time passes here because the metadata part of the operation + # completes immediately, while the resulting OSD map update happens + # asynchronously (it's an Objecter::_maybe_request_map) as a result + # of seeing the new epoch barrier. + self.wait_until_true( + lambda: self.mount_a.get_osd_epoch()[0] >= new_epoch, + timeout=30) + + def _data_pool_name(self): + data_pool_names = self.fs.get_data_pool_names() + if len(data_pool_names) > 1: + raise RuntimeError("This test can't handle multiple data pools") + else: + return data_pool_names[0] + + def _test_full(self, easy_case): + """ + - That a client trying to write data to a file is prevented + from doing so with an -EFULL result + - That they are also prevented from creating new files by the MDS. + - That they may delete another file to get the system healthy again + + :param easy_case: if true, delete a successfully written file to + free up space. else, delete the file that experienced + the failed write. + """ + + osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd')) + + log.info("Writing {0}MB should fill this cluster".format(self.fill_mb)) + + # Fill up the cluster. 
This dd may or may not fail, as it depends on + # how soon the cluster recognises its own fullness + self.mount_a.write_n_mb("large_file_a", self.fill_mb // 2) + try: + self.mount_a.write_n_mb("large_file_b", (self.fill_mb * 1.1) // 2) + except CommandFailedError: + log.info("Writing file B failed (full status happened already)") + assert self.is_full() + else: + log.info("Writing file B succeeded (full status will happen soon)") + self.wait_until_true(lambda: self.is_full(), + timeout=osd_mon_report_interval * 120) + + # Attempting to write more data should give me ENOSPC + with self.assertRaises(CommandFailedError) as ar: + self.mount_a.write_n_mb("large_file_b", 50, seek=self.fill_mb // 2) + self.assertEqual(ar.exception.exitstatus, 1) # dd returns 1 on "No space" + + # Wait for the MDS to see the latest OSD map so that it will reliably + # be applying the policy of rejecting non-deletion metadata operations + # while in the full state. + osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch'] + self.wait_until_true( + lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch, + timeout=10) + + if not self.data_only: + with self.assertRaises(CommandFailedError): + self.mount_a.write_n_mb("small_file_1", 0) + + # Clear out some space + if easy_case: + self.mount_a.run_shell(['rm', '-f', 'large_file_a']) + self.mount_a.run_shell(['rm', '-f', 'large_file_b']) + else: + # In the hard case it is the file that filled the system. + # Before the new #7317 (ENOSPC, epoch barrier) changes, this + # would fail because the last objects written would be + # stuck in the client cache as objecter operations. + self.mount_a.run_shell(['rm', '-f', 'large_file_b']) + self.mount_a.run_shell(['rm', '-f', 'large_file_a']) + + # Here we are waiting for two things to happen: + # * The MDS to purge the stray folder and execute object deletions + # * The OSDs to inform the mon that they are no longer full + self.wait_until_true(lambda: not self.is_full(), + timeout=osd_mon_report_interval * 120) + + # Wait for the MDS to see the latest OSD map so that it will reliably + # be applying the free space policy + osd_epoch = json.loads(self.fs.mon_manager.raw_cluster_cmd("osd", "dump", "--format=json-pretty"))['epoch'] + self.wait_until_true( + lambda: self.fs.rank_asok(['status'])['osdmap_epoch'] >= osd_epoch, + timeout=10) + + # Now I should be able to write again + self.mount_a.write_n_mb("large_file", 50, seek=0) + + # Ensure that the MDS keeps its OSD epoch barrier across a restart + + def test_full_different_file(self): + self._test_full(True) + + def test_full_same_file(self): + self._test_full(False) + + def _remote_write_test(self, template): + """ + Run some remote python in a way that's useful for + testing free space behaviour (see test_* methods using this) + """ + file_path = os.path.join(self.mount_a.mountpoint, "full_test_file") + + # Enough to trip the full flag + osd_mon_report_interval = int(self.fs.get_config("osd_mon_report_interval", service_type='osd')) + mon_tick_interval = int(self.fs.get_config("mon_tick_interval", service_type="mon")) + + # Sufficient data to cause RADOS cluster to go 'full' + log.info("pool capacity {0}, {1}MB should be enough to fill it".format(self.pool_capacity, self.fill_mb)) + + # Long enough for RADOS cluster to notice it is full and set flag on mons + # (report_interval for mon to learn PG stats, tick interval for it to update OSD map, + # factor of 1.5 for I/O + network latency in committing OSD map and 
distributing it + # to the OSDs) + full_wait = (osd_mon_report_interval + mon_tick_interval) * 1.5 + + # Configs for this test should bring this setting down in order to + # run reasonably quickly + if osd_mon_report_interval > 10: + log.warning("This test may run rather slowly unless you decrease" + "osd_mon_report_interval (5 is a good setting)!") + + # set the object_size to 1MB to make the objects destributed more evenly + # among the OSDs to fix Tracker#45434 + file_layout = "stripe_unit=1048576 stripe_count=1 object_size=1048576" + self.mount_a.run_python(template.format( + fill_mb=self.fill_mb, + file_path=file_path, + file_layout=file_layout, + full_wait=full_wait, + is_fuse=isinstance(self.mount_a, FuseMount) + )) + + def test_full_fclose(self): + # A remote script which opens a file handle, fills up the filesystem, and then + # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync + remote_script = dedent(""" + import time + import datetime + import subprocess + import os + + # Write some buffered data through before going full, all should be well + print("writing some data through which we expect to succeed") + bytes = 0 + f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT) + os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}') + bytes += os.write(f, b'a' * 512 * 1024) + os.fsync(f) + print("fsync'ed data successfully, will now attempt to fill fs") + + # Okay, now we're going to fill up the filesystem, and then keep + # writing until we see an error from fsync. As long as we're doing + # buffered IO, the error should always only appear from fsync and not + # from write + full = False + + for n in range(0, int({fill_mb} * 0.9)): + bytes += os.write(f, b'x' * 1024 * 1024) + print("wrote {{0}} bytes via buffered write, may repeat".format(bytes)) + print("done writing {{0}} bytes".format(bytes)) + + # OK, now we should sneak in under the full condition + # due to the time it takes the OSDs to report to the + # mons, and get a successful fsync on our full-making data + os.fsync(f) + print("successfully fsync'ed prior to getting full state reported") + + # buffered write, add more dirty data to the buffer + print("starting buffered write") + try: + for n in range(0, int({fill_mb} * 0.2)): + bytes += os.write(f, b'x' * 1024 * 1024) + print("sleeping a bit as we've exceeded 90% of our expected full ratio") + time.sleep({full_wait}) + except OSError: + pass; + + print("wrote, now waiting 30s and then doing a close we expect to fail") + + # Wait long enough for a background flush that should fail + time.sleep(30) + + if {is_fuse}: + # ...and check that the failed background flush is reflected in fclose + try: + os.close(f) + except OSError: + print("close() returned an error as expected") + else: + raise RuntimeError("close() failed to raise error") + else: + # The kernel cephfs client does not raise errors on fclose + os.close(f) + + os.unlink("{file_path}") + """) + self._remote_write_test(remote_script) + + def test_full_fsync(self): + """ + That when the full flag is encountered during asynchronous + flushes, such that an fwrite() succeeds but an fsync/fclose() + should return the ENOSPC error. 
+ """ + + # A remote script which opens a file handle, fills up the filesystem, and then + # checks that ENOSPC errors on buffered writes appear correctly as errors in fsync + remote_script = dedent(""" + import time + import datetime + import subprocess + import os + + # Write some buffered data through before going full, all should be well + print("writing some data through which we expect to succeed") + bytes = 0 + f = os.open("{file_path}", os.O_WRONLY | os.O_CREAT) + os.setxattr("{file_path}", 'ceph.file.layout', b'{file_layout}') + bytes += os.write(f, b'a' * 4096) + os.fsync(f) + print("fsync'ed data successfully, will now attempt to fill fs") + + # Okay, now we're going to fill up the filesystem, and then keep + # writing until we see an error from fsync. As long as we're doing + # buffered IO, the error should always only appear from fsync and not + # from write + full = False + + for n in range(0, int({fill_mb} * 1.1)): + try: + bytes += os.write(f, b'x' * 1024 * 1024) + print("wrote bytes via buffered write, moving on to fsync") + except OSError as e: + if {is_fuse}: + print("Unexpected error %s from write() instead of fsync()" % e) + raise + else: + print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0))) + full = True + break + + try: + os.fsync(f) + print("fsync'ed successfully") + except OSError as e: + print("Reached fullness after %.2f MB" % (bytes / (1024.0 * 1024.0))) + full = True + break + else: + print("Not full yet after %.2f MB" % (bytes / (1024.0 * 1024.0))) + + if n > {fill_mb} * 0.9: + # Be cautious in the last region where we expect to hit + # the full condition, so that we don't overshoot too dramatically + print("sleeping a bit as we've exceeded 90% of our expected full ratio") + time.sleep({full_wait}) + + if not full: + raise RuntimeError("Failed to reach fullness after writing %d bytes" % bytes) + + # close() should not raise an error because we already caught it in + # fsync. There shouldn't have been any more writeback errors + # since then because all IOs got cancelled on the full flag. + print("calling close") + os.close(f) + print("close() did not raise error") + + os.unlink("{file_path}") + """) + + self._remote_write_test(remote_script) + + +class TestQuotaFull(FullnessTestCase): + """ + Test per-pool fullness, which indicates quota limits exceeded + """ + pool_capacity = 1024 * 1024 * 32 # arbitrary low-ish limit + fill_mb = pool_capacity // (1024 * 1024) # type: ignore + + # We are only testing quota handling on the data pool, not the metadata + # pool. + data_only = True + + def setUp(self): + super(TestQuotaFull, self).setUp() + + pool_name = self.fs.get_data_pool_name() + self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", pool_name, + "max_bytes", "{0}".format(self.pool_capacity)) + + +class TestClusterFull(FullnessTestCase): + """ + Test data pool fullness, which indicates that an OSD has become too full + """ + pool_capacity = None + REQUIRE_MEMSTORE = True + + def setUp(self): + super(TestClusterFull, self).setUp() + + if self.pool_capacity is None: + TestClusterFull.pool_capacity = self.fs.get_pool_df(self._data_pool_name())['max_avail'] + TestClusterFull.fill_mb = (self.pool_capacity // (1024 * 1024)) + +# Hide the parent class so that unittest.loader doesn't try to run it. 
+del globals()['FullnessTestCase'] diff --git a/qa/tasks/cephfs/test_journal_migration.py b/qa/tasks/cephfs/test_journal_migration.py new file mode 100644 index 000000000..67b514c22 --- /dev/null +++ b/qa/tasks/cephfs/test_journal_migration.py @@ -0,0 +1,100 @@ + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from tasks.workunit import task as workunit + +JOURNAL_FORMAT_LEGACY = 0 +JOURNAL_FORMAT_RESILIENT = 1 + + +class TestJournalMigration(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 2 + + def test_journal_migration(self): + old_journal_version = JOURNAL_FORMAT_LEGACY + new_journal_version = JOURNAL_FORMAT_RESILIENT + + self.mount_a.umount_wait() + self.fs.mds_stop() + + # Create a filesystem using the older journal format. + self.fs.set_ceph_conf('mds', 'mds journal format', old_journal_version) + self.fs.mds_restart() + self.fs.recreate() + + # Enable standby replay, to cover the bug case #8811 where + # a standby replay might mistakenly end up trying to rewrite + # the journal at the same time as an active daemon. + self.fs.set_allow_standby_replay(True) + + status = self.fs.wait_for_daemons() + + self.assertTrue(self.fs.get_replay(status=status) is not None) + + # Do some client work so that the log is populated with something. + with self.mount_a.mounted_wait(): + self.mount_a.create_files() + self.mount_a.check_files() # sanity, this should always pass + + # Run a more substantial workunit so that the length of the log to be + # coverted is going span at least a few segments + workunit(self.ctx, { + 'clients': { + "client.{0}".format(self.mount_a.client_id): ["suites/fsstress.sh"], + }, + "timeout": "3h" + }) + + # Modify the ceph.conf to ask the MDS to use the new journal format. + self.fs.set_ceph_conf('mds', 'mds journal format', new_journal_version) + + # Restart the MDS. + self.fs.mds_fail_restart() + + # This ensures that all daemons come up into a valid state + status = self.fs.wait_for_daemons() + + # Check that files created in the initial client workload are still visible + # in a client mount. + with self.mount_a.mounted_wait(): + self.mount_a.check_files() + + # Verify that the journal really has been rewritten. 
+ journal_version = self.fs.get_journal_version()
+ if journal_version != new_journal_version:
+ raise RuntimeError("Journal was not upgraded, version should be {0} but is {1}".format(
+ new_journal_version, journal_version
+ ))
+
+ # Verify that cephfs-journal-tool can now read the rewritten journal
+ inspect_out = self.fs.journal_tool(["journal", "inspect"], 0)
+ if not inspect_out.endswith(": OK"):
+ raise RuntimeError("Unexpected journal-tool result: '{0}'".format(
+ inspect_out
+ ))
+
+ self.fs.journal_tool(["event", "get", "json",
+ "--path", "/tmp/journal.json"], 0)
+ p = self.fs.tool_remote.sh([
+ "python3",
+ "-c",
+ "import json; print(len(json.load(open('/tmp/journal.json'))))"
+ ])
+ event_count = int(p.strip())
+ if event_count < 1000:
+ # Approximate value of "lots", expected from having run fsstress
+ raise RuntimeError("Unexpectedly few journal events: {0}".format(event_count))
+
+ # Do some client work to check that writing the log is still working
+ with self.mount_a.mounted_wait():
+ workunit(self.ctx, {
+ 'clients': {
+ "client.{0}".format(self.mount_a.client_id): ["fs/misc/trivial_sync.sh"],
+ },
+ "timeout": "3h"
+ })
+
+ # Check that both an active and a standby replay are still up
+ status = self.fs.status()
+ self.assertEqual(len(list(self.fs.get_replays(status=status))), 1)
+ self.assertEqual(len(list(self.fs.get_ranks(status=status))), 1)
diff --git a/qa/tasks/cephfs/test_journal_repair.py b/qa/tasks/cephfs/test_journal_repair.py
new file mode 100644
index 000000000..c5769784d
--- /dev/null
+++ b/qa/tasks/cephfs/test_journal_repair.py
@@ -0,0 +1,405 @@
+
+"""
+Test our tools for recovering the content of damaged journals
+"""
+
+import json
+import logging
+from textwrap import dedent
+import time
+
+from teuthology.exceptions import CommandFailedError, ConnectionLostError
+from tasks.cephfs.filesystem import ObjectNotFound, ROOT_INO
+from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology
+from tasks.workunit import task as workunit
+
+log = logging.getLogger(__name__)
+
+
+class TestJournalRepair(CephFSTestCase):
+ MDSS_REQUIRED = 2
+
+ def test_inject_to_empty(self):
+ """
+ That when some dentries are in the journal but nothing is in
+ the backing store, we correctly populate the backing store
+ from the journalled dentries.
+ """ + + # Inject metadata operations + self.mount_a.run_shell(["touch", "rootfile"]) + self.mount_a.run_shell(["mkdir", "subdir"]) + self.mount_a.run_shell(["touch", "subdir/subdirfile"]) + # There are several different paths for handling hardlinks, depending + # on whether an existing dentry (being overwritten) is also a hardlink + self.mount_a.run_shell(["mkdir", "linkdir"]) + + # Test inode -> remote transition for a dentry + self.mount_a.run_shell(["touch", "linkdir/link0"]) + self.mount_a.run_shell(["rm", "-f", "linkdir/link0"]) + self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link0"]) + + # Test nothing -> remote transition + self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link1"]) + + # Test remote -> inode transition + self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link2"]) + self.mount_a.run_shell(["rm", "-f", "linkdir/link2"]) + self.mount_a.run_shell(["touch", "linkdir/link2"]) + + # Test remote -> diff remote transition + self.mount_a.run_shell(["ln", "subdir/subdirfile", "linkdir/link3"]) + self.mount_a.run_shell(["rm", "-f", "linkdir/link3"]) + self.mount_a.run_shell(["ln", "rootfile", "linkdir/link3"]) + + # Test an empty directory + self.mount_a.run_shell(["mkdir", "subdir/subsubdir"]) + self.mount_a.run_shell(["sync"]) + + # Before we unmount, make a note of the inode numbers, later we will + # check that they match what we recover from the journal + rootfile_ino = self.mount_a.path_to_ino("rootfile") + subdir_ino = self.mount_a.path_to_ino("subdir") + linkdir_ino = self.mount_a.path_to_ino("linkdir") + subdirfile_ino = self.mount_a.path_to_ino("subdir/subdirfile") + subsubdir_ino = self.mount_a.path_to_ino("subdir/subsubdir") + + self.mount_a.umount_wait() + + # Stop the MDS + self.fs.fail() + + # Now, the journal should contain the operations, but the backing + # store shouldn't + with self.assertRaises(ObjectNotFound): + self.fs.list_dirfrag(subdir_ino) + self.assertEqual(self.fs.list_dirfrag(ROOT_INO), []) + + # Execute the dentry recovery, this should populate the backing store + self.fs.journal_tool(['event', 'recover_dentries', 'list'], 0) + + # Dentries in ROOT_INO are present + self.assertEqual(sorted(self.fs.list_dirfrag(ROOT_INO)), sorted(['rootfile_head', 'subdir_head', 'linkdir_head'])) + self.assertEqual(self.fs.list_dirfrag(subdir_ino), ['subdirfile_head', 'subsubdir_head']) + self.assertEqual(sorted(self.fs.list_dirfrag(linkdir_ino)), + sorted(['link0_head', 'link1_head', 'link2_head', 'link3_head'])) + + # Now check the MDS can read what we wrote: truncate the journal + # and start the mds. + self.fs.journal_tool(['journal', 'reset'], 0) + self.fs.set_joinable() + self.fs.wait_for_daemons() + + # List files + self.mount_a.mount_wait() + + # First ls -R to populate MDCache, such that hardlinks will + # resolve properly (recover_dentries does not create backtraces, + # so ordinarily hardlinks to inodes that happen not to have backtraces + # will be invisible in readdir). 
+ # FIXME: hook in forward scrub here to regenerate backtraces + proc = self.mount_a.run_shell(['ls', '-R']) + self.mount_a.umount_wait() # remount to clear client cache before our second ls + self.mount_a.mount_wait() + + proc = self.mount_a.run_shell(['ls', '-R']) + self.assertEqual(proc.stdout.getvalue().strip(), + dedent(""" + .: + linkdir + rootfile + subdir + + ./linkdir: + link0 + link1 + link2 + link3 + + ./subdir: + subdirfile + subsubdir + + ./subdir/subsubdir: + """).strip()) + + # Check the correct inos were preserved by path + self.assertEqual(rootfile_ino, self.mount_a.path_to_ino("rootfile")) + self.assertEqual(subdir_ino, self.mount_a.path_to_ino("subdir")) + self.assertEqual(subdirfile_ino, self.mount_a.path_to_ino("subdir/subdirfile")) + self.assertEqual(subsubdir_ino, self.mount_a.path_to_ino("subdir/subsubdir")) + + # Check that the hard link handling came out correctly + self.assertEqual(self.mount_a.path_to_ino("linkdir/link0"), subdirfile_ino) + self.assertEqual(self.mount_a.path_to_ino("linkdir/link1"), subdirfile_ino) + self.assertNotEqual(self.mount_a.path_to_ino("linkdir/link2"), subdirfile_ino) + self.assertEqual(self.mount_a.path_to_ino("linkdir/link3"), rootfile_ino) + + # Create a new file, ensure it is not issued the same ino as one of the + # recovered ones + self.mount_a.run_shell(["touch", "afterwards"]) + new_ino = self.mount_a.path_to_ino("afterwards") + self.assertNotIn(new_ino, [rootfile_ino, subdir_ino, subdirfile_ino]) + + # Check that we can do metadata ops in the recovered directory + self.mount_a.run_shell(["touch", "subdir/subsubdir/subsubdirfile"]) + + @for_teuthology # 308s + def test_reset(self): + """ + That after forcibly modifying the backing store, we can get back into + a good state by resetting the MDSMap. + + The scenario is that we have two active MDSs, and we lose the journals. Once + we have completely lost confidence in the integrity of the metadata, we want to + return the system to a single-MDS state to go into a scrub to recover what we + can. + """ + + # Set max_mds to 2 + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + rank0_gid = self.fs.get_rank(rank=0, status=status)['gid'] + self.fs.set_joinable(False) # no unintended failover + + # Create a dir on each rank + self.mount_a.run_shell_payload("mkdir {alpha,bravo} && touch {alpha,bravo}/file") + self.mount_a.setfattr("alpha/", "ceph.dir.pin", "0") + self.mount_a.setfattr("bravo/", "ceph.dir.pin", "1") + + # Ensure the pinning has taken effect and the /bravo dir is now + # migrated to rank 1. + self._wait_subtrees([('/bravo', 1), ('/alpha', 0)], rank=0, status=status) + + # Do some IO (this should be split across ranks according to + # the rank-pinned dirs) + self.mount_a.create_n_files("alpha/file", 1000) + self.mount_a.create_n_files("bravo/file", 1000) + + # Flush the journals so that we have some backing store data + # belonging to one MDS, and some to the other MDS. 
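An editorial aside: the setfattr() calls above use the CephFS vxattr ceph.dir.pin to pin a subtree to an MDS rank. From plain Python on a mounted client the same thing can be done with os.setxattr (Linux); the mount path in the usage comment is illustrative.

    import os

    def pin_directory(path, rank):
        # Pin the subtree rooted at `path` to the given MDS rank.
        os.setxattr(path, "ceph.dir.pin", str(rank).encode())

    # pin_directory("/mnt/cephfs/bravo", 1)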
+ self.fs.rank_asok(["flush", "journal"], rank=0) + self.fs.rank_asok(["flush", "journal"], rank=1) + + # Stop (hard) the second MDS daemon + self.fs.rank_fail(rank=1) + + # Wipe out the tables for MDS rank 1 so that it is broken and can't start + # (this is the simulated failure that we will demonstrate that the disaster + # recovery tools can get us back from) + self.fs.erase_metadata_objects(prefix="mds1_") + + # Try to access files from the client + blocked_ls = self.mount_a.run_shell(["ls", "-R"], wait=False) + + # Check that this "ls -R" blocked rather than completing: indicates + # it got stuck trying to access subtrees which were on the now-dead MDS. + log.info("Sleeping to check ls is blocked...") + time.sleep(60) + self.assertFalse(blocked_ls.finished) + + # This mount is now useless because it will depend on MDS rank 1, and MDS rank 1 + # is not coming back. Kill it. + log.info("Killing mount, it's blocked on the MDS we killed") + self.mount_a.kill() + self.mount_a.kill_cleanup() + try: + # Now that the mount is dead, the ls -R should error out. + blocked_ls.wait() + except (CommandFailedError, ConnectionLostError): + # The ConnectionLostError case is for kernel client, where + # killing the mount also means killing the node. + pass + + # See that the second MDS will crash when it starts and tries to + # acquire rank 1 + self.fs.set_joinable(True) + + # The daemon taking the damaged rank should start starting, then + # restart back into standby after asking the mon to mark the rank + # damaged. + def is_marked_damaged(): + mds_map = self.fs.get_mds_map() + return 1 in mds_map['damaged'] + + self.wait_until_true(is_marked_damaged, 60) + self.assertEqual(rank0_gid, self.fs.get_rank(rank=0)['gid']) + + # Now give up and go through a disaster recovery procedure + self.fs.fail() + # Invoke recover_dentries quietly, because otherwise log spews millions of lines + self.fs.journal_tool(["event", "recover_dentries", "summary"], 0, quiet=True) + self.fs.journal_tool(["event", "recover_dentries", "summary"], 1, quiet=True) + self.fs.table_tool(["0", "reset", "session"]) + self.fs.journal_tool(["journal", "reset"], 0) + self.fs.erase_mds_objects(1) + self.fs.mon_manager.raw_cluster_cmd('fs', 'reset', self.fs.name, + '--yes-i-really-mean-it') + + # Bring an MDS back online, mount a client, and see that we can walk the full + # filesystem tree again + self.fs.set_joinable(True) # redundant with `fs reset` + status = self.fs.wait_for_daemons() + self.assertEqual(len(list(self.fs.get_ranks(status=status))), 1) + self.mount_a.mount_wait() + self.mount_a.run_shell(["ls", "-R"], wait=True) + + def test_table_tool(self): + self.mount_a.run_shell(["touch", "foo"]) + self.fs.rank_asok(["flush", "journal"]) + + log.info(self.fs.table_tool(["all", "show", "inode"])) + log.info(self.fs.table_tool(["all", "show", "snap"])) + log.info(self.fs.table_tool(["all", "show", "session"])) + + # Inode table should always be the same because initial state + # and choice of inode are deterministic. 
+ # Should see one inode consumed + self.assertEqual( + json.loads(self.fs.table_tool(["all", "show", "inode"])), + {"0": { + "data": { + "version": 2, + "inotable": { + "projected_free": [ + {"start": 1099511628777, + "len": 1099511626775}], + "free": [ + {"start": 1099511628777, + "len": 1099511626775}]}}, + "result": 0}} + + ) + + # Should see one session + session_data = json.loads(self.fs.table_tool( + ["all", "show", "session"])) + self.assertEqual(len(session_data["0"]["data"]["sessions"]), 1) + self.assertEqual(session_data["0"]["result"], 0) + + # Should see no snaps + self.assertEqual( + json.loads(self.fs.table_tool(["all", "show", "snap"])), + {"version": 1, + "snapserver": {"last_snap": 1, + "last_created": 1, + "last_destroyed": 1, + "pending_noop": [], + "snaps": [], + "need_to_purge": {}, + "pending_update": [], + "pending_destroy": []}, + "result": 0} + ) + + # Reset everything + for table in ["session", "inode", "snap"]: + self.fs.table_tool(["all", "reset", table]) + + log.info(self.fs.table_tool(["all", "show", "inode"])) + log.info(self.fs.table_tool(["all", "show", "snap"])) + log.info(self.fs.table_tool(["all", "show", "session"])) + + # Should see 0 sessions + session_data = json.loads(self.fs.table_tool( + ["all", "show", "session"])) + self.assertEqual(len(session_data["0"]["data"]["sessions"]), 0) + self.assertEqual(session_data["0"]["result"], 0) + + # Should see entire inode range now marked free + self.assertEqual( + json.loads(self.fs.table_tool(["all", "show", "inode"])), + {"0": {"data": {"version": 1, + "inotable": {"projected_free": [ + {"start": 1099511627776, + "len": 1099511627776}], + "free": [ + {"start": 1099511627776, + "len": 1099511627776}]}}, + "result": 0}} + ) + + # Should see no snaps + self.assertEqual( + json.loads(self.fs.table_tool(["all", "show", "snap"])), + {"version": 1, + "snapserver": {"last_snap": 1, + "last_created": 1, + "last_destroyed": 1, + "pending_noop": [], + "snaps": [], + "need_to_purge": {}, + "pending_update": [], + "pending_destroy": []}, + "result": 0} + ) + + def test_table_tool_take_inos(self): + initial_range_start = 1099511627776 + initial_range_len = 1099511627776 + # Initially a completely clear range + self.assertEqual( + json.loads(self.fs.table_tool(["all", "show", "inode"])), + {"0": {"data": {"version": 0, + "inotable": {"projected_free": [ + {"start": initial_range_start, + "len": initial_range_len}], + "free": [ + {"start": initial_range_start, + "len": initial_range_len}]}}, + "result": 0}} + ) + + # Remove some + self.assertEqual( + json.loads(self.fs.table_tool(["all", "take_inos", "{0}".format(initial_range_start + 100)])), + {"0": {"data": {"version": 1, + "inotable": {"projected_free": [ + {"start": initial_range_start + 101, + "len": initial_range_len - 101}], + "free": [ + {"start": initial_range_start + 101, + "len": initial_range_len - 101}]}}, + "result": 0}} + ) + + @for_teuthology # Hack: "for_teuthology" because .sh doesn't work outside teuth + def test_journal_smoke(self): + workunit(self.ctx, { + 'clients': { + "client.{0}".format(self.mount_a.client_id): [ + "fs/misc/trivial_sync.sh"], + }, + "timeout": "1h" + }) + + for mount in self.mounts: + mount.umount_wait() + + self.fs.fail() + + # journal tool smoke + workunit(self.ctx, { + 'clients': { + "client.{0}".format(self.mount_a.client_id): [ + "suites/cephfs_journal_tool_smoke.sh"], + }, + "timeout": "1h" + }) + + + + self.fs.set_joinable() + self.fs.wait_for_daemons() + + self.mount_a.mount_wait() + + # trivial sync moutn a + 
workunit(self.ctx, { + 'clients': { + "client.{0}".format(self.mount_a.client_id): [ + "fs/misc/trivial_sync.sh"], + }, + "timeout": "1h" + }) + diff --git a/qa/tasks/cephfs/test_mantle.py b/qa/tasks/cephfs/test_mantle.py new file mode 100644 index 000000000..746c2ffe3 --- /dev/null +++ b/qa/tasks/cephfs/test_mantle.py @@ -0,0 +1,111 @@ +from io import StringIO + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +import json +import logging + +log = logging.getLogger(__name__) +failure = "using old balancer; mantle failed for balancer=" +success = "mantle balancer version changed: " + +class TestMantle(CephFSTestCase): + def start_mantle(self): + self.wait_for_health_clear(timeout=30) + self.fs.set_max_mds(2) + self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30, + reject_fn=lambda v: v > 2 or v < 1) + + for m in self.fs.get_active_names(): + self.fs.mds_asok(['config', 'set', 'debug_objecter', '20'], mds_id=m) + self.fs.mds_asok(['config', 'set', 'debug_ms', '0'], mds_id=m) + self.fs.mds_asok(['config', 'set', 'debug_mds', '0'], mds_id=m) + self.fs.mds_asok(['config', 'set', 'debug_mds_balancer', '5'], mds_id=m) + + def push_balancer(self, obj, lua_code, expect): + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', obj) + self.fs.radosm(["put", obj, "-"], stdin=StringIO(lua_code)) + with self.assert_cluster_log(failure + obj + " " + expect): + log.info("run a " + obj + " balancer that expects=" + expect) + + def test_version_empty(self): + self.start_mantle() + expect = " : (2) No such file or directory" + + ret = self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer') + assert(ret == 22) # EINVAL + + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', " ") + with self.assert_cluster_log(failure + " " + expect): pass + + def test_version_not_in_rados(self): + self.start_mantle() + expect = failure + "ghost.lua : (2) No such file or directory" + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "ghost.lua") + with self.assert_cluster_log(expect): pass + + def test_balancer_invalid(self): + self.start_mantle() + expect = ": (22) Invalid argument" + + lua_code = "this is invalid lua code!" 
+ self.push_balancer("invalid.lua", lua_code, expect) + + lua_code = "BAL_LOG()" + self.push_balancer("invalid_log.lua", lua_code, expect) + + lua_code = "BAL_LOG(0)" + self.push_balancer("invalid_log_again.lua", lua_code, expect) + + def test_balancer_valid(self): + self.start_mantle() + lua_code = "BAL_LOG(0, \"test\")\nreturn {3, 4}" + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua") + self.fs.radosm(["put", "valid.lua", "-"], stdin=StringIO(lua_code)) + with self.assert_cluster_log(success + "valid.lua"): + log.info("run a valid.lua balancer") + + def test_return_invalid(self): + self.start_mantle() + expect = ": (22) Invalid argument" + + lua_code = "return \"hello\"" + self.push_balancer("string.lua", lua_code, expect) + + lua_code = "return 3" + self.push_balancer("number.lua", lua_code, expect) + + lua_code = "return {}" + self.push_balancer("dict_empty.lua", lua_code, expect) + + lua_code = "return {\"this\", \"is\", \"a\", \"test\"}" + self.push_balancer("dict_of_strings.lua", lua_code, expect) + + lua_code = "return {3, \"test\"}" + self.push_balancer("dict_of_mixed.lua", lua_code, expect) + + lua_code = "return {3}" + self.push_balancer("not_enough_numbers.lua", lua_code, expect) + + lua_code = "return {3, 4, 5, 6, 7, 8, 9}" + self.push_balancer("too_many_numbers.lua", lua_code, expect) + + def test_dead_osd(self): + self.start_mantle() + expect = " : (110) Connection timed out" + + # kill the OSDs so that the balancer pull from RADOS times out + osd_map = json.loads(self.fs.mon_manager.raw_cluster_cmd('osd', 'dump', '--format=json-pretty')) + for i in range(0, len(osd_map['osds'])): + self.fs.mon_manager.raw_cluster_cmd_result('osd', 'down', str(i)) + self.fs.mon_manager.raw_cluster_cmd_result('osd', 'out', str(i)) + + # trigger a pull from RADOS + self.fs.mon_manager.raw_cluster_cmd_result('fs', 'set', self.fs.name, 'balancer', "valid.lua") + + # make the timeout a little longer since dead OSDs spam ceph -w + with self.assert_cluster_log(failure + "valid.lua" + expect, timeout=30): + log.info("run a balancer that should timeout") + + # cleanup + for i in range(0, len(osd_map['osds'])): + self.fs.mon_manager.raw_cluster_cmd_result('osd', 'in', str(i)) diff --git a/qa/tasks/cephfs/test_mds_metrics.py b/qa/tasks/cephfs/test_mds_metrics.py new file mode 100644 index 000000000..ad877f622 --- /dev/null +++ b/qa/tasks/cephfs/test_mds_metrics.py @@ -0,0 +1,643 @@ +import os +import json +import time +import random +import logging +import errno + +from teuthology.contextutil import safe_while, MaxWhileTries +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + +class TestMDSMetrics(CephFSTestCase): + CLIENTS_REQUIRED = 2 + MDSS_REQUIRED = 3 + + TEST_DIR_PERFIX = "test_mds_metrics" + + def setUp(self): + super(TestMDSMetrics, self).setUp() + self._start_with_single_active_mds() + self._enable_mgr_stats_plugin() + + def tearDown(self): + self._disable_mgr_stats_plugin() + super(TestMDSMetrics, self).tearDown() + + def _start_with_single_active_mds(self): + curr_max_mds = self.fs.get_var('max_mds') + if curr_max_mds > 1: + self.fs.shrink(1) + + def verify_mds_metrics(self, active_mds_count=1, client_count=1, ranks=[], mul_fs=[]): + def verify_metrics_cbk(metrics): + mds_metrics = metrics['metrics'] + if not len(mds_metrics) == active_mds_count + 1: # n active mdss + delayed set + return False + fs_status = self.fs.status() + nonlocal ranks, mul_fs 
+ if not ranks: + if not mul_fs: + mul_fs = [self.fs.id] + for filesystem in mul_fs: + ranks = set([info['rank'] for info in fs_status.get_ranks(filesystem)]) + for rank in ranks: + r = mds_metrics.get("mds.{}".format(rank), None) + if not r or not len(mds_metrics['delayed_ranks']) == 0: + return False + for item in mul_fs: + key = fs_status.get_fsmap(item)['mdsmap']['fs_name'] + global_metrics = metrics['global_metrics'].get(key, {}) + client_metadata = metrics['client_metadata'].get(key, {}) + if not len(global_metrics) >= client_count or not len(client_metadata) >= client_count: + return False + return True + return verify_metrics_cbk + + def _fs_perf_stats(self, *args): + return self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "perf", "stats", *args) + + def _enable_mgr_stats_plugin(self): + return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", "stats") + + def _disable_mgr_stats_plugin(self): + return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", "stats") + + def _spread_directory_on_all_ranks(self, fscid): + fs_status = self.fs.status() + ranks = set([info['rank'] for info in fs_status.get_ranks(fscid)]) + # create a per-rank pinned directory + for rank in ranks: + dirname = "{0}_{1}".format(TestMDSMetrics.TEST_DIR_PERFIX, rank) + self.mount_a.run_shell(["mkdir", dirname]) + self.mount_a.setfattr(dirname, "ceph.dir.pin", str(rank)) + log.info("pinning directory {0} to rank {1}".format(dirname, rank)) + for i in range(16): + filename = "{0}.{1}".format("test", i) + self.mount_a.write_n_mb(os.path.join(dirname, filename), 1) + + def _do_spread_io(self, fscid): + # spread readdir I/O + self.mount_b.run_shell(["find", "."]) + + def _do_spread_io_all_clients(self, fscid): + # spread readdir I/O + self.mount_a.run_shell(["find", "."]) + self.mount_b.run_shell(["find", "."]) + + def _cleanup_test_dirs(self): + dirnames = self.mount_a.run_shell(["ls"]).stdout.getvalue() + for dirname in dirnames.split("\n"): + if dirname.startswith(TestMDSMetrics.TEST_DIR_PERFIX): + log.info("cleaning directory {}".format(dirname)) + self.mount_a.run_shell(["rm", "-rf", dirname]) + + def _get_metrics(self, verifier_callback, trials, *args): + metrics = None + done = False + with safe_while(sleep=1, tries=trials, action='wait for metrics') as proceed: + while proceed(): + metrics = json.loads(self._fs_perf_stats(*args)) + done = verifier_callback(metrics) + if done: + break + return done, metrics + + def _setup_fs(self, fs_name): + fs_a = self.mds_cluster.newfs(name=fs_name) + + self.mds_cluster.mds_restart() + + # Wait for filesystem to go healthy + fs_a.wait_for_daemons() + + # Reconfigure client auth caps + for mount in self.mounts: + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', f"client.{mount.client_id}", + 'mds', 'allow', + 'mon', 'allow r', + 'osd', f'allow rw pool={fs_a.get_data_pool_name()}') + + return fs_a + + # basic check to verify if we get back metrics from each active mds rank + + def test_metrics_from_rank(self): + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + def test_metrics_post_client_disconnection(self): + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + self.mount_a.umount_wait() + + valid, metrics = 
self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED - 1), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + def test_metrics_mds_grow(self): + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # grow the mds cluster + self.fs.grow(2) + + fscid = self.fs.id + # spread directory per rank + self._spread_directory_on_all_ranks(fscid) + + # spread some I/O + self._do_spread_io(fscid) + + # wait a bit for mgr to get updated metrics + time.sleep(5) + + # validate + valid, metrics = self._get_metrics(self.verify_mds_metrics( + active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED) , 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # cleanup test directories + self._cleanup_test_dirs() + + def test_metrics_mds_grow_and_shrink(self): + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # grow the mds cluster + self.fs.grow(2) + + fscid = self.fs.id + # spread directory per rank + self._spread_directory_on_all_ranks(fscid) + + # spread some I/O + self._do_spread_io(fscid) + + # wait a bit for mgr to get updated metrics + time.sleep(5) + + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # shrink mds cluster + self.fs.shrink(1) + + # wait a bit for mgr to get updated metrics + time.sleep(5) + + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # cleanup test directories + self._cleanup_test_dirs() + + def test_delayed_metrics(self): + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # grow the mds cluster + self.fs.grow(2) + + fscid = self.fs.id + # spread directory per rank + self._spread_directory_on_all_ranks(fscid) + + # spread some I/O + self._do_spread_io(fscid) + + # wait a bit for mgr to get updated metrics + time.sleep(5) + + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # do not give this mds any chance + delayed_rank = 1 + mds_id_rank0 = self.fs.get_rank(rank=0)['name'] + mds_id_rank1 = self.fs.get_rank(rank=1)['name'] + + self.fs.set_inter_mds_block(True, mds_id_rank0, mds_id_rank1) + + def verify_delayed_metrics(metrics): + mds_metrics = metrics['metrics'] + r = mds_metrics.get("mds.{}".format(delayed_rank), None) + if not r or not delayed_rank in mds_metrics['delayed_ranks']: + return False + return True + # validate + valid, metrics = self._get_metrics(verify_delayed_metrics, 30) + log.debug("metrics={0}".format(metrics)) + + self.assertTrue(valid) + self.fs.set_inter_mds_block(False, mds_id_rank0, mds_id_rank1) + + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + 
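An editorial aside: the delayed-rank check above boils down to membership in the 'delayed_ranks' list that the parsed `ceph fs perf stats` output carries under 'metrics'; for illustration:

    def rank_is_delayed(perf_stats, rank):
        # perf_stats is the JSON already parsed into a dict.
        return rank in perf_stats.get('metrics', {}).get('delayed_ranks', [])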
log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # cleanup test directories + self._cleanup_test_dirs() + + def test_query_mds_filter(self): + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # grow the mds cluster + self.fs.grow(2) + + fscid = self.fs.id + # spread directory per rank + self._spread_directory_on_all_ranks(fscid) + + # spread some I/O + self._do_spread_io(fscid) + + # wait a bit for mgr to get updated metrics + time.sleep(5) + + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + filtered_mds = 1 + def verify_filtered_mds_rank_metrics(metrics): + # checks if the metrics has only client_metadata and + # global_metrics filtered using --mds_rank=1 + global_metrics = metrics['global_metrics'].get(self.fs.name, {}) + client_metadata = metrics['client_metadata'].get(self.fs.name, {}) + mds_metrics = metrics['metrics'] + if len(mds_metrics) != 2 or f"mds.{filtered_mds}" not in mds_metrics: + return False + if len(global_metrics) > TestMDSMetrics.CLIENTS_REQUIRED or\ + len(client_metadata) > TestMDSMetrics.CLIENTS_REQUIRED: + return False + if len(set(global_metrics) - set(mds_metrics[f"mds.{filtered_mds}"])) or\ + len(set(client_metadata) - set(mds_metrics[f"mds.{filtered_mds}"])): + return False + return True + # initiate a new query with `--mds_rank` filter and validate if + # we get metrics *only* from that mds. + valid, metrics = self._get_metrics(verify_filtered_mds_rank_metrics, 30, + f'--mds_rank={filtered_mds}') + log.debug(f"metrics={metrics}") + self.assertTrue(valid, "Incorrect 'ceph fs perf stats' output" + f" with filter '--mds_rank={filtered_mds}'") + + def test_query_client_filter(self): + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + mds_metrics = metrics['metrics'] + # pick an random client + client = random.choice(list(mds_metrics['mds.0'].keys())) + # could have used regex to extract client id + client_id = (client.split(' ')[0]).split('.')[-1] + + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=1), 30, '--client_id={}'.format(client_id)) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + def test_query_client_ip_filter(self): + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + client_matadata = metrics['client_metadata'][self.fs.name] + # pick an random client + client = random.choice(list(client_matadata.keys())) + # get IP of client to use in filter + client_ip = client_matadata[client]['IP'] + + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=1), 30, '--client_ip={}'.format(client_ip)) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # verify IP from output with filter IP + for i in metrics['client_metadata'][self.fs.name]: + self.assertEqual(client_ip, metrics['client_metadata'][self.fs.name][i]['IP']) + + def test_query_mds_and_client_filter(self): + # validate + valid, metrics = self._get_metrics( + 
self.verify_mds_metrics(client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + # grow the mds cluster + self.fs.grow(2) + + fscid = self.fs.id + # spread directory per rank + self._spread_directory_on_all_ranks(fscid) + + # spread some I/O + self._do_spread_io_all_clients(fscid) + + # wait a bit for mgr to get updated metrics + time.sleep(5) + + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(active_mds_count=2, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + mds_metrics = metrics['metrics'] + + # pick an random client + client = random.choice(list(mds_metrics['mds.1'].keys())) + # could have used regex to extract client id + client_id = (client.split(' ')[0]).split('.')[-1] + filtered_mds = 1 + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=1, ranks=[filtered_mds]), + 30, '--mds_rank={}'.format(filtered_mds), '--client_id={}'.format(client_id)) + log.debug("metrics={0}".format(metrics)) + self.assertTrue(valid) + + def test_for_invalid_mds_rank(self): + invalid_mds_rank = "1," + # try, 'fs perf stat' command with invalid mds_rank + try: + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "perf", "stats", "--mds_rank", invalid_mds_rank) + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise + else: + raise RuntimeError("expected the 'fs perf stat' command to fail for invalid mds_rank") + + def test_for_invalid_client_id(self): + invalid_client_id = "abcd" + # try, 'fs perf stat' command with invalid client_id + try: + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "perf", "stats", "--client_id", invalid_client_id) + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise + else: + raise RuntimeError("expected the 'fs perf stat' command to fail for invalid client_id") + + def test_for_invalid_client_ip(self): + invalid_client_ip = "1.2.3" + # try, 'fs perf stat' command with invalid client_ip + try: + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "perf", "stats", "--client_ip", invalid_client_ip) + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise + else: + raise RuntimeError("expected the 'fs perf stat' command to fail for invalid client_ip") + + def test_perf_stats_stale_metrics(self): + """ + That `ceph fs perf stats` doesn't output stale metrics after the rank0 MDS failover + """ + # validate + valid, metrics = self._get_metrics(self.verify_mds_metrics( + active_mds_count=1, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug(f'metrics={metrics}') + self.assertTrue(valid) + + # mount_a and mount_b are the clients mounted for TestMDSMetrics. So get their + # entries from the global_metrics. 
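Two client-name shapes show up in this output: the per-MDS metrics are keyed by strings whose first space-separated token is client.<id>, which is what the --client_id filter tests above peel apart with split(), while global_metrics and client_metadata are keyed by plain client.<gid> names, as built from get_global_id() just below. A small sketch of both lookups; the full key format is inferred from that split logic rather than documented:

import re

def client_id_from_metrics_key(key: str) -> str:
    # same approach as the filter tests: "client.4235 ..." -> "4235"
    return key.split(' ')[0].split('.')[-1]

def client_id_from_metrics_key_re(key: str) -> str:
    # the regex alternative the test comment alludes to
    m = re.match(r'client\.(\d+)\b', key)
    if m is None:
        raise ValueError(f'unexpected metrics key: {key!r}')
    return m.group(1)

def global_metrics_for(metrics: dict, fs_name: str, gid: int) -> dict:
    # lookup shape used below: global_metrics[<fs>]["client.<gid>"]
    return metrics.get('global_metrics', {}).get(fs_name, {}).get(f'client.{gid}', {})

# hypothetical key, shaped like the ones random.choice() picks above
assert client_id_from_metrics_key('client.4235 v1:192.168.0.10:0/1234') == '4235'
assert client_id_from_metrics_key_re('client.4235 v1:192.168.0.10:0/1234') == '4235'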
+ client_a_name = f'client.{self.mount_a.get_global_id()}' + client_b_name = f'client.{self.mount_b.get_global_id()}' + + global_metrics = metrics['global_metrics'] + client_a_metrics = global_metrics[self.fs.name][client_a_name] + client_b_metrics = global_metrics[self.fs.name][client_b_name] + + # fail rank0 mds + self.fs.rank_fail(rank=0) + + # Wait for rank0 up:active state + self.fs.wait_for_state('up:active', rank=0, timeout=30) + + fscid = self.fs.id + + # spread directory per rank + self._spread_directory_on_all_ranks(fscid) + + # spread some I/O + self._do_spread_io_all_clients(fscid) + + # wait a bit for mgr to get updated metrics + time.sleep(5) + + # validate + try: + valid, metrics_new = self._get_metrics(self.verify_mds_metrics( + active_mds_count=1, client_count=TestMDSMetrics.CLIENTS_REQUIRED), 30) + log.debug(f'metrics={metrics_new}') + self.assertTrue(valid) + + client_metadata = metrics_new['client_metadata'] + client_a_metadata = client_metadata.get(self.fs.name, {}).get(client_a_name, {}) + client_b_metadata = client_metadata.get(self.fs.name, {}).get(client_b_name, {}) + + global_metrics = metrics_new['global_metrics'] + client_a_metrics_new = global_metrics.get(self.fs.name, {}).get(client_a_name, {}) + client_b_metrics_new = global_metrics.get(self.fs.name, {}).get(client_b_name, {}) + + # the metrics should be different for the test to succeed. + self.assertTrue(client_a_metadata and client_b_metadata and + client_a_metrics_new and client_b_metrics_new and + (client_a_metrics_new != client_a_metrics) and + (client_b_metrics_new != client_b_metrics), + "Invalid 'ceph fs perf stats' metrics after rank0 mds failover") + except MaxWhileTries: + raise RuntimeError("Failed to fetch 'ceph fs perf stats' metrics") + finally: + # cleanup test directories + self._cleanup_test_dirs() + + def test_client_metrics_and_metadata(self): + self.mount_a.umount_wait() + self.mount_b.umount_wait() + self.fs.delete_all_filesystems() + + self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set", + "enable_multiple", "true", "--yes-i-really-mean-it") + + # creating filesystem + fs_a = self._setup_fs(fs_name="fs1") + + # Mount a client on fs_a + self.mount_a.mount_wait(cephfs_name=fs_a.name) + self.mount_a.write_n_mb("pad.bin", 1) + self.mount_a.write_n_mb("test.bin", 2) + self.mount_a.path_to_ino("test.bin") + self.mount_a.create_files() + + # creating another filesystem + fs_b = self._setup_fs(fs_name="fs2") + + # Mount a client on fs_b + self.mount_b.mount_wait(cephfs_name=fs_b.name) + self.mount_b.write_n_mb("test.bin", 1) + self.mount_b.path_to_ino("test.bin") + self.mount_b.create_files() + + fscid_list = [fs_a.id, fs_b.id] + + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=1, mul_fs=fscid_list), 30) + log.debug(f"metrics={metrics}") + self.assertTrue(valid) + + client_metadata_a = metrics['client_metadata']['fs1'] + client_metadata_b = metrics['client_metadata']['fs2'] + + for i in client_metadata_a: + if not (client_metadata_a[i]['hostname']): + raise RuntimeError("hostname of fs1 not found!") + if not (client_metadata_a[i]['valid_metrics']): + raise RuntimeError("valid_metrics of fs1 not found!") + + for i in client_metadata_b: + if not (client_metadata_b[i]['hostname']): + raise RuntimeError("hostname of fs2 not found!") + if not (client_metadata_b[i]['valid_metrics']): + raise RuntimeError("valid_metrics of fs2 not found!") + + def test_non_existing_mds_rank(self): + def verify_filtered_metrics(metrics): + # checks if the metrics 
has non empty client_metadata and global_metrics + if metrics['client_metadata'].get(self.fs.name, {})\ + or metrics['global_metrics'].get(self.fs.name, {}): + return True + return False + + try: + # validate + filter_rank = random.randint(1, 10) + valid, metrics = self._get_metrics(verify_filtered_metrics, 30, + '--mds_rank={}'.format(filter_rank)) + log.info(f'metrics={metrics}') + self.assertFalse(valid, "Fetched 'ceph fs perf stats' metrics using nonexistent MDS rank") + except MaxWhileTries: + # success + pass + + def test_perf_stats_stale_metrics_with_multiple_filesystem(self): + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + self.mds_cluster.mon_manager.raw_cluster_cmd("fs", "flag", "set", + "enable_multiple", "true", "--yes-i-really-mean-it") + + # creating filesystem + fs_b = self._setup_fs(fs_name="fs2") + + # Mount a client on fs_b + self.mount_b.mount_wait(cephfs_name=fs_b.name) + self.mount_b.write_n_mb("test.bin", 1) + self.mount_b.path_to_ino("test.bin") + self.mount_b.create_files() + + # creating another filesystem + fs_a = self._setup_fs(fs_name="fs1") + + # Mount a client on fs_a + self.mount_a.mount_wait(cephfs_name=fs_a.name) + self.mount_a.write_n_mb("pad.bin", 1) + self.mount_a.write_n_mb("test.bin", 2) + self.mount_a.path_to_ino("test.bin") + self.mount_a.create_files() + + # validate + valid, metrics = self._get_metrics( + self.verify_mds_metrics(client_count=1, mul_fs=[fs_a.id, fs_b.id]), 30) + log.debug(f"metrics={metrics}") + self.assertTrue(valid) + + # get mounted client's entries from the global_metrics. + client_a_name = f'client.{self.mount_a.get_global_id()}' + + global_metrics = metrics['global_metrics'] + client_a_metrics = global_metrics.get("fs1", {}).get(client_a_name, {}) + + # fail active mds of fs_a + fs_a_mds = fs_a.get_active_names()[0] + self.mds_cluster.mds_fail(fs_a_mds) + fs_a.wait_for_state('up:active', rank=0, timeout=30) + + # spread directory per rank + self._spread_directory_on_all_ranks(fs_a.id) + + # spread some I/O + self._do_spread_io_all_clients(fs_a.id) + + # wait a bit for mgr to get updated metrics + time.sleep(5) + + # validate + try: + valid, metrics_new = self._get_metrics( + self.verify_mds_metrics(client_count=1, mul_fs=[fs_a.id, fs_b.id]), 30) + log.debug(f'metrics={metrics_new}') + self.assertTrue(valid) + + client_metadata = metrics_new['client_metadata'] + client_a_metadata = client_metadata.get("fs1", {}).get(client_a_name, {}) + + global_metrics = metrics_new['global_metrics'] + client_a_metrics_new = global_metrics.get("fs1", {}).get(client_a_name, {}) + + # the metrics should be different for the test to succeed. 
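The comment above is the crux of the check: after the failover and the extra I/O, the client's global_metrics entry must still exist and must have moved on from the pre-failover snapshot. Reduced to a sketch of the comparison being asserted next:

def client_entry(metrics: dict, fs_name: str, client_name: str) -> dict:
    """Fetch one client's global_metrics entry, or an empty dict if missing."""
    return metrics.get('global_metrics', {}).get(fs_name, {}).get(client_name, {})

def metrics_refreshed(before: dict, after: dict) -> bool:
    """True when both snapshots are non-empty and at least one counter changed."""
    return bool(before) and bool(after) and before != after

# e.g. metrics_refreshed(client_a_metrics, client_a_metrics_new) should hold
# once the failed-over MDS starts reporting fresh counters.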
+ self.assertTrue(client_a_metadata and client_a_metrics_new + and (client_a_metrics_new != client_a_metrics), + "Invalid 'ceph fs perf stats' metrics after" + f" rank0 mds of {fs_a.name} failover") + except MaxWhileTries: + raise RuntimeError("Failed to fetch `ceph fs perf stats` metrics") + finally: + # cleanup test directories + self._cleanup_test_dirs() + diff --git a/qa/tasks/cephfs/test_meta_injection.py b/qa/tasks/cephfs/test_meta_injection.py new file mode 100644 index 000000000..916b30a25 --- /dev/null +++ b/qa/tasks/cephfs/test_meta_injection.py @@ -0,0 +1,38 @@ +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +class TestMetaInjection(CephFSTestCase): + def test_meta_injection(self): + conf_ori = self.fs.mds_asok(['config', 'show']) + self.fs.mds_asok(['config', 'set', 'mds_log_max_segments', '1']) + self.mount_a.run_shell(["mkdir", "metadir"]) + self.mount_a.run_shell(["touch", "metadir/metafile1"]) + self.mount_a.run_shell(["touch", "metadir/metafile2"]) + self.fs.mds_asok(['flush', 'journal']) + dirino = self.mount_a.path_to_ino("metadir") + ino = self.mount_a.path_to_ino("metadir/metafile1") + + # export meta of ino + self.fs.meta_tool(['showm', '-i', str(ino), '-o', '/tmp/meta_out'], 0, True) + out = self.mount_a.run_shell(['grep', str(ino),'/tmp/meta_out']).stdout.getvalue().strip() + + # check the metadata of ino + self.assertNotEqual(out.find(u'"ino":'+ str(ino)), -1) + + # amend info of ino + self.fs.get_meta_of_fs_file(dirino, "metafile1", "/tmp/meta_obj") + self.fs.meta_tool(['amend', '-i', str(ino), '--in', '/tmp/meta_out', '--yes-i-really-really-mean-it'], 0, True) + self.fs.get_meta_of_fs_file(dirino, "metafile1", "/tmp/meta_obj_chg") + + # checkout meta_out after import it + ori_mds5 = self.mount_a.run_shell(["md5sum", "/tmp/meta_obj"]).stdout.getvalue().strip().split() + chg_mds5 = self.mount_a.run_shell(["md5sum", "/tmp/meta_obj_chg"]).stdout.getvalue().strip().split() + print(ori_mds5," ==> ", chg_mds5) + self.assertEqual(len(ori_mds5), 2) + self.assertEqual(len(chg_mds5), 2) + self.assertEqual(ori_mds5[0], chg_mds5[0]) + + self.mount_a.run_shell(["rm", "metadir", "-rf"]) + self.mount_a.run_shell(["rm", "/tmp/meta_obj"]) + self.mount_a.run_shell(["rm", "/tmp/meta_obj_chg"]) + # restore config of mds_log_max_segments + self.fs.mds_asok(['config', 'set', 'mds_log_max_segments', conf_ori["mds_log_max_segments"]]) diff --git a/qa/tasks/cephfs/test_mirroring.py b/qa/tasks/cephfs/test_mirroring.py new file mode 100644 index 000000000..c1a940e3f --- /dev/null +++ b/qa/tasks/cephfs/test_mirroring.py @@ -0,0 +1,1298 @@ +import os +import json +import errno +import logging +import random +import time + +from io import StringIO +from collections import deque + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.exceptions import CommandFailedError +from teuthology.contextutil import safe_while + +log = logging.getLogger(__name__) + +class TestMirroring(CephFSTestCase): + MDSS_REQUIRED = 5 + CLIENTS_REQUIRED = 2 + REQUIRE_BACKUP_FILESYSTEM = True + + MODULE_NAME = "mirroring" + + def setUp(self): + super(TestMirroring, self).setUp() + self.primary_fs_name = self.fs.name + self.primary_fs_id = self.fs.id + self.secondary_fs_name = self.backup_fs.name + self.secondary_fs_id = self.backup_fs.id + self.enable_mirroring_module() + + def tearDown(self): + self.disable_mirroring_module() + super(TestMirroring, self).tearDown() + + def enable_mirroring_module(self): + self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", 
TestMirroring.MODULE_NAME) + + def disable_mirroring_module(self): + self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", TestMirroring.MODULE_NAME) + + def enable_mirroring(self, fs_name, fs_id): + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "enable", fs_name) + time.sleep(10) + # verify via asok + res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}', + 'fs', 'mirror', 'status', f'{fs_name}@{fs_id}') + self.assertTrue(res['peers'] == {}) + self.assertTrue(res['snap_dirs']['dir_count'] == 0) + + def disable_mirroring(self, fs_name, fs_id): + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "disable", fs_name) + time.sleep(10) + # verify via asok + try: + self.mirror_daemon_command(f'mirror status for fs: {fs_name}', + 'fs', 'mirror', 'status', f'{fs_name}@{fs_id}') + except CommandFailedError: + pass + else: + raise RuntimeError('expected admin socket to be unavailable') + + def verify_peer_added(self, fs_name, fs_id, peer_spec, remote_fs_name=None): + # verify via asok + res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}', + 'fs', 'mirror', 'status', f'{fs_name}@{fs_id}') + peer_uuid = self.get_peer_uuid(peer_spec) + self.assertTrue(peer_uuid in res['peers']) + client_name = res['peers'][peer_uuid]['remote']['client_name'] + cluster_name = res['peers'][peer_uuid]['remote']['cluster_name'] + self.assertTrue(peer_spec == f'{client_name}@{cluster_name}') + if remote_fs_name: + self.assertTrue(self.secondary_fs_name == res['peers'][peer_uuid]['remote']['fs_name']) + else: + self.assertTrue(self.fs_name == res['peers'][peer_uuid]['remote']['fs_name']) + + def peer_add(self, fs_name, fs_id, peer_spec, remote_fs_name=None): + if remote_fs_name: + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_add", fs_name, peer_spec, remote_fs_name) + else: + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_add", fs_name, peer_spec) + time.sleep(10) + self.verify_peer_added(fs_name, fs_id, peer_spec, remote_fs_name) + + def peer_remove(self, fs_name, fs_id, peer_spec): + peer_uuid = self.get_peer_uuid(peer_spec) + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_remove", fs_name, peer_uuid) + time.sleep(10) + # verify via asok + res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}', + 'fs', 'mirror', 'status', f'{fs_name}@{fs_id}') + self.assertTrue(res['peers'] == {} and res['snap_dirs']['dir_count'] == 0) + + def bootstrap_peer(self, fs_name, client_name, site_name): + outj = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd( + "fs", "snapshot", "mirror", "peer_bootstrap", "create", fs_name, client_name, site_name)) + return outj['token'] + + def import_peer(self, fs_name, token): + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_bootstrap", "import", + fs_name, token) + + def add_directory(self, fs_name, fs_id, dir_name): + # get initial dir count + res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}', + 'fs', 'mirror', 'status', f'{fs_name}@{fs_id}') + dir_count = res['snap_dirs']['dir_count'] + log.debug(f'initial dir_count={dir_count}') + + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "add", fs_name, dir_name) + + time.sleep(10) + # verify via asok + res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}', + 'fs', 'mirror', 'status', f'{fs_name}@{fs_id}') + new_dir_count = res['snap_dirs']['dir_count'] + 
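Every "verify via asok" step in these helpers is the same admin-socket query against the cephfs-mirror daemon. Using the socket path returned by get_daemon_admin_socket() further down, the check can be reproduced by hand; the fs name and id below are placeholders:

import json
import subprocess

def mirror_status(fs_name: str, fs_id: int,
                  asok: str = "/var/run/ceph/cephfs-mirror.asok") -> dict:
    """Equivalent of the 'fs mirror status <fs>@<id>' asok query used above."""
    out = subprocess.check_output(
        ["ceph", "--admin-daemon", asok,
         "fs", "mirror", "status", f"{fs_name}@{fs_id}"])
    return json.loads(out)

# Shape of what add_directory() asserts: dir_count must grow after
# `ceph fs snapshot mirror add <fs> <path>`.
# before = mirror_status("cephfs", 1)["snap_dirs"]["dir_count"]
# ... run: ceph fs snapshot mirror add cephfs /d0 ...
# after = mirror_status("cephfs", 1)["snap_dirs"]["dir_count"]
# assert after > before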
log.debug(f'new dir_count={new_dir_count}') + self.assertTrue(new_dir_count > dir_count) + + def remove_directory(self, fs_name, fs_id, dir_name): + # get initial dir count + res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}', + 'fs', 'mirror', 'status', f'{fs_name}@{fs_id}') + dir_count = res['snap_dirs']['dir_count'] + log.debug(f'initial dir_count={dir_count}') + + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "remove", fs_name, dir_name) + + time.sleep(10) + # verify via asok + res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}', + 'fs', 'mirror', 'status', f'{fs_name}@{fs_id}') + new_dir_count = res['snap_dirs']['dir_count'] + log.debug(f'new dir_count={new_dir_count}') + self.assertTrue(new_dir_count < dir_count) + + def check_peer_status(self, fs_name, fs_id, peer_spec, dir_name, expected_snap_name, + expected_snap_count): + peer_uuid = self.get_peer_uuid(peer_spec) + res = self.mirror_daemon_command(f'peer status for fs: {fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{fs_name}@{fs_id}', peer_uuid) + self.assertTrue(dir_name in res) + self.assertTrue(res[dir_name]['last_synced_snap']['name'] == expected_snap_name) + self.assertTrue(res[dir_name]['snaps_synced'] == expected_snap_count) + + def check_peer_status_deleted_snap(self, fs_name, fs_id, peer_spec, dir_name, + expected_delete_count): + peer_uuid = self.get_peer_uuid(peer_spec) + res = self.mirror_daemon_command(f'peer status for fs: {fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{fs_name}@{fs_id}', peer_uuid) + self.assertTrue(dir_name in res) + self.assertTrue(res[dir_name]['snaps_deleted'] == expected_delete_count) + + def check_peer_status_renamed_snap(self, fs_name, fs_id, peer_spec, dir_name, + expected_rename_count): + peer_uuid = self.get_peer_uuid(peer_spec) + res = self.mirror_daemon_command(f'peer status for fs: {fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{fs_name}@{fs_id}', peer_uuid) + self.assertTrue(dir_name in res) + self.assertTrue(res[dir_name]['snaps_renamed'] == expected_rename_count) + + def check_peer_snap_in_progress(self, fs_name, fs_id, + peer_spec, dir_name, snap_name): + peer_uuid = self.get_peer_uuid(peer_spec) + res = self.mirror_daemon_command(f'peer status for fs: {fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{fs_name}@{fs_id}', peer_uuid) + self.assertTrue('syncing' == res[dir_name]['state']) + self.assertTrue(res[dir_name]['current_sycning_snap']['name'] == snap_name) + + def verify_snapshot(self, dir_name, snap_name): + snap_list = self.mount_b.ls(path=f'{dir_name}/.snap') + self.assertTrue(snap_name in snap_list) + + source_res = self.mount_a.dir_checksum(path=f'{dir_name}/.snap/{snap_name}', + follow_symlinks=True) + log.debug(f'source snapshot checksum {snap_name} {source_res}') + + dest_res = self.mount_b.dir_checksum(path=f'{dir_name}/.snap/{snap_name}', + follow_symlinks=True) + log.debug(f'destination snapshot checksum {snap_name} {dest_res}') + self.assertTrue(source_res == dest_res) + + def verify_failed_directory(self, fs_name, fs_id, peer_spec, dir_name): + peer_uuid = self.get_peer_uuid(peer_spec) + res = self.mirror_daemon_command(f'peer status for fs: {fs_name}', + 'fs', 'mirror', 'peer', 'status', + f'{fs_name}@{fs_id}', peer_uuid) + self.assertTrue('failed' == res[dir_name]['state']) + + def get_peer_uuid(self, peer_spec): + status = self.fs.status() + fs_map = status.get_fsmap_byname(self.primary_fs_name) + peers = fs_map['mirror_info']['peers'] + for peer_uuid, mirror_info in peers.items(): + 
client_name = mirror_info['remote']['client_name'] + cluster_name = mirror_info['remote']['cluster_name'] + remote_peer_spec = f'{client_name}@{cluster_name}' + if peer_spec == remote_peer_spec: + return peer_uuid + return None + + def get_daemon_admin_socket(self): + """overloaded by teuthology override (fs/mirror/clients/mirror.yaml)""" + return "/var/run/ceph/cephfs-mirror.asok" + + def get_mirror_daemon_pid(self): + """pid file overloaded in fs/mirror/clients/mirror.yaml""" + return self.mount_a.run_shell(['cat', '/var/run/ceph/cephfs-mirror.pid']).stdout.getvalue().strip() + + def get_mirror_rados_addr(self, fs_name, fs_id): + """return the rados addr used by cephfs-mirror instance""" + res = self.mirror_daemon_command(f'mirror status for fs: {fs_name}', + 'fs', 'mirror', 'status', f'{fs_name}@{fs_id}') + return res['rados_inst'] + + def mirror_daemon_command(self, cmd_label, *args): + asok_path = self.get_daemon_admin_socket() + try: + # use mount_a's remote to execute command + p = self.mount_a.client_remote.run(args= + ['ceph', '--admin-daemon', asok_path] + list(args), + stdout=StringIO(), stderr=StringIO(), timeout=30, + check_status=True, label=cmd_label) + p.wait() + except CommandFailedError as ce: + log.warn(f'mirror daemon command with label "{cmd_label}" failed: {ce}') + raise + res = p.stdout.getvalue().strip() + log.debug(f'command returned={res}') + return json.loads(res) + + def get_mirror_daemon_status(self): + daemon_status = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "daemon", "status")) + log.debug(f'daemon_status: {daemon_status}') + # running a single mirror daemon is supported + status = daemon_status[0] + log.debug(f'status: {status}') + return status + + def test_basic_mirror_commands(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_mirror_peer_commands(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + + # add peer + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + # remove peer + self.peer_remove(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph") + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_mirror_disable_with_peer(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + + # add peer + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_matching_peer(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + + try: + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph") + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise RuntimeError('invalid errno when adding a matching remote peer') + else: + raise RuntimeError('adding a peer matching local spec should fail') + + # verify via asok -- nothing should get added + res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + self.assertTrue(res['peers'] == {}) + + # and explicitly specifying the spec (via filesystem name) should fail too + try: + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.primary_fs_name) + except CommandFailedError as ce: + if ce.exitstatus 
!= errno.EINVAL: + raise RuntimeError('invalid errno when adding a matching remote peer') + else: + raise RuntimeError('adding a peer matching local spec should fail') + + # verify via asok -- nothing should get added + res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + self.assertTrue(res['peers'] == {}) + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_mirror_peer_add_existing(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + + # add peer + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # adding the same peer should be idempotent + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # remove peer + self.peer_remove(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph") + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_peer_commands_with_mirroring_disabled(self): + # try adding peer when mirroring is not enabled + try: + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise RuntimeError(-errno.EINVAL, 'incorrect error code when adding a peer') + else: + raise RuntimeError(-errno.EINVAL, 'expected peer_add to fail') + + # try removing peer + try: + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_remove", self.primary_fs_name, 'dummy-uuid') + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise RuntimeError(-errno.EINVAL, 'incorrect error code when removing a peer') + else: + raise RuntimeError(-errno.EINVAL, 'expected peer_remove to fail') + + def test_add_directory_with_mirroring_disabled(self): + # try adding a directory when mirroring is not enabled + try: + self.add_directory(self.primary_fs_name, self.primary_fs_id, "/d1") + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise RuntimeError(-errno.EINVAL, 'incorrect error code when adding a directory') + else: + raise RuntimeError(-errno.EINVAL, 'expected directory add to fail') + + def test_directory_commands(self): + self.mount_a.run_shell(["mkdir", "d1"]) + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1') + try: + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1') + except CommandFailedError as ce: + if ce.exitstatus != errno.EEXIST: + raise RuntimeError(-errno.EINVAL, 'incorrect error code when re-adding a directory') + else: + raise RuntimeError(-errno.EINVAL, 'expected directory add to fail') + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1') + try: + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1') + except CommandFailedError as ce: + if ce.exitstatus not in (errno.ENOENT, errno.EINVAL): + raise RuntimeError(-errno.EINVAL, 'incorrect error code when re-deleting a directory') + else: + raise RuntimeError(-errno.EINVAL, 'expected directory removal to fail') + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.mount_a.run_shell(["rmdir", "d1"]) + + def test_add_relative_directory_path(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + try: + self.add_directory(self.primary_fs_name, self.primary_fs_id, 
'./d1') + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise RuntimeError(-errno.EINVAL, 'incorrect error code when adding a relative path dir') + else: + raise RuntimeError(-errno.EINVAL, 'expected directory add to fail') + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_add_directory_path_normalization(self): + self.mount_a.run_shell(["mkdir", "-p", "d1/d2/d3"]) + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1/d2/d3') + def check_add_command_failure(dir_path): + try: + self.add_directory(self.primary_fs_name, self.primary_fs_id, dir_path) + except CommandFailedError as ce: + if ce.exitstatus != errno.EEXIST: + raise RuntimeError(-errno.EINVAL, 'incorrect error code when re-adding a directory') + else: + raise RuntimeError(-errno.EINVAL, 'expected directory add to fail') + + # everything points for /d1/d2/d3 + check_add_command_failure('/d1/d2/././././././d3') + check_add_command_failure('/d1/d2/././././././d3//////') + check_add_command_failure('/d1/d2/../d2/././././d3') + check_add_command_failure('/././././d1/./././d2/./././d3//////') + check_add_command_failure('/./d1/./d2/./d3/../../../d1/d2/d3') + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.mount_a.run_shell(["rm", "-rf", "d1"]) + + def test_add_ancestor_and_child_directory(self): + self.mount_a.run_shell(["mkdir", "-p", "d1/d2/d3"]) + self.mount_a.run_shell(["mkdir", "-p", "d1/d4"]) + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1/d2/') + def check_add_command_failure(dir_path): + try: + self.add_directory(self.primary_fs_name, self.primary_fs_id, dir_path) + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise RuntimeError(-errno.EINVAL, 'incorrect error code when adding a directory') + else: + raise RuntimeError(-errno.EINVAL, 'expected directory add to fail') + + # cannot add ancestors or a subtree for an existing directory + check_add_command_failure('/') + check_add_command_failure('/d1') + check_add_command_failure('/d1/d2/d3') + + # obviously, one can add a non-ancestor or non-subtree + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1/d4/') + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.mount_a.run_shell(["rm", "-rf", "d1"]) + + def test_cephfs_mirror_blocklist(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + + # add peer + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + peers_1 = set(res['peers']) + + # fetch rados address for blacklist check + rados_inst = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id) + + # simulate non-responding mirror daemon by sending SIGSTOP + pid = self.get_mirror_daemon_pid() + log.debug(f'SIGSTOP to cephfs-mirror pid {pid}') + self.mount_a.run_shell(['kill', '-SIGSTOP', pid]) + + # wait for blocklist timeout -- the manager module would blocklist + # the mirror daemon + time.sleep(40) + + # wake up the mirror daemon -- at this point, the daemon should know + # that it has been blocklisted + log.debug('SIGCONT to cephfs-mirror') + self.mount_a.run_shell(['kill', '-SIGCONT', pid]) + + # check if the rados addr is 
blocklisted + self.assertTrue(self.mds_cluster.is_addr_blocklisted(rados_inst)) + + # wait enough so that the mirror daemon restarts blocklisted instances + time.sleep(40) + rados_inst_new = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id) + + # and we should get a new rados instance + self.assertTrue(rados_inst != rados_inst_new) + + # along with peers that were added + res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + peers_2 = set(res['peers']) + self.assertTrue(peers_1, peers_2) + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_stats(self): + log.debug('reconfigure client auth caps') + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name())) + + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + # create a bunch of files in a directory to snap + self.mount_a.run_shell(["mkdir", "d0"]) + self.mount_a.create_n_files('d0/file', 50, sync=True) + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # take a snapshot + self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) + + time.sleep(30) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 'snap0', 1) + self.verify_snapshot('d0', 'snap0') + + # some more IO + self.mount_a.run_shell(["mkdir", "d0/d00"]) + self.mount_a.run_shell(["mkdir", "d0/d01"]) + + self.mount_a.create_n_files('d0/d00/more_file', 20, sync=True) + self.mount_a.create_n_files('d0/d01/some_more_file', 75, sync=True) + + # take another snapshot + self.mount_a.run_shell(["mkdir", "d0/.snap/snap1"]) + + time.sleep(60) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 'snap1', 2) + self.verify_snapshot('d0', 'snap1') + + # delete a snapshot + self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"]) + + time.sleep(10) + snap_list = self.mount_b.ls(path='d0/.snap') + self.assertTrue('snap0' not in snap_list) + self.check_peer_status_deleted_snap(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 1) + + # rename a snapshot + self.mount_a.run_shell(["mv", "d0/.snap/snap1", "d0/.snap/snap2"]) + + time.sleep(10) + snap_list = self.mount_b.ls(path='d0/.snap') + self.assertTrue('snap1' not in snap_list) + self.assertTrue('snap2' in snap_list) + self.check_peer_status_renamed_snap(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 1) + + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_cancel_sync(self): + log.debug('reconfigure client auth caps') + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), 
self.backup_fs.get_data_pool_name())) + + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + # create a bunch of files in a directory to snap + self.mount_a.run_shell(["mkdir", "d0"]) + for i in range(8): + filename = f'file.{i}' + self.mount_a.write_n_mb(os.path.join('d0', filename), 1024) + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # take a snapshot + self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) + + time.sleep(10) + self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 'snap0') + + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + + snap_list = self.mount_b.ls(path='d0/.snap') + self.assertTrue('snap0' not in snap_list) + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_restart_sync_on_blocklist(self): + log.debug('reconfigure client auth caps') + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name())) + + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + # create a bunch of files in a directory to snap + self.mount_a.run_shell(["mkdir", "d0"]) + for i in range(8): + filename = f'file.{i}' + self.mount_a.write_n_mb(os.path.join('d0', filename), 1024) + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # fetch rados address for blacklist check + rados_inst = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id) + + # take a snapshot + self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) + + time.sleep(10) + self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 'snap0') + + # simulate non-responding mirror daemon by sending SIGSTOP + pid = self.get_mirror_daemon_pid() + log.debug(f'SIGSTOP to cephfs-mirror pid {pid}') + self.mount_a.run_shell(['kill', '-SIGSTOP', pid]) + + # wait for blocklist timeout -- the manager module would blocklist + # the mirror daemon + time.sleep(40) + + # wake up the mirror daemon -- at this point, the daemon should know + # that it has been blocklisted + log.debug('SIGCONT to cephfs-mirror') + self.mount_a.run_shell(['kill', '-SIGCONT', pid]) + + # check if the rados addr is blocklisted + self.assertTrue(self.mds_cluster.is_addr_blocklisted(rados_inst)) + + time.sleep(500) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 'snap0', expected_snap_count=1) + self.verify_snapshot('d0', 'snap0') + + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_failed_sync_with_correction(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + 
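The blocklist tests above simulate an unresponsive mirror daemon by SIGSTOP-ing its pid (read from the pid file) so that the mgr module blocklists it, then SIGCONT-ing it. The same dance with the standard library, plus a rough CLI-based stand-in for is_addr_blocklisted(); matching against `ceph osd blocklist ls` output is an assumption here, not something this file does:

import os
import signal
import subprocess

def pause_daemon(pid: int) -> None:
    os.kill(pid, signal.SIGSTOP)   # daemon stops responding; mgr should blocklist it

def resume_daemon(pid: int) -> None:
    os.kill(pid, signal.SIGCONT)   # daemon wakes up and learns it was blocklisted

def addr_is_blocklisted(rados_inst: str) -> bool:
    # rados_inst looks like "<ip>:<port>/<nonce>"; assumed check against the CLI listing
    listing = subprocess.check_output(["ceph", "osd", "blocklist", "ls"], text=True)
    return rados_inst in listing

# pid = int(open("/var/run/ceph/cephfs-mirror.pid").read().strip())
# pause_daemon(pid)   # wait ~40s for the blocklist timeout, then resume_daemon(pid)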
self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # add a non-existent directory for synchronization + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + + # wait for the mirror daemon to mark the directory as failed + time.sleep(120) + self.verify_failed_directory(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0') + + # create the directory + self.mount_a.run_shell(["mkdir", "d0"]) + self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) + + # wait for correction + time.sleep(120) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 'snap0', 1) + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_service_daemon_status(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + time.sleep(30) + status = self.get_mirror_daemon_status() + + # assumption for this test: mirroring enabled for a single filesystem w/ a single + # peer + + # we have not added any directories + peer = status['filesystems'][0]['peers'][0] + self.assertEquals(status['filesystems'][0]['directory_count'], 0) + self.assertEquals(peer['stats']['failure_count'], 0) + self.assertEquals(peer['stats']['recovery_count'], 0) + + # add a non-existent directory for synchronization -- check if it is reported + # in daemon stats + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + + time.sleep(120) + status = self.get_mirror_daemon_status() + # we added one + peer = status['filesystems'][0]['peers'][0] + self.assertEquals(status['filesystems'][0]['directory_count'], 1) + # failure count should be reflected + self.assertEquals(peer['stats']['failure_count'], 1) + self.assertEquals(peer['stats']['recovery_count'], 0) + + # create the directory; the mirror daemon should recover + self.mount_a.run_shell(["mkdir", "d0"]) + + time.sleep(120) + status = self.get_mirror_daemon_status() + peer = status['filesystems'][0]['peers'][0] + self.assertEquals(status['filesystems'][0]['directory_count'], 1) + # failure and recovery count should be reflected + self.assertEquals(peer['stats']['failure_count'], 1) + self.assertEquals(peer['stats']['recovery_count'], 1) + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_mirroring_init_failure(self): + """Test mirror daemon init failure""" + + # disable the mgr mirroring plugin as it would try to load the dir map + # when mirroring is enabled for a filesystem (and throw up errors in + # the logs) + self.disable_mirroring_module() + + # enable mirroring through the mon interface -- this should result in the mirror daemon + # failing to enable mirroring due to absence of the `cephfs_mirror` index object.
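"Failing to enable mirroring" shows up as the daemon's asok status eventually reporting state == "failed", which the code below detects by polling inside teuthology's safe_while(). A plain-Python sketch of that bounded polling, with status_fn standing in for the asok query:

import time

def wait_for_failed_state(status_fn, tries: int = 10, sleep: int = 5) -> bool:
    """Poll status_fn() until it reports state == 'failed' or attempts run out."""
    for _ in range(tries):
        try:
            if status_fn().get("state") == "failed":
                return True
        except Exception:
            pass          # the asok may not even be answering yet
        time.sleep(sleep)
    return False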
+ self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "enable", self.primary_fs_name) + + with safe_while(sleep=5, tries=10, action='wait for failed state') as proceed: + while proceed(): + try: + # verify via asok + res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + if not 'state' in res: + return + self.assertTrue(res['state'] == "failed") + return True + except: + pass + + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "disable", self.primary_fs_name) + time.sleep(10) + # verify via asok + try: + self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + except CommandFailedError: + pass + else: + raise RuntimeError('expected admin socket to be unavailable') + + def test_mirroring_init_failure_with_recovery(self): + """Test if the mirror daemon can recover from a init failure""" + + # disable mgr mirroring plugin as it would try to load dir map on + # on mirroring enabled for a filesystem (an throw up erorrs in + # the logs) + self.disable_mirroring_module() + + # enable mirroring through mon interface -- this should result in the mirror daemon + # failing to enable mirroring due to absence of `cephfs_mirror` index object. + + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "enable", self.primary_fs_name) + # need safe_while since non-failed status pops up as mirroring is restarted + # internally in mirror daemon. + with safe_while(sleep=5, tries=20, action='wait for failed state') as proceed: + while proceed(): + try: + # verify via asok + res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + if not 'state' in res: + return + self.assertTrue(res['state'] == "failed") + return True + except: + pass + + # create the index object and check daemon recovery + try: + p = self.mount_a.client_remote.run(args=['rados', '-p', self.fs.metadata_pool_name, 'create', 'cephfs_mirror'], + stdout=StringIO(), stderr=StringIO(), timeout=30, + check_status=True, label="create index object") + p.wait() + except CommandFailedError as ce: + log.warn(f'mirror daemon command to create mirror index object failed: {ce}') + raise + time.sleep(30) + res = self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + self.assertTrue(res['peers'] == {}) + self.assertTrue(res['snap_dirs']['dir_count'] == 0) + + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "mirror", "disable", self.primary_fs_name) + time.sleep(10) + # verify via asok + try: + self.mirror_daemon_command(f'mirror status for fs: {self.primary_fs_name}', + 'fs', 'mirror', 'status', f'{self.primary_fs_name}@{self.primary_fs_id}') + except CommandFailedError: + pass + else: + raise RuntimeError('expected admin socket to be unavailable') + + def test_cephfs_mirror_peer_bootstrap(self): + """Test importing peer bootstrap token""" + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + + # create a bootstrap token for the peer + bootstrap_token = self.bootstrap_peer(self.secondary_fs_name, "client.mirror_peer_bootstrap", "site-remote") + + # import the peer via bootstrap token + self.import_peer(self.primary_fs_name, bootstrap_token) + time.sleep(10) + self.verify_peer_added(self.primary_fs_name, 
self.primary_fs_id, "client.mirror_peer_bootstrap@site-remote", + self.secondary_fs_name) + + # verify via peer_list interface + peer_uuid = self.get_peer_uuid("client.mirror_peer_bootstrap@site-remote") + res = json.loads(self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "peer_list", self.primary_fs_name)) + self.assertTrue(peer_uuid in res) + self.assertTrue('mon_host' in res[peer_uuid] and res[peer_uuid]['mon_host'] != '') + + # remove peer + self.peer_remove(self.primary_fs_name, self.primary_fs_id, "client.mirror_peer_bootstrap@site-remote") + # disable mirroring + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_symlink_sync(self): + log.debug('reconfigure client auth caps') + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name())) + + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + # create a bunch of files w/ symbolic links in a directory to snap + self.mount_a.run_shell(["mkdir", "d0"]) + self.mount_a.create_n_files('d0/file', 10, sync=True) + self.mount_a.run_shell(["ln", "-s", "./file_0", "d0/sym_0"]) + self.mount_a.run_shell(["ln", "-s", "./file_1", "d0/sym_1"]) + self.mount_a.run_shell(["ln", "-s", "./file_2", "d0/sym_2"]) + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # take a snapshot + self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) + + time.sleep(30) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 'snap0', 1) + self.verify_snapshot('d0', 'snap0') + + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_with_parent_snapshot(self): + """Test snapshot synchronization with parent directory snapshots""" + self.mount_a.run_shell(["mkdir", "-p", "d0/d1/d2/d3"]) + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0/d1/d2/d3') + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # take a snapshot + self.mount_a.run_shell(["mkdir", "d0/d1/d2/d3/.snap/snap0"]) + + time.sleep(30) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0/d1/d2/d3', 'snap0', 1) + + # create snapshots in parent directories + self.mount_a.run_shell(["mkdir", "d0/.snap/snap_d0"]) + self.mount_a.run_shell(["mkdir", "d0/d1/.snap/snap_d1"]) + self.mount_a.run_shell(["mkdir", "d0/d1/d2/.snap/snap_d2"]) + + # try syncing more snapshots + self.mount_a.run_shell(["mkdir", "d0/d1/d2/d3/.snap/snap1"]) + time.sleep(30) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0/d1/d2/d3', 'snap1', 2) + + self.mount_a.run_shell(["rmdir", "d0/d1/d2/d3/.snap/snap0"]) + self.mount_a.run_shell(["rmdir", "d0/d1/d2/d3/.snap/snap1"]) + time.sleep(15) + self.check_peer_status_deleted_snap(self.primary_fs_name, 
self.primary_fs_id, + "client.mirror_remote@ceph", '/d0/d1/d2/d3', 2) + + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0/d1/d2/d3') + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_remove_on_stall(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + + # fetch rados address for blacklist check + rados_inst = self.get_mirror_rados_addr(self.primary_fs_name, self.primary_fs_id) + + # simulate non-responding mirror daemon by sending SIGSTOP + pid = self.get_mirror_daemon_pid() + log.debug(f'SIGSTOP to cephfs-mirror pid {pid}') + self.mount_a.run_shell(['kill', '-SIGSTOP', pid]) + + # wait for blocklist timeout -- the manager module would blocklist + # the mirror daemon + time.sleep(40) + + # make sure the rados addr is blocklisted + self.assertTrue(self.mds_cluster.is_addr_blocklisted(rados_inst)) + + # now we are sure that there are no "active" mirror daemons -- add a directory path. + dir_path_p = "/d0/d1" + dir_path = "/d0/d1/d2" + + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "add", self.primary_fs_name, dir_path) + + time.sleep(10) + # this uses an undocumented interface to get dirpath map state + res_json = self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "dirmap", self.primary_fs_name, dir_path) + res = json.loads(res_json) + # there are no mirror daemons + self.assertTrue(res['state'], 'stalled') + + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "remove", self.primary_fs_name, dir_path) + + time.sleep(10) + try: + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "dirmap", self.primary_fs_name, dir_path) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise RuntimeError('invalid errno when checking dirmap status for non-existent directory') + else: + raise RuntimeError('incorrect errno when checking dirmap state for non-existent directory') + + # adding a parent directory should be allowed + self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "add", self.primary_fs_name, dir_path_p) + + time.sleep(10) + # however, this directory path should get stalled too + res_json = self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "dirmap", self.primary_fs_name, dir_path_p) + res = json.loads(res_json) + # there are no mirror daemons + self.assertTrue(res['state'], 'stalled') + + # wake up the mirror daemon -- at this point, the daemon should know + # that it has been blocklisted + log.debug('SIGCONT to cephfs-mirror') + self.mount_a.run_shell(['kill', '-SIGCONT', pid]) + + # wait for restart mirror on blocklist + time.sleep(60) + res_json = self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", "snapshot", "mirror", "dirmap", self.primary_fs_name, dir_path_p) + res = json.loads(res_json) + # there are no mirror daemons + self.assertTrue(res['state'], 'mapped') + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_incremental_sync(self): + """ Test incremental snapshot synchronization (based on mtime differences).""" + log.debug('reconfigure client auth caps') + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name())) + log.debug(f'mounting filesystem 
{self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + repo = 'ceph-qa-suite' + repo_dir = 'ceph_repo' + repo_path = f'{repo_dir}/{repo}' + + def clone_repo(): + self.mount_a.run_shell([ + 'git', 'clone', '--branch', 'giant', + f'http://github.com/ceph/{repo}', repo_path]) + + def exec_git_cmd(cmd_list): + self.mount_a.run_shell(['git', '--git-dir', f'{self.mount_a.mountpoint}/{repo_path}/.git', *cmd_list]) + + self.mount_a.run_shell(["mkdir", repo_dir]) + clone_repo() + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{repo_path}') + self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_a']) + + # full copy, takes time + time.sleep(500) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", f'/{repo_path}', 'snap_a', 1) + self.verify_snapshot(repo_path, 'snap_a') + + # create some diff + num = random.randint(5, 20) + log.debug(f'resetting to HEAD~{num}') + exec_git_cmd(["reset", "--hard", f'HEAD~{num}']) + + self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_b']) + # incremental copy, should be fast + time.sleep(180) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", f'/{repo_path}', 'snap_b', 2) + self.verify_snapshot(repo_path, 'snap_b') + + # diff again, this time back to HEAD + log.debug('resetting to HEAD') + exec_git_cmd(["pull"]) + + self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_c']) + # incremental copy, should be fast + time.sleep(180) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", f'/{repo_path}', 'snap_c', 3) + self.verify_snapshot(repo_path, 'snap_c') + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_incremental_sync_with_type_mixup(self): + """ Test incremental snapshot synchronization with file type changes. + + The same filename exist as a different type in subsequent snapshot. + This verifies if the mirror daemon can identify file type mismatch and + sync snapshots. 
+ + \ snap_0 snap_1 snap_2 snap_3 + \----------------------------------------------- + file_x | reg sym dir reg + | + file_y | dir reg sym dir + | + file_z | sym dir reg sym + """ + log.debug('reconfigure client auth caps') + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name())) + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + typs = deque(['reg', 'dir', 'sym']) + def cleanup_and_create_with_type(dirname, fnames): + self.mount_a.run_shell_payload(f"rm -rf {dirname}/*") + fidx = 0 + for t in typs: + fname = f'{dirname}/{fnames[fidx]}' + log.debug(f'file: {fname} type: {t}') + if t == 'reg': + self.mount_a.run_shell(["touch", fname]) + self.mount_a.write_file(fname, data=fname) + elif t == 'dir': + self.mount_a.run_shell(["mkdir", fname]) + elif t == 'sym': + # verify ELOOP in mirror daemon + self.mount_a.run_shell(["ln", "-s", "..", fname]) + fidx += 1 + + def verify_types(dirname, fnames, snap_name): + tidx = 0 + for fname in fnames: + t = self.mount_b.run_shell_payload(f"stat -c %F {dirname}/.snap/{snap_name}/{fname}").stdout.getvalue().strip() + if typs[tidx] == 'reg': + self.assertEquals('regular file', t) + elif typs[tidx] == 'dir': + self.assertEquals('directory', t) + elif typs[tidx] == 'sym': + self.assertEquals('symbolic link', t) + tidx += 1 + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + self.mount_a.run_shell(["mkdir", "d0"]) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + + fnames = ['file_x', 'file_y', 'file_z'] + turns = 0 + while turns != len(typs): + snapname = f'snap_{turns}' + cleanup_and_create_with_type('d0', fnames) + self.mount_a.run_shell(['mkdir', f'd0/.snap/{snapname}']) + time.sleep(30) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', snapname, turns+1) + verify_types('d0', fnames, snapname) + # next type + typs.rotate(1) + turns += 1 + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_sync_with_purged_snapshot(self): + """Test snapshot synchronization in midst of snapshot deletes. + + Deleted the previous snapshot when the mirror daemon is figuring out + incremental differences between current and previous snaphot. The + mirror daemon should identify the purge and switch to using remote + comparison to sync the snapshot (in the next iteration of course). 
+ """ + + log.debug('reconfigure client auth caps') + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name())) + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + repo = 'ceph-qa-suite' + repo_dir = 'ceph_repo' + repo_path = f'{repo_dir}/{repo}' + + def clone_repo(): + self.mount_a.run_shell([ + 'git', 'clone', '--branch', 'giant', + f'http://github.com/ceph/{repo}', repo_path]) + + def exec_git_cmd(cmd_list): + self.mount_a.run_shell(['git', '--git-dir', f'{self.mount_a.mountpoint}/{repo_path}/.git', *cmd_list]) + + self.mount_a.run_shell(["mkdir", repo_dir]) + clone_repo() + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + self.add_directory(self.primary_fs_name, self.primary_fs_id, f'/{repo_path}') + self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_a']) + + # full copy, takes time + time.sleep(500) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", f'/{repo_path}', 'snap_a', 1) + self.verify_snapshot(repo_path, 'snap_a') + + # create some diff + num = random.randint(60, 100) + log.debug(f'resetting to HEAD~{num}') + exec_git_cmd(["reset", "--hard", f'HEAD~{num}']) + + self.mount_a.run_shell(['mkdir', f'{repo_path}/.snap/snap_b']) + + time.sleep(15) + self.mount_a.run_shell(['rmdir', f'{repo_path}/.snap/snap_a']) + + # incremental copy but based on remote dir_root + time.sleep(300) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", f'/{repo_path}', 'snap_b', 2) + self.verify_snapshot(repo_path, 'snap_b') + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_peer_add_primary(self): + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # try adding the primary file system as a peer to secondary file + # system + try: + self.peer_add(self.secondary_fs_name, self.secondary_fs_id, "client.mirror_remote@ceph", self.primary_fs_name) + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise RuntimeError('invalid errno when adding a primary file system') + else: + raise RuntimeError('adding peer should fail') + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_cephfs_mirror_cancel_mirroring_and_readd(self): + """ + Test adding a directory path for synchronization post removal of already added directory paths + + ... to ensure that synchronization of the newly added directory path functions + as expected. Note that we schedule three (3) directories for mirroring to ensure + that all replayer threads (3 by default) in the mirror daemon are busy. 
+ """ + log.debug('reconfigure client auth caps') + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_b.client_id), + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', 'allow rw pool={0}, allow rw pool={1}'.format( + self.backup_fs.get_data_pool_name(), self.backup_fs.get_data_pool_name())) + + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + # create a bunch of files in a directory to snap + self.mount_a.run_shell(["mkdir", "d0"]) + self.mount_a.run_shell(["mkdir", "d1"]) + self.mount_a.run_shell(["mkdir", "d2"]) + for i in range(4): + filename = f'file.{i}' + self.mount_a.write_n_mb(os.path.join('d0', filename), 1024) + self.mount_a.write_n_mb(os.path.join('d1', filename), 1024) + self.mount_a.write_n_mb(os.path.join('d2', filename), 1024) + + log.debug('enabling mirroring') + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + log.debug('adding directory paths') + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1') + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d2') + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + # take snapshots + log.debug('taking snapshots') + self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) + self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"]) + self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"]) + + time.sleep(10) + log.debug('checking snap in progress') + self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 'snap0') + self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d1', 'snap0') + self.check_peer_snap_in_progress(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d2', 'snap0') + + log.debug('removing directories 1') + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + log.debug('removing directories 2') + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d1') + log.debug('removing directories 3') + self.remove_directory(self.primary_fs_name, self.primary_fs_id, '/d2') + + log.debug('removing snapshots') + self.mount_a.run_shell(["rmdir", "d0/.snap/snap0"]) + self.mount_a.run_shell(["rmdir", "d1/.snap/snap0"]) + self.mount_a.run_shell(["rmdir", "d2/.snap/snap0"]) + + for i in range(4): + filename = f'file.{i}' + log.debug(f'deleting {filename}') + self.mount_a.run_shell(["rm", "-f", os.path.join('d0', filename)]) + self.mount_a.run_shell(["rm", "-f", os.path.join('d1', filename)]) + self.mount_a.run_shell(["rm", "-f", os.path.join('d2', filename)]) + + log.debug('creating new files...') + self.mount_a.create_n_files('d0/file', 50, sync=True) + self.mount_a.create_n_files('d1/file', 50, sync=True) + self.mount_a.create_n_files('d2/file', 50, sync=True) + + log.debug('adding directory paths') + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d0') + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d1') + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/d2') + + log.debug('creating new snapshots...') + self.mount_a.run_shell(["mkdir", "d0/.snap/snap0"]) + self.mount_a.run_shell(["mkdir", "d1/.snap/snap0"]) + self.mount_a.run_shell(["mkdir", "d2/.snap/snap0"]) + + time.sleep(60) + 
self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d0', 'snap0', 1) + self.verify_snapshot('d0', 'snap0') + + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d1', 'snap0', 1) + self.verify_snapshot('d1', 'snap0') + + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/d2', 'snap0', 1) + self.verify_snapshot('d2', 'snap0') + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + + def test_local_and_remote_dir_root_mode(self): + log.debug('reconfigure client auth caps') + cid = self.mount_b.client_id + data_pool = self.backup_fs.get_data_pool_name() + self.mds_cluster.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', f"client.{cid}", + 'mds', 'allow rw', + 'mon', 'allow r', + 'osd', f"allow rw pool={data_pool}, allow rw pool={data_pool}") + + log.debug(f'mounting filesystem {self.secondary_fs_name}') + self.mount_b.umount_wait() + self.mount_b.mount_wait(cephfs_name=self.secondary_fs_name) + + self.mount_a.run_shell(["mkdir", "l1"]) + self.mount_a.run_shell(["mkdir", "l1/.snap/snap0"]) + self.mount_a.run_shell(["chmod", "go-rwx", "l1"]) + + self.enable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.add_directory(self.primary_fs_name, self.primary_fs_id, '/l1') + self.peer_add(self.primary_fs_name, self.primary_fs_id, "client.mirror_remote@ceph", self.secondary_fs_name) + + time.sleep(60) + self.check_peer_status(self.primary_fs_name, self.primary_fs_id, + "client.mirror_remote@ceph", '/l1', 'snap0', 1) + + mode_local = self.mount_a.run_shell(["stat", "--format=%A", "l1"]).stdout.getvalue().strip() + mode_remote = self.mount_b.run_shell(["stat", "--format=%A", "l1"]).stdout.getvalue().strip() + + self.assertTrue(mode_local == mode_remote, f"mode mismatch, local mode: {mode_local}, remote mode: {mode_remote}") + + self.disable_mirroring(self.primary_fs_name, self.primary_fs_id) + self.mount_a.run_shell(["rmdir", "l1/.snap/snap0"]) + self.mount_a.run_shell(["rmdir", "l1"]) diff --git a/qa/tasks/cephfs/test_misc.py b/qa/tasks/cephfs/test_misc.py new file mode 100644 index 000000000..8b48dee69 --- /dev/null +++ b/qa/tasks/cephfs/test_misc.py @@ -0,0 +1,640 @@ +from io import StringIO + +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.exceptions import CommandFailedError +from textwrap import dedent +from threading import Thread +import errno +import platform +import time +import json +import logging +import os +import re + +log = logging.getLogger(__name__) + +class TestMisc(CephFSTestCase): + CLIENTS_REQUIRED = 2 + + def test_statfs_on_deleted_fs(self): + """ + That statfs does not cause monitors to SIGSEGV after fs deletion. + """ + + self.mount_b.umount_wait() + self.mount_a.run_shell_payload("stat -f .") + self.fs.delete_all_filesystems() + # This will hang either way, run in background. 
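# wait=False returns the process handle without blocking, and check_status=False
# keeps the later forced unmount (which aborts the stuck statfs) from being
# reported as a command failure when p.wait() is called below.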
+ p = self.mount_a.run_shell_payload("stat -f .", wait=False, timeout=60, check_status=False) + time.sleep(30) + self.assertFalse(p.finished) + # the process is stuck in uninterruptible sleep, just kill the mount + self.mount_a.umount_wait(force=True) + p.wait() + + def test_fuse_mount_on_already_mounted_path(self): + if platform.system() != "Linux": + self.skipTest("Require Linux platform") + + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client") + + # Try to mount already mounted path + # expecting EBUSY error + try: + mount_cmd = ['sudo'] + self.mount_a._mount_bin + [self.mount_a.hostfs_mntpt] + self.mount_a.client_remote.run(args=mount_cmd, stderr=StringIO(), + stdout=StringIO(), timeout=60, omit_sudo=False) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EBUSY) + else: + self.fail("Expected EBUSY") + + def test_getattr_caps(self): + """ + Check if MDS recognizes the 'mask' parameter of open request. + The parameter allows client to request caps when opening file + """ + + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Require FUSE client") + + # Enable debug. Client will requests CEPH_CAP_XATTR_SHARED + # on lookup/open + self.mount_b.umount_wait() + self.set_conf('client', 'client debug getattr caps', 'true') + self.mount_b.mount_wait() + + # create a file and hold it open. MDS will issue CEPH_CAP_EXCL_* + # to mount_a + p = self.mount_a.open_background("testfile") + self.mount_b.wait_for_visible("testfile") + + # this triggers a lookup request and an open request. The debug + # code will check if lookup/open reply contains xattrs + self.mount_b.run_shell(["cat", "testfile"]) + + self.mount_a.kill_background(p) + + def test_root_rctime(self): + """ + Check that the root inode has a non-default rctime on startup. 
+ """ + + t = time.time() + rctime = self.mount_a.getfattr(".", "ceph.dir.rctime") + log.info("rctime = {}".format(rctime)) + self.assertGreaterEqual(float(rctime), t - 10) + + def test_fs_new(self): + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + data_pool_name = self.fs.get_data_pool_name() + + self.fs.fail() + + self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name, + '--yes-i-really-mean-it') + + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', + self.fs.metadata_pool_name, + self.fs.metadata_pool_name, + '--yes-i-really-really-mean-it') + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + self.fs.metadata_pool_name, + '--pg_num_min', str(self.fs.pg_num_min)) + + # insert a garbage object + self.fs.radosm(["put", "foo", "-"], stdin=StringIO("bar")) + + def get_pool_df(fs, name): + try: + return fs.get_pool_df(name)['objects'] > 0 + except RuntimeError: + return False + + self.wait_until_true(lambda: get_pool_df(self.fs, self.fs.metadata_pool_name), timeout=30) + + try: + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name, + self.fs.metadata_pool_name, + data_pool_name) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.EINVAL) + else: + raise AssertionError("Expected EINVAL") + + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name, + self.fs.metadata_pool_name, + data_pool_name, "--force") + + self.fs.mon_manager.raw_cluster_cmd('fs', 'fail', self.fs.name) + + self.fs.mon_manager.raw_cluster_cmd('fs', 'rm', self.fs.name, + '--yes-i-really-mean-it') + + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'delete', + self.fs.metadata_pool_name, + self.fs.metadata_pool_name, + '--yes-i-really-really-mean-it') + self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'create', + self.fs.metadata_pool_name, + '--pg_num_min', str(self.fs.pg_num_min)) + self.fs.mon_manager.raw_cluster_cmd('fs', 'new', self.fs.name, + self.fs.metadata_pool_name, + data_pool_name, + '--allow_dangerous_metadata_overlay') + + def test_cap_revoke_nonresponder(self): + """ + Check that a client is evicted if it has not responded to cap revoke + request for configured number of seconds. + """ + session_timeout = self.fs.get_var("session_timeout") + eviction_timeout = session_timeout / 2.0 + + self.fs.mds_asok(['config', 'set', 'mds_cap_revoke_eviction_timeout', + str(eviction_timeout)]) + + cap_holder = self.mount_a.open_background() + + # Wait for the file to be visible from another client, indicating + # that mount_a has completed its network ops + self.mount_b.wait_for_visible() + + # Simulate client death + self.mount_a.suspend_netns() + + try: + # The waiter should get stuck waiting for the capability + # held on the MDS by the now-dead client A + cap_waiter = self.mount_b.write_background() + + a = time.time() + time.sleep(eviction_timeout) + cap_waiter.wait() + b = time.time() + cap_waited = b - a + log.info("cap_waiter waited {0}s".format(cap_waited)) + + # check if the cap is transferred before session timeout kicked in. + # this is a good enough check to ensure that the client got evicted + # by the cap auto evicter rather than transitioning to stale state + # and then getting evicted. 
+ self.assertLess(cap_waited, session_timeout, + "Capability handover took {0}, expected less than {1}".format( + cap_waited, session_timeout + )) + + self.assertTrue(self.mds_cluster.is_addr_blocklisted( + self.mount_a.get_global_addr())) + self.mount_a._kill_background(cap_holder) + finally: + self.mount_a.resume_netns() + + def test_filtered_df(self): + pool_name = self.fs.get_data_pool_name() + raw_df = self.fs.get_pool_df(pool_name) + raw_avail = float(raw_df["max_avail"]) + out = self.fs.mon_manager.raw_cluster_cmd('osd', 'pool', 'get', + pool_name, 'size', + '-f', 'json-pretty') + _ = json.loads(out) + + proc = self.mount_a.run_shell(['df', '.']) + output = proc.stdout.getvalue() + fs_avail = output.split('\n')[1].split()[3] + fs_avail = float(fs_avail) * 1024 + + ratio = raw_avail / fs_avail + assert 0.9 < ratio < 1.1 + + def test_dump_inode(self): + info = self.fs.mds_asok(['dump', 'inode', '1']) + assert(info['path'] == "/") + + def test_dump_inode_hexademical(self): + self.mount_a.run_shell(["mkdir", "-p", "foo"]) + ino = self.mount_a.path_to_ino("foo") + assert type(ino) is int + info = self.fs.mds_asok(['dump', 'inode', hex(ino)]) + assert info['path'] == "/foo" + + def test_fs_lsflags(self): + """ + Check that the lsflags displays the default state and the new state of flags + """ + # Set some flags + self.fs.set_joinable(False) + self.fs.set_allow_new_snaps(False) + self.fs.set_allow_standby_replay(True) + + lsflags = json.loads(self.fs.mon_manager.raw_cluster_cmd('fs', 'lsflags', + self.fs.name, + "--format=json-pretty")) + self.assertEqual(lsflags["joinable"], False) + self.assertEqual(lsflags["allow_snaps"], False) + self.assertEqual(lsflags["allow_multimds_snaps"], True) + self.assertEqual(lsflags["allow_standby_replay"], True) + + def _test_sync_stuck_for_around_5s(self, dir_path, file_sync=False): + self.mount_a.run_shell(["mkdir", dir_path]) + + sync_dir_pyscript = dedent(""" + import os + + path = "{path}" + dfd = os.open(path, os.O_DIRECTORY) + os.fsync(dfd) + os.close(dfd) + """.format(path=dir_path)) + + # run create/delete directories and test the sync time duration + for i in range(300): + for j in range(5): + self.mount_a.run_shell(["mkdir", os.path.join(dir_path, f"{i}_{j}")]) + start = time.time() + if file_sync: + self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript]) + else: + self.mount_a.run_shell(["sync"]) + duration = time.time() - start + log.info(f"sync mkdir i = {i}, duration = {duration}") + self.assertLess(duration, 4) + + for j in range(5): + self.mount_a.run_shell(["rm", "-rf", os.path.join(dir_path, f"{i}_{j}")]) + start = time.time() + if file_sync: + self.mount_a.run_shell(['python3', '-c', sync_dir_pyscript]) + else: + self.mount_a.run_shell(["sync"]) + duration = time.time() - start + log.info(f"sync rmdir i = {i}, duration = {duration}") + self.assertLess(duration, 4) + + self.mount_a.run_shell(["rm", "-rf", dir_path]) + + def test_filesystem_sync_stuck_for_around_5s(self): + """ + To check whether the fsync will be stuck to wait for the mdlog to be + flushed for at most 5 seconds. + """ + + dir_path = "filesystem_sync_do_not_wait_mdlog_testdir" + self._test_sync_stuck_for_around_5s(dir_path) + + def test_file_sync_stuck_for_around_5s(self): + """ + To check whether the filesystem sync will be stuck to wait for the + mdlog to be flushed for at most 5 seconds. 
+ """ + + dir_path = "file_sync_do_not_wait_mdlog_testdir" + self._test_sync_stuck_for_around_5s(dir_path, True) + + def test_file_filesystem_sync_crash(self): + """ + To check whether the kernel crashes when doing the file/filesystem sync. + """ + + stop_thread = False + dir_path = "file_filesystem_sync_crash_testdir" + self.mount_a.run_shell(["mkdir", dir_path]) + + def mkdir_rmdir_thread(mount, path): + #global stop_thread + + log.info(" mkdir_rmdir_thread starting...") + num = 0 + while not stop_thread: + n = num + m = num + for __ in range(10): + mount.run_shell(["mkdir", os.path.join(path, f"{n}")]) + n += 1 + for __ in range(10): + mount.run_shell(["rm", "-rf", os.path.join(path, f"{m}")]) + m += 1 + num += 10 + log.info(" mkdir_rmdir_thread stopped") + + def filesystem_sync_thread(mount, path): + #global stop_thread + + log.info(" filesystem_sync_thread starting...") + while not stop_thread: + mount.run_shell(["sync"]) + log.info(" filesystem_sync_thread stopped") + + def file_sync_thread(mount, path): + #global stop_thread + + log.info(" file_sync_thread starting...") + pyscript = dedent(""" + import os + + path = "{path}" + dfd = os.open(path, os.O_DIRECTORY) + os.fsync(dfd) + os.close(dfd) + """.format(path=path)) + + while not stop_thread: + mount.run_shell(['python3', '-c', pyscript]) + log.info(" file_sync_thread stopped") + + td1 = Thread(target=mkdir_rmdir_thread, args=(self.mount_a, dir_path,)) + td2 = Thread(target=filesystem_sync_thread, args=(self.mount_a, dir_path,)) + td3 = Thread(target=file_sync_thread, args=(self.mount_a, dir_path,)) + + td1.start() + td2.start() + td3.start() + time.sleep(1200) # run 20 minutes + stop_thread = True + td1.join() + td2.join() + td3.join() + self.mount_a.run_shell(["rm", "-rf", dir_path]) + + def test_dump_inmemory_log_on_client_eviction(self): + """ + That the in-memory logs are dumped during a client eviction event. + """ + self.fs.mds_asok(['config', 'set', 'debug_mds', '1/10']) + self.fs.mds_asok(['config', 'set', 'mds_extraordinary_events_dump_interval', '1']) + mount_a_client_id = self.mount_a.get_global_id() + infos = self.fs.status().get_ranks(self.fs.id) + + #evict the client + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id]) + time.sleep(10) #wait for 10 seconds for the logs dumping to complete. + + #The client is evicted, so unmount it. + try: + self.mount_a.umount_wait(require_clean=True, timeout=30) + except: + pass #continue with grepping the log + + eviction_log = f"Evicting (\(and blocklisting\) )?client session {mount_a_client_id} \(.+:.+/.+\)" + search_range = "/^--- begin dump of recent events ---$/,/^--- end dump of recent events ---$/p" + for info in infos: + mds_id = info['name'] + try: + remote = self.fs.mon_manager.find_remote('mds', mds_id) + out = remote.run(args=["sed", + "-n", + "{0}".format(search_range), + f"/var/log/ceph/{self.mount_a.cluster_name}-mds.{mds_id}.log"], + stdout=StringIO(), timeout=30) + except: + continue #continue with the next info + if out.stdout and re.search(eviction_log, out.stdout.getvalue().strip()): + return + self.assertTrue(False, "Failed to dump in-memory logs during client eviction") + + def test_dump_inmemory_log_on_missed_beacon_ack_from_monitors(self): + """ + That the in-memory logs are dumped when the mds misses beacon ACKs from monitors. 
+ """ + self.fs.mds_asok(['config', 'set', 'debug_mds', '1/10']) + self.fs.mds_asok(['config', 'set', 'mds_extraordinary_events_dump_interval', '1']) + try: + mons = json.loads(self.fs.mon_manager.raw_cluster_cmd('mon', 'dump', '-f', 'json'))['mons'] + except: + self.assertTrue(False, "Error fetching monitors") + + #Freeze all monitors + for mon in mons: + mon_name = mon['name'] + log.info(f'Sending STOP to mon {mon_name}') + self.fs.mon_manager.signal_mon(mon_name, 19) + + time.sleep(10) #wait for 10 seconds to get the in-memory logs dumped + + #Unfreeze all monitors + for mon in mons: + mon_name = mon['name'] + log.info(f'Sending CONT to mon {mon_name}') + self.fs.mon_manager.signal_mon(mon_name, 18) + + missed_beacon_ack_log = "missed beacon ack from the monitors" + search_range = "/^--- begin dump of recent events ---$/,/^--- end dump of recent events ---$/p" + for info in self.fs.status().get_ranks(self.fs.id): + mds_id = info['name'] + try: + remote = self.fs.mon_manager.find_remote('mds', mds_id) + out = remote.run(args=["sed", + "-n", + "{0}".format(search_range), + f"/var/log/ceph/{self.mount_a.cluster_name}-mds.{mds_id}.log"], + stdout=StringIO(), timeout=30) + except: + continue #continue with the next info + if out.stdout and (missed_beacon_ack_log in out.stdout.getvalue().strip()): + return + self.assertTrue(False, "Failed to dump in-memory logs during missed beacon ack") + + def test_dump_inmemory_log_on_missed_internal_heartbeats(self): + """ + That the in-memory logs are dumped when the mds misses internal heartbeats. + """ + self.fs.mds_asok(['config', 'set', 'debug_mds', '1/10']) + self.fs.mds_asok(['config', 'set', 'mds_heartbeat_grace', '1']) + self.fs.mds_asok(['config', 'set', 'mds_extraordinary_events_dump_interval', '1']) + try: + mons = json.loads(self.fs.mon_manager.raw_cluster_cmd('mon', 'dump', '-f', 'json'))['mons'] + except: + self.assertTrue(False, "Error fetching monitors") + + #Freeze all monitors + for mon in mons: + mon_name = mon['name'] + log.info(f'Sending STOP to mon {mon_name}') + self.fs.mon_manager.signal_mon(mon_name, 19) + + time.sleep(10) #wait for 10 seconds to get the in-memory logs dumped + + #Unfreeze all monitors + for mon in mons: + mon_name = mon['name'] + log.info(f'Sending CONT to mon {mon_name}') + self.fs.mon_manager.signal_mon(mon_name, 18) + + missed_internal_heartbeat_log = \ + "Skipping beacon heartbeat to monitors \(last acked .+s ago\); MDS internal heartbeat is not healthy!" 
+ search_range = "/^--- begin dump of recent events ---$/,/^--- end dump of recent events ---$/p" + for info in self.fs.status().get_ranks(self.fs.id): + mds_id = info['name'] + try: + remote = self.fs.mon_manager.find_remote('mds', mds_id) + out = remote.run(args=["sed", + "-n", + "{0}".format(search_range), + f"/var/log/ceph/{self.mount_a.cluster_name}-mds.{mds_id}.log"], + stdout=StringIO(), timeout=30) + except: + continue #continue with the next info + if out.stdout and re.search(missed_internal_heartbeat_log, out.stdout.getvalue().strip()): + return + self.assertTrue(False, "Failed to dump in-memory logs during missed internal heartbeat") + + def _session_client_ls(self, cmd): + mount_a_client_id = self.mount_a.get_global_id() + info = self.fs.rank_asok(cmd) + mount_a_mountpoint = self.mount_a.mountpoint + mount_b_mountpoint = self.mount_b.mountpoint + self.assertIsNotNone(info) + for i in range(0, len(info)): + self.assertIn(info[i]["client_metadata"]["mount_point"], + [mount_a_mountpoint, mount_b_mountpoint]) + info = self.fs.rank_asok(cmd + [f"id={mount_a_client_id}"]) + self.assertEqual(len(info), 1) + self.assertEqual(info[0]["id"], mount_a_client_id) + self.assertEqual(info[0]["client_metadata"]["mount_point"], mount_a_mountpoint) + info = self.fs.rank_asok(cmd + ['--cap_dump']) + for i in range(0, len(info)): + self.assertIn("caps", info[i]) + + def test_session_ls(self): + self._session_client_ls(['session', 'ls']) + + def test_client_ls(self): + self._session_client_ls(['client', 'ls']) + +class TestCacheDrop(CephFSTestCase): + CLIENTS_REQUIRED = 1 + + def _run_drop_cache_cmd(self, timeout=None): + result = None + args = ["cache", "drop"] + if timeout is not None: + args.append(str(timeout)) + result = self.fs.rank_tell(args) + return result + + def _setup(self, max_caps=20, threshold=400): + # create some files + self.mount_a.create_n_files("dc-dir/dc-file", 1000, sync=True) + + # Reduce this so the MDS doesn't rkcall the maximum for simple tests + self.fs.rank_asok(['config', 'set', 'mds_recall_max_caps', str(max_caps)]) + self.fs.rank_asok(['config', 'set', 'mds_recall_max_decay_threshold', str(threshold)]) + + def test_drop_cache_command(self): + """ + Basic test for checking drop cache command. + Confirm it halts without a timeout. + Note that the cache size post trimming is not checked here. + """ + mds_min_caps_per_client = int(self.fs.get_config("mds_min_caps_per_client")) + self._setup() + result = self._run_drop_cache_cmd() + self.assertEqual(result['client_recall']['return_code'], 0) + self.assertEqual(result['flush_journal']['return_code'], 0) + # It should take at least 1 second + self.assertGreater(result['duration'], 1) + self.assertGreaterEqual(result['trim_cache']['trimmed'], 1000-2*mds_min_caps_per_client) + + def test_drop_cache_command_timeout(self): + """ + Basic test for checking drop cache command. + Confirm recall halts early via a timeout. + Note that the cache size post trimming is not checked here. + """ + self._setup() + result = self._run_drop_cache_cmd(timeout=10) + self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT) + self.assertEqual(result['flush_journal']['return_code'], 0) + self.assertGreater(result['duration'], 10) + self.assertGreaterEqual(result['trim_cache']['trimmed'], 100) # we did something, right? + + def test_drop_cache_command_dead_timeout(self): + """ + Check drop cache command with non-responding client using tell + interface. Note that the cache size post trimming is not checked + here. 
+ """ + self._setup() + self.mount_a.suspend_netns() + # Note: recall is subject to the timeout. The journal flush will + # be delayed due to the client being dead. + result = self._run_drop_cache_cmd(timeout=5) + self.assertEqual(result['client_recall']['return_code'], -errno.ETIMEDOUT) + self.assertEqual(result['flush_journal']['return_code'], 0) + self.assertGreater(result['duration'], 5) + self.assertLess(result['duration'], 120) + # Note: result['trim_cache']['trimmed'] may be >0 because dropping the + # cache now causes the Locker to drive eviction of stale clients (a + # stale session will be autoclosed at mdsmap['session_timeout']). The + # particular operation causing this is journal flush which causes the + # MDS to wait wait for cap revoke. + #self.assertEqual(0, result['trim_cache']['trimmed']) + self.mount_a.resume_netns() + + def test_drop_cache_command_dead(self): + """ + Check drop cache command with non-responding client using tell + interface. Note that the cache size post trimming is not checked + here. + """ + self._setup() + self.mount_a.suspend_netns() + result = self._run_drop_cache_cmd() + self.assertEqual(result['client_recall']['return_code'], 0) + self.assertEqual(result['flush_journal']['return_code'], 0) + self.assertGreater(result['duration'], 5) + self.assertLess(result['duration'], 120) + # Note: result['trim_cache']['trimmed'] may be >0 because dropping the + # cache now causes the Locker to drive eviction of stale clients (a + # stale session will be autoclosed at mdsmap['session_timeout']). The + # particular operation causing this is journal flush which causes the + # MDS to wait wait for cap revoke. + self.mount_a.resume_netns() + +class TestSkipReplayInoTable(CephFSTestCase): + MDSS_REQUIRED = 1 + CLIENTS_REQUIRED = 1 + + def test_alloc_cinode_assert(self): + """ + Test alloc CInode assert. + + See: https://tracker.ceph.com/issues/52280 + """ + + # Create a directory and the mds will journal this and then crash + self.mount_a.run_shell(["rm", "-rf", "test_alloc_ino"]) + self.mount_a.run_shell(["mkdir", "test_alloc_ino"]) + + status = self.fs.status() + rank0 = self.fs.get_rank(rank=0, status=status) + + self.fs.mds_asok(['config', 'set', 'mds_kill_skip_replaying_inotable', "true"]) + # This will make the MDS crash, since we only have one MDS in the + # cluster and without the "wait=False" it will stuck here forever. + self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir1"], wait=False) + + # sleep 10 seconds to make sure the journal logs are flushed and + # the mds crashes + time.sleep(10) + + # Now set the mds config to skip replaying the inotable + self.fs.set_ceph_conf('mds', 'mds_inject_skip_replaying_inotable', True) + self.fs.set_ceph_conf('mds', 'mds_wipe_sessions', True) + + self.fs.mds_restart() + # sleep 5 seconds to make sure the mds tell command won't stuck + time.sleep(5) + self.fs.wait_for_daemons() + + self.delete_mds_coredump(rank0['name']); + + self.mount_a.run_shell(["mkdir", "test_alloc_ino/dir2"]) + + ls_out = set(self.mount_a.ls("test_alloc_ino/")) + self.assertEqual(ls_out, set({"dir1", "dir2"})) diff --git a/qa/tasks/cephfs/test_multifs_auth.py b/qa/tasks/cephfs/test_multifs_auth.py new file mode 100644 index 000000000..c9ea5f528 --- /dev/null +++ b/qa/tasks/cephfs/test_multifs_auth.py @@ -0,0 +1,297 @@ +""" +Test for Ceph clusters with multiple FSs. 
+""" +import logging + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from tasks.cephfs.caps_helper import CapTester + +from teuthology.exceptions import CommandFailedError + + +log = logging.getLogger(__name__) + + +class TestMultiFS(CephFSTestCase): + client_id = 'testuser' + client_name = 'client.' + client_id + # one dedicated for each FS + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 2 + + def setUp(self): + super(TestMultiFS, self).setUp() + + self.captester = CapTester() + + # we might have it - the client - if the same cluster was used for a + # different vstart_runner.py run. + self.run_cluster_cmd(f'auth rm {self.client_name}') + + self.fs1 = self.fs + self.fs2 = self.mds_cluster.newfs(name='cephfs2', create=True) + + # we'll reassign caps to client.1 so that it can operate with cephfs2 + self.run_cluster_cmd(f'auth caps client.{self.mount_b.client_id} mon ' + f'"allow r" osd "allow rw ' + f'pool={self.fs2.get_data_pool_name()}" mds allow') + self.mount_b.remount(cephfs_name=self.fs2.name) + + +class TestMONCaps(TestMultiFS): + + def test_moncap_with_one_fs_names(self): + moncap = f'allow r fsname={self.fs1.name}' + self.create_client(self.client_id, moncap) + + self.captester.run_mon_cap_tests(self.fs1, self.client_id) + + def test_moncap_with_multiple_fs_names(self): + moncap = (f'allow r fsname={self.fs1.name}, ' + f'allow r fsname={self.fs2.name}') + self.create_client(self.client_id, moncap) + + self.captester.run_mon_cap_tests(self.fs1, self.client_id) + + def test_moncap_with_blanket_allow(self): + moncap = 'allow r' + self.create_client(self.client_id, moncap) + + self.captester.run_mon_cap_tests(self.fs1, self.client_id) + + +#TODO: add tests for capsecs 'p' and 's'. +class TestMDSCaps(TestMultiFS): + """ + 0. Have 2 FSs on Ceph cluster. + 1. Create new files on both FSs. + 2. Create a new client that has authorization for both FSs. + 3. Remount the current mounts with this new client. + 4. Test read and write on both FSs. 
+ """ + def setUp(self): + super(self.__class__, self).setUp() + self.mounts = (self.mount_a, self.mount_b) + + def test_rw_with_fsname_and_no_path_in_cap(self): + PERM = 'rw' + self.captester.write_test_files(self.mounts) + keyring_paths = self._create_client(PERM, fsname=True) + self.remount_with_new_client(keyring_paths) + + self.captester.run_mds_cap_tests(PERM) + + def test_r_with_fsname_and_no_path_in_cap(self): + PERM = 'r' + self.captester.write_test_files(self.mounts) + keyring_paths = self._create_client(PERM, fsname=True) + self.remount_with_new_client(keyring_paths) + + self.captester.run_mds_cap_tests(PERM) + + def test_rw_with_fsname_and_path_in_cap(self): + PERM, CEPHFS_MNTPT = 'rw', 'dir1' + self.mount_a.run_shell(f'mkdir {CEPHFS_MNTPT}') + self.mount_b.run_shell(f'mkdir {CEPHFS_MNTPT}') + self.captester.write_test_files(self.mounts, CEPHFS_MNTPT) + keyring_paths = self._create_client(PERM, fsname=True) + self.remount_with_new_client(keyring_paths, CEPHFS_MNTPT) + + self.captester.run_mds_cap_tests(PERM, CEPHFS_MNTPT) + + def test_r_with_fsname_and_path_in_cap(self): + PERM, CEPHFS_MNTPT = 'r', 'dir1' + self.mount_a.run_shell(f'mkdir {CEPHFS_MNTPT}') + self.mount_b.run_shell(f'mkdir {CEPHFS_MNTPT}') + self.captester.write_test_files(self.mounts, CEPHFS_MNTPT) + keyring_paths = self._create_client(PERM, fsname=True) + self.remount_with_new_client(keyring_paths, CEPHFS_MNTPT) + + self.captester.run_mds_cap_tests(PERM, CEPHFS_MNTPT) + + # XXX: this tests the backward compatibility; "allow rw path=<dir1>" is + # treated as "allow rw fsname=* path=<dir1>" + def test_rw_with_no_fsname_and_path_in_cap(self): + PERM, CEPHFS_MNTPT = 'rw', 'dir1' + self.mount_a.run_shell(f'mkdir {CEPHFS_MNTPT}') + self.mount_b.run_shell(f'mkdir {CEPHFS_MNTPT}') + self.captester.write_test_files(self.mounts, CEPHFS_MNTPT) + keyring_paths = self._create_client(PERM) + self.remount_with_new_client(keyring_paths, CEPHFS_MNTPT) + + self.captester.run_mds_cap_tests(PERM, CEPHFS_MNTPT) + + # XXX: this tests the backward compatibility; "allow r path=<dir1>" is + # treated as "allow r fsname=* path=<dir1>" + def test_r_with_no_fsname_and_path_in_cap(self): + PERM, CEPHFS_MNTPT = 'r', 'dir1' + self.mount_a.run_shell(f'mkdir {CEPHFS_MNTPT}') + self.mount_b.run_shell(f'mkdir {CEPHFS_MNTPT}') + self.captester.write_test_files(self.mounts, CEPHFS_MNTPT) + keyring_paths = self._create_client(PERM) + self.remount_with_new_client(keyring_paths, CEPHFS_MNTPT) + + self.captester.run_mds_cap_tests(PERM, CEPHFS_MNTPT) + + def test_rw_with_no_fsname_and_no_path(self): + PERM = 'rw' + self.captester.write_test_files(self.mounts) + keyring_paths = self._create_client(PERM) + self.remount_with_new_client(keyring_paths) + + self.captester.run_mds_cap_tests(PERM) + + def test_r_with_no_fsname_and_no_path(self): + PERM = 'r' + self.captester.write_test_files(self.mounts) + keyring_paths = self._create_client(PERM) + self.remount_with_new_client(keyring_paths) + + self.captester.run_mds_cap_tests(PERM) + + def tearDown(self): + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + super(type(self), self).tearDown() + + def generate_caps(self, perm, fsname, cephfs_mntpt): + moncap = 'allow r' + osdcap = (f'allow {perm} tag cephfs data={self.fs1.name}, ' + f'allow {perm} tag cephfs data={self.fs2.name}') + + if fsname: + if cephfs_mntpt == '/': + mdscap = (f'allow {perm} fsname={self.fs1.name}, ' + f'allow {perm} fsname={self.fs2.name}') + else: + mdscap = (f'allow {perm} fsname={self.fs1.name} ' + f'path=/{cephfs_mntpt}, ' 
+ f'allow {perm} fsname={self.fs2.name} ' + f'path=/{cephfs_mntpt}') + else: + if cephfs_mntpt == '/': + mdscap = f'allow {perm}' + else: + mdscap = f'allow {perm} path=/{cephfs_mntpt}' + + return moncap, osdcap, mdscap + + def _create_client(self, perm, fsname=False, cephfs_mntpt='/'): + moncap, osdcap, mdscap = self.generate_caps(perm, fsname, + cephfs_mntpt) + + keyring = self.create_client(self.client_id, moncap, osdcap, mdscap) + keyring_paths = [] + for mount_x in self.mounts: + keyring_paths.append(mount_x.client_remote.mktemp(data=keyring)) + + return keyring_paths + + def remount_with_new_client(self, keyring_paths, cephfs_mntpt='/'): + if isinstance(cephfs_mntpt, str) and cephfs_mntpt != '/' : + cephfs_mntpt = '/' + cephfs_mntpt + + self.mount_a.remount(client_id=self.client_id, + client_keyring_path=keyring_paths[0], + client_remote=self.mount_a.client_remote, + cephfs_name=self.fs1.name, + cephfs_mntpt=cephfs_mntpt, + hostfs_mntpt=self.mount_a.hostfs_mntpt, + wait=True) + self.mount_b.remount(client_id=self.client_id, + client_keyring_path=keyring_paths[1], + client_remote=self.mount_b.client_remote, + cephfs_name=self.fs2.name, + cephfs_mntpt=cephfs_mntpt, + hostfs_mntpt=self.mount_b.hostfs_mntpt, + wait=True) + + +class TestClientsWithoutAuth(TestMultiFS): + + def setUp(self): + super(TestClientsWithoutAuth, self).setUp() + + # TODO: When MON and OSD caps for a Ceph FS are assigned to a + # client but MDS caps are not, mount.ceph prints "permission + # denied". But when MON caps are not assigned and MDS and OSD + # caps are, mount.ceph prints "no mds server or cluster laggy" + # instead of "permission denied". + # + # Before uncommenting the following line a fix would be required + # for latter case to change "no mds server is up or the cluster is + # laggy" to "permission denied". + self.kernel_errmsgs = ('permission denied', 'no mds server is up or ' + 'the cluster is laggy', 'no such file or ' + 'directory', + 'input/output error') + + # TODO: When MON and OSD caps are assigned for a Ceph FS to a + # client but MDS caps are not, ceph-fuse prints "operation not + # permitted". But when MON caps are not assigned and MDS and OSD + # caps are, ceph-fuse prints "no such file or directory" instead + # of "operation not permitted". + # + # Before uncommenting the following line a fix would be required + # for the latter case to change "no such file or directory" to + # "operation not permitted". + #self.assertIn('operation not permitted', retval[2].lower()) + self.fuse_errmsgs = ('operation not permitted', 'no such file or ' + 'directory') + + if 'kernel' in str(type(self.mount_a)).lower(): + self.errmsgs = self.kernel_errmsgs + elif 'fuse' in str(type(self.mount_a)).lower(): + self.errmsgs = self.fuse_errmsgs + else: + raise RuntimeError('strange, the client was neither based on ' + 'kernel nor FUSE.') + + def check_that_mount_failed_for_right_reason(self, stderr): + stderr = stderr.lower() + for errmsg in self.errmsgs: + if errmsg in stderr: + break + else: + raise AssertionError('can\'t find expected set of words in the ' + f'stderr\nself.errmsgs - {self.errmsgs}\n' + f'stderr - {stderr}') + + def test_mount_all_caps_absent(self): + # setup part... + keyring = self.fs1.authorize(self.client_id, ('/', 'rw')) + keyring_path = self.mount_a.client_remote.mktemp(data=keyring) + + # mount the FS for which client has no auth... 
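# (the keyring above only authorizes self.fs1, so remounting with
# cephfs_name=self.fs2.name is expected to fail; check_status=False lets the
# failure be returned for inspection by the assertions below instead of raising)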
+ retval = self.mount_a.remount(client_id=self.client_id, + client_keyring_path=keyring_path, + cephfs_name=self.fs2.name, + check_status=False) + + # tests... + self.assertIsInstance(retval, tuple) + self.assertEqual(len(retval), 3) + self.assertIsInstance(retval[0], CommandFailedError) + self.check_that_mount_failed_for_right_reason(retval[2]) + + def test_mount_mon_and_osd_caps_present_mds_caps_absent(self): + # setup part... + moncap = f'allow rw fsname={self.fs1.name}, allow rw fsname={self.fs2.name}' + mdscap = f'allow rw fsname={self.fs1.name}' + osdcap = (f'allow rw tag cephfs data={self.fs1.name}, allow rw tag ' + f'cephfs data={self.fs2.name}') + keyring = self.create_client(self.client_id, moncap, osdcap, mdscap) + keyring_path = self.mount_a.client_remote.mktemp(data=keyring) + + # mount the FS for which client has no auth... + retval = self.mount_a.remount(client_id=self.client_id, + client_keyring_path=keyring_path, + cephfs_name=self.fs2.name, + check_status=False) + + # tests... + self.assertIsInstance(retval, tuple) + self.assertEqual(len(retval), 3) + self.assertIsInstance(retval[0], CommandFailedError) + self.check_that_mount_failed_for_right_reason(retval[2]) diff --git a/qa/tasks/cephfs/test_multimds_misc.py b/qa/tasks/cephfs/test_multimds_misc.py new file mode 100644 index 000000000..2bb6257c7 --- /dev/null +++ b/qa/tasks/cephfs/test_multimds_misc.py @@ -0,0 +1,223 @@ +import logging +import errno +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.contextutil import safe_while +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + +class TestScrub2(CephFSTestCase): + MDSS_REQUIRED = 3 + CLIENTS_REQUIRED = 1 + + def _check_scrub_status(self, result=None, reverse=False): + self.assertEqual(self.fs.wait_until_scrub_complete(result=result, rank=1, + sleep=5, timeout=30, + reverse=reverse), True) + self.assertEqual(self.fs.wait_until_scrub_complete(result=result, rank=2, + sleep=5, timeout=30, + reverse=reverse), True) + self.assertEqual(self.fs.wait_until_scrub_complete(result=result, rank=0, + sleep=5, timeout=30, + reverse=reverse), True) + + def _check_task_status_na(self, timo=120): + """ check absence of scrub status in ceph status """ + with safe_while(sleep=1, tries=120, action='wait for task status') as proceed: + while proceed(): + active = self.fs.get_active_names() + log.debug("current active={0}".format(active)) + task_status = self.fs.get_task_status("scrub status") + if not active[0] in task_status: + return True + + def _check_task_status(self, expected_status, timo=120): + """ check scrub status for current active mds in ceph status """ + with safe_while(sleep=1, tries=120, action='wait for task status') as proceed: + while proceed(): + active = self.fs.get_active_names() + log.debug("current active={0}".format(active)) + task_status = self.fs.get_task_status("scrub status") + try: + if task_status[active[0]].startswith(expected_status): + return True + except KeyError: + pass + + def _find_path_inos(self, root_path): + inos = [] + p = self.mount_a.run_shell(["find", root_path]) + paths = p.stdout.getvalue().strip().split() + for path in paths: + inos.append(self.mount_a.path_to_ino(path)) + return inos + + def _setup_subtrees(self): + self.fs.set_max_mds(3) + self.fs.wait_for_daemons() + status = self.fs.status() + + path = 'd1/d2/d3/d4/d5/d6/d7/d8' + self.mount_a.run_shell(['mkdir', '-p', path]) + self.mount_a.run_shell(['sync', path]) + + self.mount_a.setfattr("d1/d2", "ceph.dir.pin", "0") + 
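# (ceph.dir.pin pins a directory subtree to the given MDS rank; this pin and
# the two that follow spread d1/d2... across ranks 0, 1 and 2 so each active
# MDS owns one of the subtrees exercised by the scrub tests)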
self.mount_a.setfattr("d1/d2/d3/d4", "ceph.dir.pin", "1") + self.mount_a.setfattr("d1/d2/d3/d4/d5/d6", "ceph.dir.pin", "2") + + self._wait_subtrees([('/d1/d2', 0), ('/d1/d2/d3/d4', 1)], status, 0) + self._wait_subtrees([('/d1/d2/d3/d4', 1), ('/d1/d2/d3/d4/d5/d6', 2)], status, 1) + self._wait_subtrees([('/d1/d2/d3/d4', 1), ('/d1/d2/d3/d4/d5/d6', 2)], status, 2) + + for rank in range(3): + self.fs.rank_tell(["flush", "journal"], rank) + + def test_apply_tag(self): + self._setup_subtrees() + inos = self._find_path_inos('d1/d2/d3/') + + tag = "tag123" + out_json = self.fs.rank_tell(["tag", "path", "/d1/d2/d3", tag], 0) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + def assertTagged(ino): + file_obj_name = "{0:x}.00000000".format(ino) + self.fs.radosm(["getxattr", file_obj_name, "scrub_tag"]) + + for ino in inos: + assertTagged(ino) + + def test_scrub_backtrace(self): + self._setup_subtrees() + inos = self._find_path_inos('d1/d2/d3/') + + for ino in inos: + file_obj_name = "{0:x}.00000000".format(ino) + self.fs.radosm(["rmxattr", file_obj_name, "parent"]) + + out_json = self.fs.run_scrub(["start", "/d1/d2/d3", "recursive,force"], 0) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + def _check_damage(mds_rank, inos): + all_damage = self.fs.rank_tell(["damage", "ls"], mds_rank) + damage = [d for d in all_damage if d['ino'] in inos and d['damage_type'] == "backtrace"] + return len(damage) >= len(inos) + + self.assertTrue(_check_damage(0, inos[0:2])) + self.assertTrue(_check_damage(1, inos[2:4])) + self.assertTrue(_check_damage(2, inos[4:6])) + + def test_scrub_non_mds0(self): + self._setup_subtrees() + + def expect_exdev(cmd, mds): + try: + self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(mds), *cmd) + except CommandFailedError as e: + if e.exitstatus == errno.EXDEV: + pass + else: + raise + else: + raise RuntimeError("expected failure") + + rank1 = self.fs.get_rank(rank=1) + expect_exdev(["scrub", "start", "/d1/d2/d3"], rank1["name"]) + expect_exdev(["scrub", "abort"], rank1["name"]) + expect_exdev(["scrub", "pause"], rank1["name"]) + expect_exdev(["scrub", "resume"], rank1["name"]) + + def test_scrub_abort_mds0(self): + self._setup_subtrees() + + inos = self._find_path_inos('d1/d2/d3/') + + for ino in inos: + file_obj_name = "{0:x}.00000000".format(ino) + self.fs.radosm(["rmxattr", file_obj_name, "parent"]) + + out_json = self.fs.run_scrub(["start", "/d1/d2/d3", "recursive,force"], 0) + self.assertNotEqual(out_json, None) + + res = self.fs.run_scrub(["abort"]) + self.assertEqual(res['return_code'], 0) + + # Abort and verify in both mdss. We also check the status in rank 0 mds because + # it is supposed to gather the scrub status from other mdss. 
+ self._check_scrub_status() + + # sleep enough to fetch updated task status + checked = self._check_task_status_na() + self.assertTrue(checked) + + def test_scrub_pause_and_resume_mds0(self): + self._setup_subtrees() + + inos = self._find_path_inos('d1/d2/d3/') + + for ino in inos: + file_obj_name = "{0:x}.00000000".format(ino) + self.fs.radosm(["rmxattr", file_obj_name, "parent"]) + + out_json = self.fs.run_scrub(["start", "/d1/d2/d3", "recursive,force"], 0) + self.assertNotEqual(out_json, None) + + res = self.fs.run_scrub(["pause"]) + self.assertEqual(res['return_code'], 0) + + self._check_scrub_status(result="PAUSED") + + checked = self._check_task_status("paused") + self.assertTrue(checked) + + # resume and verify + res = self.fs.run_scrub(["resume"]) + self.assertEqual(res['return_code'], 0) + + self._check_scrub_status(result="PAUSED", reverse=True) + + checked = self._check_task_status_na() + self.assertTrue(checked) + + def test_scrub_pause_and_resume_with_abort_mds0(self): + self._setup_subtrees() + + inos = self._find_path_inos('d1/d2/d3/') + + for ino in inos: + file_obj_name = "{0:x}.00000000".format(ino) + self.fs.radosm(["rmxattr", file_obj_name, "parent"]) + + out_json = self.fs.run_scrub(["start", "/d1/d2/d3", "recursive,force"], 0) + self.assertNotEqual(out_json, None) + + res = self.fs.run_scrub(["pause"]) + self.assertEqual(res['return_code'], 0) + + self._check_scrub_status(result="PAUSED") + + checked = self._check_task_status("paused") + self.assertTrue(checked) + + res = self.fs.run_scrub(["abort"]) + self.assertEqual(res['return_code'], 0) + + self._check_scrub_status(result="PAUSED") + self._check_scrub_status(result="0 inodes") + + # scrub status should still be paused... + checked = self._check_task_status("paused") + self.assertTrue(checked) + + # resume and verify + res = self.fs.run_scrub(["resume"]) + self.assertEqual(res['return_code'], 0) + + self._check_scrub_status(result="PAUSED", reverse=True) + + checked = self._check_task_status_na() + self.assertTrue(checked) diff --git a/qa/tasks/cephfs/test_newops.py b/qa/tasks/cephfs/test_newops.py new file mode 100644 index 000000000..0071cb5d3 --- /dev/null +++ b/qa/tasks/cephfs/test_newops.py @@ -0,0 +1,18 @@ +import logging +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + +class TestNewOps(CephFSTestCase): + def test_newops_getvxattr(self): + """ + For nautilus it will crash the MDSs when receive unknown OPs, as a workaround + the clients should avoid sending them to nautilus + """ + + log.info("Test for new getvxattr op...") + self.mount_a.run_shell(["mkdir", "newop_getvxattr_dir"]) + + # to test whether will nautilus crash the MDSs + self.mount_a.getfattr("./newop_getvxattr_dir", "ceph.dir.pin.random") + log.info("Test for new getvxattr op succeeds") diff --git a/qa/tasks/cephfs/test_nfs.py b/qa/tasks/cephfs/test_nfs.py new file mode 100644 index 000000000..0a10709e6 --- /dev/null +++ b/qa/tasks/cephfs/test_nfs.py @@ -0,0 +1,880 @@ +# NOTE: these tests are not yet compatible with vstart_runner.py. +import errno +import json +import time +import logging +from io import BytesIO, StringIO + +from tasks.mgr.mgr_test_case import MgrTestCase +from teuthology import contextutil +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + +NFS_POOL_NAME = '.nfs' # should match mgr_module.py + +# TODO Add test for cluster update when ganesha can be deployed on multiple ports. 
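# The tests below drive the mgr `nfs` module end to end. The manual flow they
# mirror looks roughly like this (the cluster id, volume name and pseudo path
# are just the values used by this suite):
#
#   ceph nfs cluster create test
#   ceph fs volume create nfs-cephfs
#   ceph nfs export create cephfs --cluster-id test \
#       --pseudo-path /cephfs --fsname nfs-cephfs
#   mount -t nfs -o port=<port> <ip>:/cephfs /mnt
#   ceph nfs export rm test /cephfs
#   ceph nfs cluster rm test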
+class TestNFS(MgrTestCase): + def _cmd(self, *args): + return self.mgr_cluster.mon_manager.raw_cluster_cmd(*args) + + def _nfs_cmd(self, *args): + return self._cmd("nfs", *args) + + def _nfs_complete_cmd(self, cmd): + return self.mgr_cluster.mon_manager.run_cluster_cmd(args=f"nfs {cmd}", + stdout=StringIO(), + stderr=StringIO(), + check_status=False) + + def _orch_cmd(self, *args): + return self._cmd("orch", *args) + + def _sys_cmd(self, cmd): + ret = self.ctx.cluster.run(args=cmd, check_status=False, stdout=BytesIO(), stderr=BytesIO()) + stdout = ret[0].stdout + if stdout: + return stdout.getvalue() + + def setUp(self): + super(TestNFS, self).setUp() + self._load_module('nfs') + self.cluster_id = "test" + self.export_type = "cephfs" + self.pseudo_path = "/cephfs" + self.path = "/" + self.fs_name = "nfs-cephfs" + self.expected_name = "nfs.test" + self.sample_export = { + "export_id": 1, + "path": self.path, + "cluster_id": self.cluster_id, + "pseudo": self.pseudo_path, + "access_type": "RW", + "squash": "none", + "security_label": True, + "protocols": [ + 4 + ], + "transports": [ + "TCP" + ], + "fsal": { + "name": "CEPH", + "user_id": "nfs.test.1", + "fs_name": self.fs_name, + }, + "clients": [] + } + + def _check_nfs_server_status(self): + res = self._sys_cmd(['sudo', 'systemctl', 'status', 'nfs-server']) + if isinstance(res, bytes) and b'Active: active' in res: + self._disable_nfs() + + def _disable_nfs(self): + log.info("Disabling NFS") + self._sys_cmd(['sudo', 'systemctl', 'disable', 'nfs-server', '--now']) + + def _fetch_nfs_daemons_details(self, enable_json=False): + args = ('ps', f'--service_name={self.expected_name}') + if enable_json: + args = (*args, '--format=json') + return self._orch_cmd(*args) + + def _check_nfs_cluster_event(self, expected_event): + ''' + Check whether an event occured during the lifetime of the NFS service + :param expected_event: event that was expected to occur + ''' + event_occurred = False + # Wait few seconds for NFS daemons' status to be updated + with contextutil.safe_while(sleep=10, tries=18, _raise=False) as proceed: + while not event_occurred and proceed(): + daemons_details = json.loads( + self._fetch_nfs_daemons_details(enable_json=True)) + log.info('daemons details %s', daemons_details) + # 'events' key may not exist in the daemon description + # after a mgr fail over and could take some time to appear + # (it's populated on first daemon event) + if 'events' not in daemons_details[0]: + continue + for event in daemons_details[0]['events']: + log.info('daemon event %s', event) + if expected_event in event: + event_occurred = True + break + return event_occurred + + def _check_nfs_cluster_status(self, expected_status, fail_msg): + ''' + Check the current status of the NFS service + :param expected_status: Status to be verified + :param fail_msg: Message to be printed if test failed + ''' + # Wait for a minute as ganesha daemon takes some time to be + # deleted/created + with contextutil.safe_while(sleep=6, tries=10, _raise=False) as proceed: + while proceed(): + if expected_status in self._fetch_nfs_daemons_details(): + return + self.fail(fail_msg) + + def _check_auth_ls(self, export_id=1, check_in=False): + ''' + Tests export user id creation or deletion. 
+ :param export_id: Denotes export number + :param check_in: Check specified export id + ''' + output = self._cmd('auth', 'ls') + client_id = f'client.nfs.{self.cluster_id}' + if check_in: + self.assertIn(f'{client_id}.{export_id}', output) + else: + self.assertNotIn(f'{client_id}.{export_id}', output) + + def _test_idempotency(self, cmd_func, cmd_args): + ''' + Test idempotency of commands. It first runs the TestNFS test method + for a command and then checks the result of command run again. TestNFS + test method has required checks to verify that command works. + :param cmd_func: TestNFS method + :param cmd_args: nfs command arguments to be run + ''' + cmd_func() + ret = self.mgr_cluster.mon_manager.raw_cluster_cmd_result(*cmd_args) + if ret != 0: + self.fail("Idempotency test failed") + + def _test_create_cluster(self): + ''' + Test single nfs cluster deployment. + ''' + with contextutil.safe_while(sleep=4, tries=10) as proceed: + while proceed(): + try: + # Disable any running nfs ganesha daemon + self._check_nfs_server_status() + cluster_create = self._nfs_complete_cmd( + f'cluster create {self.cluster_id}') + if cluster_create.stderr and 'cluster already exists' \ + in cluster_create.stderr.getvalue(): + self._test_delete_cluster() + continue + # Check for expected status and daemon name + # (nfs.<cluster_id>) + self._check_nfs_cluster_status( + 'running', 'NFS Ganesha cluster deployment failed') + break + except (AssertionError, CommandFailedError) as e: + log.warning(f'{e}, retrying') + + def _test_delete_cluster(self): + ''' + Test deletion of a single nfs cluster. + ''' + self._nfs_cmd('cluster', 'rm', self.cluster_id) + self._check_nfs_cluster_status('No daemons reported', + 'NFS Ganesha cluster could not be deleted') + + def _test_list_cluster(self, empty=False): + ''' + Test listing of deployed nfs clusters. If nfs cluster is deployed then + it checks for expected cluster id. Otherwise checks nothing is listed. + :param empty: If true it denotes no cluster is deployed. + ''' + nfs_output = self._nfs_cmd('cluster', 'ls') + jdata = json.loads(nfs_output) + if empty: + self.assertEqual(len(jdata), 0) + else: + cluster_id = self.cluster_id + self.assertEqual([cluster_id], jdata) + + def _create_export(self, export_id, create_fs=False, extra_cmd=None): + ''' + Test creation of a single export. + :param export_id: Denotes export number + :param create_fs: If false filesytem exists. Otherwise create it. + :param extra_cmd: List of extra arguments for creating export. 
+ ''' + if create_fs: + self._cmd('fs', 'volume', 'create', self.fs_name) + with contextutil.safe_while(sleep=5, tries=30) as proceed: + while proceed(): + output = self._cmd( + 'orch', 'ls', '-f', 'json', + '--service-name', f'mds.{self.fs_name}' + ) + j = json.loads(output) + if j[0]['status']['running']: + break + export_cmd = ['nfs', 'export', 'create', 'cephfs', + '--fsname', self.fs_name, '--cluster-id', self.cluster_id] + if isinstance(extra_cmd, list): + export_cmd.extend(extra_cmd) + else: + export_cmd.extend(['--pseudo-path', self.pseudo_path]) + # Runs the nfs export create command + self._cmd(*export_cmd) + # Check if user id for export is created + self._check_auth_ls(export_id, check_in=True) + res = self._sys_cmd(['rados', '-p', NFS_POOL_NAME, '-N', self.cluster_id, 'get', + f'export-{export_id}', '-']) + # Check if export object is created + if res == b'': + self.fail("Export cannot be created") + + def _create_default_export(self): + ''' + Deploy a single nfs cluster and create export with default options. + ''' + self._test_create_cluster() + self._create_export(export_id='1', create_fs=True) + + def _delete_export(self): + ''' + Delete an export. + ''' + self._nfs_cmd('export', 'rm', self.cluster_id, self.pseudo_path) + self._check_auth_ls() + + def _test_list_export(self): + ''' + Test listing of created exports. + ''' + nfs_output = json.loads(self._nfs_cmd('export', 'ls', self.cluster_id)) + self.assertIn(self.pseudo_path, nfs_output) + + def _test_list_detailed(self, sub_vol_path): + ''' + Test listing of created exports with detailed option. + :param sub_vol_path: Denotes path of subvolume + ''' + nfs_output = json.loads(self._nfs_cmd('export', 'ls', self.cluster_id, '--detailed')) + # Export-1 with default values (access type = rw and path = '\') + self.assertDictEqual(self.sample_export, nfs_output[0]) + # Export-2 with r only + self.sample_export['export_id'] = 2 + self.sample_export['pseudo'] = self.pseudo_path + '1' + self.sample_export['access_type'] = 'RO' + self.sample_export['fsal']['user_id'] = f'{self.expected_name}.2' + self.assertDictEqual(self.sample_export, nfs_output[1]) + # Export-3 for subvolume with r only + self.sample_export['export_id'] = 3 + self.sample_export['path'] = sub_vol_path + self.sample_export['pseudo'] = self.pseudo_path + '2' + self.sample_export['fsal']['user_id'] = f'{self.expected_name}.3' + self.assertDictEqual(self.sample_export, nfs_output[2]) + # Export-4 for subvolume + self.sample_export['export_id'] = 4 + self.sample_export['pseudo'] = self.pseudo_path + '3' + self.sample_export['access_type'] = 'RW' + self.sample_export['fsal']['user_id'] = f'{self.expected_name}.4' + self.assertDictEqual(self.sample_export, nfs_output[3]) + + def _get_export(self): + ''' + Returns export block in json format + ''' + return json.loads(self._nfs_cmd('export', 'info', self.cluster_id, self.pseudo_path)) + + def _test_get_export(self): + ''' + Test fetching of created export. + ''' + nfs_output = self._get_export() + self.assertDictEqual(self.sample_export, nfs_output) + + def _check_export_obj_deleted(self, conf_obj=False): + ''' + Test if export or config object are deleted successfully. 
+ :param conf_obj: It denotes config object needs to be checked + ''' + rados_obj_ls = self._sys_cmd(['rados', '-p', NFS_POOL_NAME, '-N', self.cluster_id, 'ls']) + + if b'export-' in rados_obj_ls or (conf_obj and b'conf-nfs' in rados_obj_ls): + self.fail("Delete export failed") + + def _get_port_ip_info(self): + ''' + Return port and ip for a cluster + ''' + #{'test': {'backend': [{'hostname': 'smithi068', 'ip': '172.21.15.68', + #'port': 2049}]}} + with contextutil.safe_while(sleep=5, tries=6) as proceed: + while proceed(): + try: + info_output = json.loads( + self._nfs_cmd('cluster', 'info', + self.cluster_id))['test']['backend'][0] + return info_output["port"], info_output["ip"] + except (IndexError, CommandFailedError) as e: + if 'list index out of range' in str(e): + log.warning('no port and/or ip found, retrying') + else: + log.warning(f'{e}, retrying') + + def _test_mnt(self, pseudo_path, port, ip, check=True): + ''' + Test mounting of created exports + :param pseudo_path: It is the pseudo root name + :param port: Port of deployed nfs cluster + :param ip: IP of deployed nfs cluster + :param check: It denotes if i/o testing needs to be done + ''' + tries = 3 + while True: + try: + self.ctx.cluster.run( + args=['sudo', 'mount', '-t', 'nfs', '-o', f'port={port}', + f'{ip}:{pseudo_path}', '/mnt']) + break + except CommandFailedError as e: + if tries: + tries -= 1 + time.sleep(2) + continue + # Check if mount failed only when non existing pseudo path is passed + if not check and e.exitstatus == 32: + return + raise + + self.ctx.cluster.run(args=['sudo', 'chmod', '1777', '/mnt']) + + try: + self.ctx.cluster.run(args=['touch', '/mnt/test']) + out_mnt = self._sys_cmd(['ls', '/mnt']) + self.assertEqual(out_mnt, b'test\n') + finally: + self.ctx.cluster.run(args=['sudo', 'umount', '/mnt']) + + def _write_to_read_only_export(self, pseudo_path, port, ip): + ''' + Check if write to read only export fails + ''' + try: + self._test_mnt(pseudo_path, port, ip) + except CommandFailedError as e: + # Write to cephfs export should fail for test to pass + self.assertEqual( + e.exitstatus, errno.EPERM, + 'invalid error code on trying to write to read-only export') + else: + self.fail('expected write to a read-only export to fail') + + def _create_cluster_with_fs(self, fs_name, mnt_pt=None): + """ + create a cluster along with fs and mount it to the path supplied + :param fs_name: name of CephFS volume to be created + :param mnt_pt: mount fs to the path + """ + self._test_create_cluster() + self._cmd('fs', 'volume', 'create', fs_name) + with contextutil.safe_while(sleep=5, tries=30) as proceed: + while proceed(): + output = self._cmd( + 'orch', 'ls', '-f', 'json', + '--service-name', f'mds.{fs_name}' + ) + j = json.loads(output) + if j[0]['status']['running']: + break + if mnt_pt: + with contextutil.safe_while(sleep=3, tries=3) as proceed: + while proceed(): + try: + self.ctx.cluster.run(args=['sudo', 'ceph-fuse', mnt_pt]) + break + except CommandFailedError as e: + log.warning(f'{e}, retrying') + self.ctx.cluster.run(args=['sudo', 'chmod', '1777', mnt_pt]) + + def _delete_cluster_with_fs(self, fs_name, mnt_pt=None, mode=None): + """ + delete cluster along with fs and unmount it from the path supplied + :param fs_name: name of CephFS volume to be deleted + :param mnt_pt: unmount fs from the path + :param mode: revert to this mode + """ + if mnt_pt: + self.ctx.cluster.run(args=['sudo', 'umount', mnt_pt]) + if mode: + if isinstance(mode, bytes): + mode = mode.decode().strip() + 
self.ctx.cluster.run(args=['sudo', 'chmod', mode, mnt_pt]) + self._cmd('fs', 'volume', 'rm', fs_name, '--yes-i-really-mean-it') + self._test_delete_cluster() + + def test_create_and_delete_cluster(self): + ''' + Test successful creation and deletion of the nfs cluster. + ''' + self._test_create_cluster() + self._test_list_cluster() + self._test_delete_cluster() + # List clusters again to ensure no cluster is shown + self._test_list_cluster(empty=True) + + def test_create_delete_cluster_idempotency(self): + ''' + Test idempotency of cluster create and delete commands. + ''' + self._test_idempotency(self._test_create_cluster, ['nfs', 'cluster', 'create', self.cluster_id]) + self._test_idempotency(self._test_delete_cluster, ['nfs', 'cluster', 'rm', self.cluster_id]) + + def test_create_cluster_with_invalid_cluster_id(self): + ''' + Test nfs cluster deployment failure with invalid cluster id. + ''' + try: + invalid_cluster_id = '/cluster_test' # Only [A-Za-z0-9-_.] chars are valid + self._nfs_cmd('cluster', 'create', invalid_cluster_id) + self.fail(f"Cluster successfully created with invalid cluster id {invalid_cluster_id}") + except CommandFailedError as e: + # Command should fail for test to pass + if e.exitstatus != errno.EINVAL: + raise + + def test_create_and_delete_export(self): + ''' + Test successful creation and deletion of the cephfs export. + ''' + self._create_default_export() + self._test_get_export() + port, ip = self._get_port_ip_info() + self._test_mnt(self.pseudo_path, port, ip) + self._delete_export() + # Check if rados export object is deleted + self._check_export_obj_deleted() + self._test_mnt(self.pseudo_path, port, ip, False) + self._test_delete_cluster() + + def test_create_delete_export_idempotency(self): + ''' + Test idempotency of export create and delete commands. + ''' + self._test_idempotency(self._create_default_export, [ + 'nfs', 'export', 'create', 'cephfs', + '--fsname', self.fs_name, '--cluster-id', self.cluster_id, + '--pseudo-path', self.pseudo_path]) + self._test_idempotency(self._delete_export, ['nfs', 'export', 'rm', self.cluster_id, + self.pseudo_path]) + self._test_delete_cluster() + + def test_create_multiple_exports(self): + ''' + Test creating multiple exports with different access type and path. + ''' + # Export-1 with default values (access type = rw and path = '\') + self._create_default_export() + # Export-2 with r only + self._create_export(export_id='2', + extra_cmd=['--pseudo-path', self.pseudo_path+'1', '--readonly']) + # Export-3 for subvolume with r only + self._cmd('fs', 'subvolume', 'create', self.fs_name, 'sub_vol') + fs_path = self._cmd('fs', 'subvolume', 'getpath', self.fs_name, 'sub_vol').strip() + self._create_export(export_id='3', + extra_cmd=['--pseudo-path', self.pseudo_path+'2', '--readonly', + '--path', fs_path]) + # Export-4 for subvolume + self._create_export(export_id='4', + extra_cmd=['--pseudo-path', self.pseudo_path+'3', + '--path', fs_path]) + # Check if exports gets listed + self._test_list_detailed(fs_path) + self._test_delete_cluster() + # Check if rados ganesha conf object is deleted + self._check_export_obj_deleted(conf_obj=True) + self._check_auth_ls() + + def test_exports_on_mgr_restart(self): + ''' + Test export availability on restarting mgr. 
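+ The mgr restart is performed by unloading and then reloading the cephadm module; the export should remain listed and mountable afterwards.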
+ ''' + self._create_default_export() + # unload and load module will restart the mgr + self._unload_module("cephadm") + self._load_module("cephadm") + self._orch_cmd("set", "backend", "cephadm") + # Check if ganesha daemon is running + self._check_nfs_cluster_status('running', 'Failed to redeploy NFS Ganesha cluster') + # Checks if created export is listed + self._test_list_export() + port, ip = self._get_port_ip_info() + self._test_mnt(self.pseudo_path, port, ip) + self._delete_export() + self._test_delete_cluster() + + def test_export_create_with_non_existing_fsname(self): + ''' + Test creating export with non-existing filesystem. + ''' + try: + fs_name = 'nfs-test' + self._test_create_cluster() + self._nfs_cmd('export', 'create', 'cephfs', + '--fsname', fs_name, '--cluster-id', self.cluster_id, + '--pseudo-path', self.pseudo_path) + self.fail(f"Export created with non-existing filesystem {fs_name}") + except CommandFailedError as e: + # Command should fail for test to pass + if e.exitstatus != errno.ENOENT: + raise + finally: + self._test_delete_cluster() + + def test_export_create_with_non_existing_clusterid(self): + ''' + Test creating cephfs export with non-existing nfs cluster. + ''' + try: + cluster_id = 'invalidtest' + self._nfs_cmd('export', 'create', 'cephfs', '--fsname', self.fs_name, + '--cluster-id', cluster_id, '--pseudo-path', self.pseudo_path) + self.fail(f"Export created with non-existing cluster id {cluster_id}") + except CommandFailedError as e: + # Command should fail for test to pass + if e.exitstatus != errno.ENOENT: + raise + + def test_export_create_with_relative_pseudo_path_and_root_directory(self): + ''' + Test creating cephfs export with relative or '/' pseudo path. + ''' + def check_pseudo_path(pseudo_path): + try: + self._nfs_cmd('export', 'create', 'cephfs', '--fsname', self.fs_name, + '--cluster-id', self.cluster_id, + '--pseudo-path', pseudo_path) + self.fail(f"Export created for {pseudo_path}") + except CommandFailedError as e: + # Command should fail for test to pass + if e.exitstatus != errno.EINVAL: + raise + + self._test_create_cluster() + self._cmd('fs', 'volume', 'create', self.fs_name) + check_pseudo_path('invalidpath') + check_pseudo_path('/') + check_pseudo_path('//') + self._cmd('fs', 'volume', 'rm', self.fs_name, '--yes-i-really-mean-it') + self._test_delete_cluster() + + def test_write_to_read_only_export(self): + ''' + Test write to readonly export. 
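+ The write attempt through the NFS mount is expected to fail with EPERM.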
+ ''' + self._test_create_cluster() + self._create_export(export_id='1', create_fs=True, + extra_cmd=['--pseudo-path', self.pseudo_path, '--readonly']) + port, ip = self._get_port_ip_info() + self._check_nfs_cluster_status('running', 'NFS Ganesha cluster restart failed') + self._write_to_read_only_export(self.pseudo_path, port, ip) + self._test_delete_cluster() + + def test_cluster_info(self): + ''' + Test cluster info outputs correct ip and hostname + ''' + self._test_create_cluster() + info_output = json.loads(self._nfs_cmd('cluster', 'info', self.cluster_id)) + print(f'info {info_output}') + info_ip = info_output[self.cluster_id].get('backend', [])[0].pop("ip") + host_details = { + self.cluster_id: { + 'backend': [ + { + "hostname": self._sys_cmd(['hostname']).decode("utf-8").strip(), + "port": 2049 + } + ], + "virtual_ip": None, + } + } + host_ip = self._sys_cmd(['hostname', '-I']).decode("utf-8").split() + print(f'host_ip is {host_ip}, info_ip is {info_ip}') + self.assertDictEqual(info_output, host_details) + self.assertTrue(info_ip in host_ip) + self._test_delete_cluster() + + def test_cluster_set_reset_user_config(self): + ''' + Test cluster is created using user config and reverts back to default + config on reset. + ''' + self._test_create_cluster() + + pool = NFS_POOL_NAME + user_id = 'test' + fs_name = 'user_test_fs' + pseudo_path = '/ceph' + self._cmd('fs', 'volume', 'create', fs_name) + time.sleep(20) + key = self._cmd('auth', 'get-or-create-key', f'client.{user_id}', 'mon', + 'allow r', 'osd', + f'allow rw pool={pool} namespace={self.cluster_id}, allow rw tag cephfs data={fs_name}', + 'mds', f'allow rw path={self.path}').strip() + config = f""" LOG {{ + Default_log_level = FULL_DEBUG; + }} + + EXPORT {{ + Export_Id = 100; + Transports = TCP; + Path = /; + Pseudo = {pseudo_path}; + Protocols = 4; + Access_Type = RW; + Attr_Expiration_Time = 0; + Squash = None; + FSAL {{ + Name = CEPH; + Filesystem = {fs_name}; + User_Id = {user_id}; + Secret_Access_Key = '{key}'; + }} + }}""" + port, ip = self._get_port_ip_info() + self.ctx.cluster.run(args=['ceph', 'nfs', 'cluster', 'config', + 'set', self.cluster_id, '-i', '-'], stdin=config) + time.sleep(30) + res = self._sys_cmd(['rados', '-p', pool, '-N', self.cluster_id, 'get', + f'userconf-nfs.{user_id}', '-']) + self.assertEqual(config, res.decode('utf-8')) + self._test_mnt(pseudo_path, port, ip) + self._nfs_cmd('cluster', 'config', 'reset', self.cluster_id) + rados_obj_ls = self._sys_cmd(['rados', '-p', NFS_POOL_NAME, '-N', self.cluster_id, 'ls']) + if b'conf-nfs' not in rados_obj_ls and b'userconf-nfs' in rados_obj_ls: + self.fail("User config not deleted") + time.sleep(30) + self._test_mnt(pseudo_path, port, ip, False) + self._cmd('fs', 'volume', 'rm', fs_name, '--yes-i-really-mean-it') + self._test_delete_cluster() + + def test_cluster_set_user_config_with_non_existing_clusterid(self): + ''' + Test setting user config for non-existing nfs cluster. 
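+ The command is expected to fail with ENOENT.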
+ ''' + cluster_id = 'invalidtest' + with contextutil.safe_while(sleep=3, tries=3) as proceed: + while proceed(): + try: + self.ctx.cluster.run(args=['ceph', 'nfs', 'cluster', + 'config', 'set', cluster_id, + '-i', '-'], stdin='testing') + self.fail(f"User config set for non-existing cluster" + f"{cluster_id}") + except CommandFailedError as e: + # Command should fail for test to pass + if e.exitstatus == errno.ENOENT: + break + log.warning('exitstatus != ENOENT, retrying') + + def test_cluster_reset_user_config_with_non_existing_clusterid(self): + ''' + Test resetting user config for non-existing nfs cluster. + ''' + try: + cluster_id = 'invalidtest' + self._nfs_cmd('cluster', 'config', 'reset', cluster_id) + self.fail(f"User config reset for non-existing cluster {cluster_id}") + except CommandFailedError as e: + # Command should fail for test to pass + if e.exitstatus != errno.ENOENT: + raise + + def test_create_export_via_apply(self): + ''' + Test creation of export via apply + ''' + self._test_create_cluster() + self.ctx.cluster.run(args=['ceph', 'nfs', 'export', 'apply', + self.cluster_id, '-i', '-'], + stdin=json.dumps({ + "path": "/", + "pseudo": "/cephfs", + "squash": "none", + "access_type": "rw", + "protocols": [4], + "fsal": { + "name": "CEPH", + "fs_name": self.fs_name + } + })) + port, ip = self._get_port_ip_info() + self._test_mnt(self.pseudo_path, port, ip) + self._check_nfs_cluster_status( + 'running', 'NFS Ganesha cluster not running after new export was applied') + self._test_delete_cluster() + + def test_update_export(self): + ''' + Test update of export's pseudo path and access type from rw to ro + ''' + self._create_default_export() + port, ip = self._get_port_ip_info() + self._test_mnt(self.pseudo_path, port, ip) + export_block = self._get_export() + new_pseudo_path = '/testing' + export_block['pseudo'] = new_pseudo_path + export_block['access_type'] = 'RO' + self.ctx.cluster.run(args=['ceph', 'nfs', 'export', 'apply', + self.cluster_id, '-i', '-'], + stdin=json.dumps(export_block)) + if not self._check_nfs_cluster_event('restart'): + self.fail("updating export's pseudo path should trigger restart of NFS service") + self._check_nfs_cluster_status('running', 'NFS Ganesha cluster not running after restart') + self._write_to_read_only_export(new_pseudo_path, port, ip) + self._test_delete_cluster() + + def test_update_export_ro_to_rw(self): + ''' + Test update of export's access level from ro to rw + ''' + self._test_create_cluster() + self._create_export( + export_id='1', create_fs=True, + extra_cmd=['--pseudo-path', self.pseudo_path, '--readonly']) + port, ip = self._get_port_ip_info() + self._write_to_read_only_export(self.pseudo_path, port, ip) + export_block = self._get_export() + export_block['access_type'] = 'RW' + self.ctx.cluster.run( + args=['ceph', 'nfs', 'export', 'apply', self.cluster_id, '-i', '-'], + stdin=json.dumps(export_block)) + if self._check_nfs_cluster_event('restart'): + self.fail("update of export's access type should not trigger NFS service restart") + self._test_mnt(self.pseudo_path, port, ip) + self._test_delete_cluster() + + def test_update_export_with_invalid_values(self): + ''' + Test update of export with invalid values + ''' + self._create_default_export() + export_block = self._get_export() + + def update_with_invalid_values(key, value, fsal=False): + export_block_new = dict(export_block) + if fsal: + export_block_new['fsal'] = dict(export_block['fsal']) + export_block_new['fsal'][key] = value + else: + export_block_new[key] = value + 
try: + self.ctx.cluster.run(args=['ceph', 'nfs', 'export', 'apply', + self.cluster_id, '-i', '-'], + stdin=json.dumps(export_block_new)) + except CommandFailedError: + pass + + update_with_invalid_values('export_id', 9) + update_with_invalid_values('cluster_id', 'testing_new') + update_with_invalid_values('pseudo', 'test_relpath') + update_with_invalid_values('access_type', 'W') + update_with_invalid_values('squash', 'no_squash') + update_with_invalid_values('security_label', 'invalid') + update_with_invalid_values('protocols', [2]) + update_with_invalid_values('transports', ['UD']) + update_with_invalid_values('name', 'RGW', True) + update_with_invalid_values('user_id', 'testing_export', True) + update_with_invalid_values('fs_name', 'b', True) + self._test_delete_cluster() + + def test_cmds_without_reqd_args(self): + ''' + Test that cmd fails on not passing required arguments + ''' + def exec_cmd_invalid(*cmd): + try: + self._nfs_cmd(*cmd) + self.fail(f"nfs {cmd} command executed successfully without required arguments") + except CommandFailedError as e: + # Command should fail for test to pass + if e.exitstatus != errno.EINVAL: + raise + + exec_cmd_invalid('cluster', 'create') + exec_cmd_invalid('cluster', 'delete') + exec_cmd_invalid('cluster', 'config', 'set') + exec_cmd_invalid('cluster', 'config', 'reset') + exec_cmd_invalid('export', 'create', 'cephfs') + exec_cmd_invalid('export', 'create', 'cephfs', 'clusterid') + exec_cmd_invalid('export', 'create', 'cephfs', 'clusterid', 'a_fs') + exec_cmd_invalid('export', 'ls') + exec_cmd_invalid('export', 'delete') + exec_cmd_invalid('export', 'delete', 'clusterid') + exec_cmd_invalid('export', 'info') + exec_cmd_invalid('export', 'info', 'clusterid') + exec_cmd_invalid('export', 'apply') + + def test_non_existent_cluster(self): + """ + Test that cluster info doesn't throw junk data for non-existent cluster + """ + cluster_ls = self._nfs_cmd('cluster', 'ls') + self.assertNotIn('foo', cluster_ls, 'cluster foo exists') + try: + self._nfs_cmd('cluster', 'info', 'foo') + self.fail("nfs cluster info foo returned successfully for non-existent cluster") + except CommandFailedError as e: + if e.exitstatus != errno.ENOENT: + raise + + def test_nfs_export_with_invalid_path(self): + """ + Test that nfs exports can't be created with invalid path + """ + mnt_pt = '/mnt' + preserve_mode = self._sys_cmd(['stat', '-c', '%a', mnt_pt]) + self._create_cluster_with_fs(self.fs_name, mnt_pt) + try: + self._create_export(export_id='123', + extra_cmd=['--pseudo-path', self.pseudo_path, + '--path', '/non_existent_dir']) + except CommandFailedError as e: + if e.exitstatus != errno.ENOENT: + raise + self._delete_cluster_with_fs(self.fs_name, mnt_pt, preserve_mode) + + def test_nfs_export_creation_at_filepath(self): + """ + Test that nfs exports can't be created at a filepath + """ + mnt_pt = '/mnt' + preserve_mode = self._sys_cmd(['stat', '-c', '%a', mnt_pt]) + self._create_cluster_with_fs(self.fs_name, mnt_pt) + self.ctx.cluster.run(args=['touch', f'{mnt_pt}/testfile']) + try: + self._create_export(export_id='123', extra_cmd=['--pseudo-path', + self.pseudo_path, + '--path', + '/testfile']) + except CommandFailedError as e: + if e.exitstatus != errno.ENOTDIR: + raise + self.ctx.cluster.run(args=['rm', '-rf', '/mnt/testfile']) + self._delete_cluster_with_fs(self.fs_name, mnt_pt, preserve_mode) + + def test_nfs_export_creation_at_symlink(self): + """ + Test that nfs exports can't be created at a symlink path + """ + mnt_pt = '/mnt' + preserve_mode = 
self._sys_cmd(['stat', '-c', '%a', mnt_pt])
+        self._create_cluster_with_fs(self.fs_name, mnt_pt)
+        self.ctx.cluster.run(args=['mkdir', f'{mnt_pt}/testdir'])
+        self.ctx.cluster.run(args=['ln', '-s', f'{mnt_pt}/testdir',
+                                   f'{mnt_pt}/testdir_symlink'])
+        try:
+            self._create_export(export_id='123',
+                                extra_cmd=['--pseudo-path',
+                                           self.pseudo_path,
+                                           '--path',
+                                           '/testdir_symlink'])
+        except CommandFailedError as e:
+            if e.exitstatus != errno.ENOTDIR:
+                raise
+        self.ctx.cluster.run(args=['rm', '-rf', f'{mnt_pt}/*'])
+        self._delete_cluster_with_fs(self.fs_name, mnt_pt, preserve_mode)
diff --git a/qa/tasks/cephfs/test_openfiletable.py b/qa/tasks/cephfs/test_openfiletable.py
new file mode 100644
index 000000000..eff6b5093
--- /dev/null
+++ b/qa/tasks/cephfs/test_openfiletable.py
@@ -0,0 +1,85 @@
+import time
+import logging
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+class OpenFileTable(CephFSTestCase):
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 1
+
+    def _check_oft_counter(self, name, count):
+        perf_dump = self.fs.mds_asok(['perf', 'dump'])
+        if perf_dump['oft'][name] == count:
+            return True
+        return False
+
+    def test_max_items_per_obj(self):
+        """
+        The maximum number of keys per openfiles omap object is now equal to
+        the osd_deep_scrub_large_omap_object_key_threshold option.
+        """
+        self.set_conf("mds", "osd_deep_scrub_large_omap_object_key_threshold", "5")
+
+        self.fs.mds_restart()
+        self.fs.wait_for_daemons()
+
+        # Write some bytes to a file
+        size_mb = 1
+
+        # Hold the file open
+        file_count = 8
+        for i in range(0, file_count):
+            filename = "open_file{}".format(i)
+            p = self.mount_a.open_background(filename)
+            self.mount_a.write_n_mb(filename, size_mb)
+
+        time.sleep(10)
+
+        """
+        With osd_deep_scrub_large_omap_object_key_threshold value as 5 and
+        opening 8 files we should have a new rados object with name
+        mds0_openfiles.1 to hold the extra keys.
+        """
+
+        self.fs.radosm(["stat", "mds0_openfiles.1"])
+
+        # Now close the file
+        self.mount_a.kill_background(p)
+
+    def test_perf_counters(self):
+        """
+        Opening a file should increment omap_total_updates by 1.
+ """ + + self.set_conf("mds", "osd_deep_scrub_large_omap_object_key_threshold", "1") + self.fs.mds_restart() + self.fs.wait_for_daemons() + + perf_dump = self.fs.mds_asok(['perf', 'dump']) + omap_total_updates_0 = perf_dump['oft']['omap_total_updates'] + log.info("omap_total_updates_0:{}".format(omap_total_updates_0)) + + # Open the file + p = self.mount_a.open_background("omap_counter_test_file") + self.wait_until_true(lambda: self._check_oft_counter('omap_total_updates', 2), timeout=120) + + perf_dump = self.fs.mds_asok(['perf', 'dump']) + omap_total_updates_1 = perf_dump['oft']['omap_total_updates'] + log.info("omap_total_updates_1:{}".format(omap_total_updates_1)) + + self.assertTrue((omap_total_updates_1 - omap_total_updates_0) == 2) + + # Now close the file + self.mount_a.kill_background(p) + # Ensure that the file does not exist any more + self.wait_until_true(lambda: self._check_oft_counter('omap_total_removes', 1), timeout=120) + self.wait_until_true(lambda: self._check_oft_counter('omap_total_kv_pairs', 1), timeout=120) + + perf_dump = self.fs.mds_asok(['perf', 'dump']) + omap_total_removes = perf_dump['oft']['omap_total_removes'] + omap_total_kv_pairs = perf_dump['oft']['omap_total_kv_pairs'] + log.info("omap_total_removes:{}".format(omap_total_removes)) + log.info("omap_total_kv_pairs:{}".format(omap_total_kv_pairs)) + self.assertTrue(omap_total_removes == 1) + self.assertTrue(omap_total_kv_pairs == 1) diff --git a/qa/tasks/cephfs/test_pool_perm.py b/qa/tasks/cephfs/test_pool_perm.py new file mode 100644 index 000000000..9912debed --- /dev/null +++ b/qa/tasks/cephfs/test_pool_perm.py @@ -0,0 +1,109 @@ +from textwrap import dedent +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase +import os + + +class TestPoolPerm(CephFSTestCase): + def test_pool_perm(self): + self.mount_a.run_shell(["touch", "test_file"]) + + file_path = os.path.join(self.mount_a.mountpoint, "test_file") + + remote_script = dedent(""" + import os + import errno + + fd = os.open("{path}", os.O_RDWR) + try: + if {check_read}: + ret = os.read(fd, 1024) + else: + os.write(fd, b'content') + except OSError as e: + if e.errno != errno.EPERM: + raise + else: + raise RuntimeError("client does not check permission of data pool") + """) + + client_name = "client.{0}".format(self.mount_a.client_id) + + # set data pool read only + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd', + 'allow r pool={0}'.format(self.fs.get_data_pool_name())) + + self.mount_a.umount_wait() + self.mount_a.mount_wait() + + # write should fail + self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(False))) + + # set data pool write only + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', client_name, 'mds', 'allow', 'mon', 'allow r', 'osd', + 'allow w pool={0}'.format(self.fs.get_data_pool_name())) + + self.mount_a.umount_wait() + self.mount_a.mount_wait() + + # read should fail + self.mount_a.run_python(remote_script.format(path=file_path, check_read=str(True))) + + def test_forbidden_modification(self): + """ + That a client who does not have the capability for setting + layout pools is prevented from doing so. 
+ """ + + # Set up + client_name = "client.{0}".format(self.mount_a.client_id) + new_pool_name = "data_new" + self.fs.add_data_pool(new_pool_name) + + self.mount_a.run_shell(["touch", "layoutfile"]) + self.mount_a.run_shell(["mkdir", "layoutdir"]) + + # Set MDS 'rw' perms: missing 'p' means no setting pool layouts + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', client_name, 'mds', 'allow rw', 'mon', 'allow r', + 'osd', + 'allow rw pool={0},allow rw pool={1}'.format( + self.fs.get_data_pool_names()[0], + self.fs.get_data_pool_names()[1], + )) + + self.mount_a.umount_wait() + self.mount_a.mount_wait() + + with self.assertRaises(CommandFailedError): + self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool", + new_pool_name) + with self.assertRaises(CommandFailedError): + self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool", + new_pool_name) + self.mount_a.umount_wait() + + # Set MDS 'rwp' perms: should now be able to set layouts + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', client_name, 'mds', 'allow rwp', 'mon', 'allow r', + 'osd', + 'allow rw pool={0},allow rw pool={1}'.format( + self.fs.get_data_pool_names()[0], + self.fs.get_data_pool_names()[1], + )) + self.mount_a.mount_wait() + self.mount_a.setfattr("layoutfile", "ceph.file.layout.pool", + new_pool_name) + self.mount_a.setfattr("layoutdir", "ceph.dir.layout.pool", + new_pool_name) + self.mount_a.umount_wait() + + def tearDown(self): + self.fs.mon_manager.raw_cluster_cmd_result( + 'auth', 'caps', "client.{0}".format(self.mount_a.client_id), + 'mds', 'allow', 'mon', 'allow r', 'osd', + 'allow rw pool={0}'.format(self.fs.get_data_pool_names()[0])) + super(TestPoolPerm, self).tearDown() + diff --git a/qa/tasks/cephfs/test_quota.py b/qa/tasks/cephfs/test_quota.py new file mode 100644 index 000000000..0386672bd --- /dev/null +++ b/qa/tasks/cephfs/test_quota.py @@ -0,0 +1,106 @@ + +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +from teuthology.exceptions import CommandFailedError + +class TestQuota(CephFSTestCase): + CLIENTS_REQUIRED = 2 + MDSS_REQUIRED = 1 + + def test_remote_update_getfattr(self): + """ + That quota changes made from one client are visible to another + client looking at ceph.quota xattrs + """ + self.mount_a.run_shell(["mkdir", "subdir"]) + + self.assertEqual( + self.mount_a.getfattr("./subdir", "ceph.quota.max_files"), + None) + self.assertEqual( + self.mount_b.getfattr("./subdir", "ceph.quota.max_files"), + None) + + self.mount_a.setfattr("./subdir", "ceph.quota.max_files", "10") + self.assertEqual( + self.mount_a.getfattr("./subdir", "ceph.quota.max_files"), + "10") + + # Should be visible as soon as setxattr operation completes on + # mds (we get here sooner because setfattr gets an early reply) + self.wait_until_equal( + lambda: self.mount_b.getfattr("./subdir", "ceph.quota.max_files"), + "10", timeout=10) + + def test_remote_update_df(self): + """ + That when a client modifies the quota on a directory used + as another client's root, the other client sees the change + reflected in their statfs output. 
+ """ + + self.mount_b.umount_wait() + + self.mount_a.run_shell(["mkdir", "subdir"]) + + size_before = 1024 * 1024 * 128 + self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes", + "%s" % size_before) + + self.mount_b.mount_wait(cephfs_mntpt="/subdir") + + self.assertDictEqual( + self.mount_b.df(), + { + "total": size_before, + "used": 0, + "available": size_before + }) + + size_after = 1024 * 1024 * 256 + self.mount_a.setfattr("./subdir", "ceph.quota.max_bytes", + "%s" % size_after) + + # Should be visible as soon as setxattr operation completes on + # mds (we get here sooner because setfattr gets an early reply) + self.wait_until_equal( + lambda: self.mount_b.df(), + { + "total": size_after, + "used": 0, + "available": size_after + }, + timeout=10 + ) + + def test_remote_update_write(self): + """ + That when a client modifies the quota on a directory used + as another client's root, the other client sees the effect + of the change when writing data. + """ + + self.mount_a.run_shell(["mkdir", "subdir_files"]) + self.mount_a.run_shell(["mkdir", "subdir_data"]) + + # Set some nice high quotas that mount_b's initial operations + # will be well within + self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "100") + self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "104857600") + + # Do some writes within my quota + self.mount_b.create_n_files("subdir_files/file", 20) + self.mount_b.write_n_mb("subdir_data/file", 20) + + # Set quotas lower than what mount_b already wrote, it should + # refuse to write more once it's seen them + self.mount_a.setfattr("./subdir_files", "ceph.quota.max_files", "10") + self.mount_a.setfattr("./subdir_data", "ceph.quota.max_bytes", "1048576") + + # Do some writes that would have been okay within the old quota, + # but are forbidden under the new quota + with self.assertRaises(CommandFailedError): + self.mount_b.create_n_files("subdir_files/file", 40) + with self.assertRaises(CommandFailedError): + self.mount_b.write_n_mb("subdir_data/file", 40) + diff --git a/qa/tasks/cephfs/test_readahead.py b/qa/tasks/cephfs/test_readahead.py new file mode 100644 index 000000000..7e6270f03 --- /dev/null +++ b/qa/tasks/cephfs/test_readahead.py @@ -0,0 +1,26 @@ +import logging +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + + +class TestReadahead(CephFSTestCase): + def test_flush(self): + # Create 32MB file + self.mount_a.run_shell(["dd", "if=/dev/urandom", "of=foo", "bs=1M", "count=32"]) + + # Unmount and remount the client to flush cache + self.mount_a.umount_wait() + self.mount_a.mount_wait() + + initial_op_read = self.mount_a.get_op_read_count() + self.mount_a.run_shell(["dd", "if=foo", "of=/dev/null", "bs=128k", "count=32"]) + op_read = self.mount_a.get_op_read_count() + self.assertGreaterEqual(op_read, initial_op_read) + op_read -= initial_op_read + log.info("read operations: {0}".format(op_read)) + + # with exponentially increasing readahead, we should see fewer than 10 operations + # but this test simply checks if the client is doing a remote read for each local read + if op_read >= 32: + raise RuntimeError("readahead not working") diff --git a/qa/tasks/cephfs/test_recovery_fs.py b/qa/tasks/cephfs/test_recovery_fs.py new file mode 100644 index 000000000..bbcdf9769 --- /dev/null +++ b/qa/tasks/cephfs/test_recovery_fs.py @@ -0,0 +1,38 @@ +import logging +from os.path import join as os_path_join + +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + +class 
TestFSRecovery(CephFSTestCase):
+    """
+    Tests for recovering FS after loss of FSMap
+    """
+
+    CLIENTS_REQUIRED = 1
+    MDSS_REQUIRED = 3
+
+    def test_recover_fs_after_fsmap_removal(self):
+        data_pool = self.fs.get_data_pool_name()
+        metadata_pool = self.fs.get_metadata_pool_name()
+        # write data in mount, and fsync
+        self.mount_a.create_n_files('file_on_fs', 1, sync=True)
+        # fail MDSs to allow removing the file system in the next step
+        self.fs.fail()
+        # Remove file system to lose FSMap and keep the pools intact.
+        # This mimics the scenario where the monitor store is rebuilt
+        # using OSDs to recover a cluster with corrupt monitor store.
+        # The FSMap is permanently lost, but the FS pools are
+        # recovered/intact
+        self.fs.rm()
+        # Recreate file system with pool and previous fscid
+        self.fs.mon_manager.raw_cluster_cmd(
+            'fs', 'new', self.fs.name, metadata_pool, data_pool,
+            '--recover', '--force', '--fscid', f'{self.fs.id}')
+        self.fs.set_joinable()
+        # Check status of file system
+        self.fs.wait_for_daemons()
+        # check data in file system is intact
+        filepath = os_path_join(self.mount_a.hostfs_mntpt, 'file_on_fs_0')
+        self.assertEqual(self.mount_a.read_file(filepath), "0")
diff --git a/qa/tasks/cephfs/test_recovery_pool.py b/qa/tasks/cephfs/test_recovery_pool.py
new file mode 100644
index 000000000..8c4e1967d
--- /dev/null
+++ b/qa/tasks/cephfs/test_recovery_pool.py
@@ -0,0 +1,179 @@
+"""
+Test our tools for recovering metadata from the data pool into an alternate pool
+"""
+
+import logging
+import traceback
+from collections import namedtuple
+
+from teuthology.exceptions import CommandFailedError
+from tasks.cephfs.cephfs_test_case import CephFSTestCase
+
+log = logging.getLogger(__name__)
+
+
+ValidationError = namedtuple("ValidationError", ["exception", "backtrace"])
+
+
+class OverlayWorkload(object):
+    def __init__(self):
+        self._initial_state = None
+
+        # Accumulate backtraces for every failed validation, and return them. Backtraces
+        # are rather verbose, but we only see them when something breaks, and they
+        # let us see which check failed without having to decorate each check with
+        # a string
+        self._errors = []
+
+    def assert_equal(self, a, b):
+        try:
+            if a != b:
+                raise AssertionError("{0} != {1}".format(a, b))
+        except AssertionError as e:
+            self._errors.append(
+                ValidationError(e, traceback.format_exc(3))
+            )
+
+    def write(self):
+        """
+        Write the workload files to the mount
+        """
+        raise NotImplementedError()
+
+    def validate(self):
+        """
+        Read from the mount and validate that the workload files are present (i.e. have
+        survived or been reconstructed from the test scenario)
+        """
+        raise NotImplementedError()
+
+    def damage(self, fs):
+        """
+        Damage the filesystem pools in ways that will be interesting to recover from.
By + default just wipe everything in the metadata pool + """ + + pool = fs.get_metadata_pool_name() + fs.rados(["purge", pool, '--yes-i-really-really-mean-it']) + + def flush(self, fs): + """ + Called after client unmount, after write: flush whatever you want + """ + fs.rank_asok(["flush", "journal"]) + + +class SimpleOverlayWorkload(OverlayWorkload): + """ + Single file, single directory, check that it gets recovered and so does its size + """ + def write(self, mount): + mount.run_shell(["mkdir", "subdir"]) + mount.write_n_mb("subdir/sixmegs", 6) + self._initial_state = mount.stat("subdir/sixmegs") + + def validate(self, recovery_mount): + recovery_mount.run_shell(["ls", "subdir"]) + st = recovery_mount.stat("subdir/sixmegs") + self.assert_equal(st['st_size'], self._initial_state['st_size']) + return self._errors + +class TestRecoveryPool(CephFSTestCase): + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 1 + REQUIRE_RECOVERY_FILESYSTEM = True + + def is_marked_damaged(self, rank): + mds_map = self.fs.get_mds_map() + return rank in mds_map['damaged'] + + def _rebuild_metadata(self, workload, other_pool=None, workers=1): + """ + That when all objects in metadata pool are removed, we can rebuild a metadata pool + based on the contents of a data pool, and a client can see and read our files. + """ + + # First, inject some files + + workload.write(self.mount_a) + + # Unmount the client and flush the journal: the tool should also cope with + # situations where there is dirty metadata, but we'll test that separately + self.mount_a.umount_wait() + workload.flush(self.fs) + self.fs.fail() + + # After recovery, we need the MDS to not be strict about stats (in production these options + # are off by default, but in QA we need to explicitly disable them) + # Note: these have to be written to ceph.conf to override existing ceph.conf values. 
+ self.fs.set_ceph_conf('mds', 'mds verify scatter', False) + self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) + self.fs.mds_restart() + + # Apply any data damage the workload wants + workload.damage(self.fs) + + # Create the alternate pool if requested + recovery_fs = self.mds_cluster.newfs(name="recovery_fs", create=False) + recovery_fs.set_data_pool_name(self.fs.get_data_pool_name()) + recovery_fs.create(recover=True, metadata_overlay=True) + + recovery_pool = recovery_fs.get_metadata_pool_name() + recovery_fs.mon_manager.raw_cluster_cmd('-s') + + # Reset the MDS map in case multiple ranks were in play: recovery procedure + # only understands how to rebuild metadata under rank 0 + #self.fs.reset() + #self.fs.table_tool([self.fs.name + ":0", "reset", "session"]) + #self.fs.table_tool([self.fs.name + ":0", "reset", "snap"]) + #self.fs.table_tool([self.fs.name + ":0", "reset", "inode"]) + + # Run the recovery procedure + recovery_fs.data_scan(['init', '--force-init', + '--filesystem', recovery_fs.name, + '--alternate-pool', recovery_pool]) + recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "session"]) + recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "snap"]) + recovery_fs.table_tool([recovery_fs.name + ":0", "reset", "inode"]) + if False: + with self.assertRaises(CommandFailedError): + # Normal reset should fail when no objects are present, we'll use --force instead + self.fs.journal_tool(["journal", "reset"], 0) + + recovery_fs.data_scan(['scan_extents', '--alternate-pool', + recovery_pool, '--filesystem', self.fs.name, + self.fs.get_data_pool_name()]) + recovery_fs.data_scan(['scan_inodes', '--alternate-pool', + recovery_pool, '--filesystem', self.fs.name, + '--force-corrupt', '--force-init', + self.fs.get_data_pool_name()]) + recovery_fs.data_scan(['scan_links', '--filesystem', recovery_fs.name]) + recovery_fs.journal_tool(['event', 'recover_dentries', 'list', + '--alternate-pool', recovery_pool], 0) + recovery_fs.journal_tool(["journal", "reset", "--force"], 0) + + # Start the MDS + recovery_fs.set_joinable() + status = recovery_fs.wait_for_daemons() + + self.config_set('mds', 'debug_mds', '20') + for rank in recovery_fs.get_ranks(status=status): + recovery_fs.rank_tell(['scrub', 'start', '/', 'force,recursive,repair'], rank=rank['rank'], status=status) + log.info(str(recovery_fs.status())) + + # Mount a client + self.mount_a.mount_wait(cephfs_name=recovery_fs.name) + + # See that the files are present and correct + errors = workload.validate(self.mount_a) + if errors: + log.error("Validation errors found: {0}".format(len(errors))) + for e in errors: + log.error(e.exception) + log.error(e.backtrace) + raise AssertionError("Validation failed, first error: {0}\n{1}".format( + errors[0].exception, errors[0].backtrace + )) + + def test_rebuild_simple(self): + self._rebuild_metadata(SimpleOverlayWorkload()) diff --git a/qa/tasks/cephfs/test_scrub.py b/qa/tasks/cephfs/test_scrub.py new file mode 100644 index 000000000..647860129 --- /dev/null +++ b/qa/tasks/cephfs/test_scrub.py @@ -0,0 +1,187 @@ +""" +Test CephFS scrub (distinct from OSD scrub) functionality +""" + +from io import BytesIO +import logging +from collections import namedtuple + +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + +ValidationError = namedtuple("ValidationError", ["exception", "backtrace"]) + + +class Workload(CephFSTestCase): + def __init__(self, test, filesystem, mount): + super().__init__() + self._test = test + self._mount = mount + 
self._filesystem = filesystem + self._initial_state = None + + # Accumulate backtraces for every failed validation, and return them. Backtraces + # are rather verbose, but we only see them when something breaks, and they + # let us see which check failed without having to decorate each check with + # a string + self._errors = [] + + def write(self): + """ + Write the workload files to the mount + """ + raise NotImplementedError() + + def validate(self): + """ + Read from the mount and validate that the workload files are present (i.e. have + survived or been reconstructed from the test scenario) + """ + raise NotImplementedError() + + def damage(self): + """ + Damage the filesystem pools in ways that will be interesting to recover from. By + default just wipe everything in the metadata pool + """ + # Delete every object in the metadata pool + pool = self._filesystem.get_metadata_pool_name() + self._filesystem.rados(["purge", pool, '--yes-i-really-really-mean-it']) + + def flush(self): + """ + Called after client unmount, after write: flush whatever you want + """ + self._filesystem.mds_asok(["flush", "journal"]) + + +class BacktraceWorkload(Workload): + """ + Single file, single directory, wipe the backtrace and check it. + """ + def write(self): + self._mount.run_shell(["mkdir", "subdir"]) + self._mount.write_n_mb("subdir/sixmegs", 6) + + def validate(self): + st = self._mount.stat("subdir/sixmegs") + self._filesystem.mds_asok(["flush", "journal"]) + bt = self._filesystem.read_backtrace(st['st_ino']) + parent = bt['ancestors'][0]['dname'] + self.assertEqual(parent, 'sixmegs') + return self._errors + + def damage(self): + st = self._mount.stat("subdir/sixmegs") + self._filesystem.mds_asok(["flush", "journal"]) + self._filesystem._write_data_xattr(st['st_ino'], "parent", "") + + def create_files(self, nfiles=1000): + self._mount.create_n_files("scrub-new-files/file", nfiles) + + +class DupInodeWorkload(Workload): + """ + Duplicate an inode and try scrubbing it twice." + """ + + def write(self): + self._mount.run_shell(["mkdir", "parent"]) + self._mount.run_shell(["mkdir", "parent/child"]) + self._mount.write_n_mb("parent/parentfile", 6) + self._mount.write_n_mb("parent/child/childfile", 6) + + def damage(self): + self._mount.umount_wait() + self._filesystem.mds_asok(["flush", "journal"]) + self._filesystem.fail() + d = self._filesystem.radosmo(["getomapval", "10000000000.00000000", "parentfile_head", "-"]) + self._filesystem.radosm(["setomapval", "10000000000.00000000", "shadow_head"], stdin=BytesIO(d)) + self._test.config_set('mds', 'mds_hack_allow_loading_invalid_metadata', True) + self._filesystem.set_joinable() + self._filesystem.wait_for_daemons() + + def validate(self): + out_json = self._filesystem.run_scrub(["start", "/", "recursive,repair"]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self._filesystem.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + self.assertTrue(self._filesystem.are_daemons_healthy()) + return self._errors + + +class TestScrub(CephFSTestCase): + MDSS_REQUIRED = 1 + + def setUp(self): + super().setUp() + + def _scrub(self, workload, workers=1): + """ + That when all objects in metadata pool are removed, we can rebuild a metadata pool + based on the contents of a data pool, and a client can see and read our files. 
+ """ + + # First, inject some files + + workload.write() + + # are off by default, but in QA we need to explicitly disable them) + self.fs.set_ceph_conf('mds', 'mds verify scatter', False) + self.fs.set_ceph_conf('mds', 'mds debug scatterstat', False) + + # Apply any data damage the workload wants + workload.damage() + + out_json = self.fs.run_scrub(["start", "/", "recursive,repair"]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + # See that the files are present and correct + errors = workload.validate() + if errors: + log.error("Validation errors found: {0}".format(len(errors))) + for e in errors: + log.error(e.exception) + log.error(e.backtrace) + raise AssertionError("Validation failed, first error: {0}\n{1}".format( + errors[0].exception, errors[0].backtrace + )) + + def _get_damage_count(self, damage_type='backtrace'): + out_json = self.fs.rank_tell(["damage", "ls"]) + self.assertNotEqual(out_json, None) + + damage_count = 0 + for it in out_json: + if it['damage_type'] == damage_type: + damage_count += 1 + return damage_count + + def _scrub_new_files(self, workload): + """ + That scrubbing new files does not lead to errors + """ + workload.create_files(1000) + self.fs.wait_until_scrub_complete() + self.assertEqual(self._get_damage_count(), 0) + + def test_scrub_backtrace_for_new_files(self): + self._scrub_new_files(BacktraceWorkload(self, self.fs, self.mount_a)) + + def test_scrub_backtrace(self): + self._scrub(BacktraceWorkload(self, self.fs, self.mount_a)) + + def test_scrub_dup_inode(self): + self._scrub(DupInodeWorkload(self, self.fs, self.mount_a)) + + def test_mdsdir_scrub_backtrace(self): + damage_count = self._get_damage_count() + self.assertNotIn("MDS_DAMAGE", self.mds_cluster.mon_manager.get_mon_health()['checks']) + + out_json = self.fs.run_scrub(["start", "~mdsdir", "recursive"]) + self.assertEqual(self.fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + self.assertEqual(self._get_damage_count(), damage_count) + self.assertNotIn("MDS_DAMAGE", self.mds_cluster.mon_manager.get_mon_health()['checks']) diff --git a/qa/tasks/cephfs/test_scrub_checks.py b/qa/tasks/cephfs/test_scrub_checks.py new file mode 100644 index 000000000..e41b997a6 --- /dev/null +++ b/qa/tasks/cephfs/test_scrub_checks.py @@ -0,0 +1,462 @@ +""" +MDS admin socket scrubbing-related tests. +""" +import json +import logging +import errno +import time +from teuthology.exceptions import CommandFailedError +from teuthology.contextutil import safe_while +import os +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + +class TestScrubControls(CephFSTestCase): + """ + Test basic scrub control operations such as abort, pause and resume. 
+ """ + + MDSS_REQUIRED = 2 + CLIENTS_REQUIRED = 1 + + def _abort_scrub(self, expected): + res = self.fs.run_scrub(["abort"]) + self.assertEqual(res['return_code'], expected) + def _pause_scrub(self, expected): + res = self.fs.run_scrub(["pause"]) + self.assertEqual(res['return_code'], expected) + def _resume_scrub(self, expected): + res = self.fs.run_scrub(["resume"]) + self.assertEqual(res['return_code'], expected) + def _check_task_status(self, expected_status, timo=120): + """ check scrub status for current active mds in ceph status """ + with safe_while(sleep=1, tries=120, action='wait for task status') as proceed: + while proceed(): + active = self.fs.get_active_names() + log.debug("current active={0}".format(active)) + task_status = self.fs.get_task_status("scrub status") + try: + if task_status[active[0]].startswith(expected_status): + return True + except KeyError: + pass + + def _check_task_status_na(self, timo=120): + """ check absence of scrub status in ceph status """ + with safe_while(sleep=1, tries=120, action='wait for task status') as proceed: + while proceed(): + active = self.fs.get_active_names() + log.debug("current active={0}".format(active)) + task_status = self.fs.get_task_status("scrub status") + if not active[0] in task_status: + return True + + def create_scrub_data(self, test_dir): + for i in range(32): + dirname = "dir.{0}".format(i) + dirpath = os.path.join(test_dir, dirname) + self.mount_a.run_shell_payload(f""" +set -e +mkdir -p {dirpath} +for ((i = 0; i < 32; i++)); do + dd if=/dev/urandom of={dirpath}/filename.$i bs=1M conv=fdatasync count=1 +done +""") + + def test_scrub_abort(self): + test_dir = "scrub_control_test_path" + abs_test_path = "/{0}".format(test_dir) + + self.create_scrub_data(test_dir) + + out_json = self.fs.run_scrub(["start", abs_test_path, "recursive"]) + self.assertNotEqual(out_json, None) + + # abort and verify + self._abort_scrub(0) + self.fs.wait_until_scrub_complete(sleep=5, timeout=30) + + # sleep enough to fetch updated task status + checked = self._check_task_status_na() + self.assertTrue(checked) + + def test_scrub_pause_and_resume(self): + test_dir = "scrub_control_test_path" + abs_test_path = "/{0}".format(test_dir) + + log.info("mountpoint: {0}".format(self.mount_a.mountpoint)) + client_path = os.path.join(self.mount_a.mountpoint, test_dir) + log.info("client_path: {0}".format(client_path)) + + self.create_scrub_data(test_dir) + + out_json = self.fs.run_scrub(["start", abs_test_path, "recursive"]) + self.assertNotEqual(out_json, None) + + # pause and verify + self._pause_scrub(0) + out_json = self.fs.get_scrub_status() + self.assertTrue("PAUSED" in out_json['status']) + + checked = self._check_task_status("paused") + self.assertTrue(checked) + + # resume and verify + self._resume_scrub(0) + out_json = self.fs.get_scrub_status() + self.assertFalse("PAUSED" in out_json['status']) + + checked = self._check_task_status_na() + self.assertTrue(checked) + + def test_scrub_pause_and_resume_with_abort(self): + test_dir = "scrub_control_test_path" + abs_test_path = "/{0}".format(test_dir) + + self.create_scrub_data(test_dir) + + out_json = self.fs.run_scrub(["start", abs_test_path, "recursive"]) + self.assertNotEqual(out_json, None) + + # pause and verify + self._pause_scrub(0) + out_json = self.fs.get_scrub_status() + self.assertTrue("PAUSED" in out_json['status']) + + checked = self._check_task_status("paused") + self.assertTrue(checked) + + # abort and verify + self._abort_scrub(0) + out_json = self.fs.get_scrub_status() + 
self.assertTrue("PAUSED" in out_json['status']) + self.assertTrue("0 inodes" in out_json['status']) + + # scrub status should still be paused... + checked = self._check_task_status("paused") + self.assertTrue(checked) + + # resume and verify + self._resume_scrub(0) + self.assertTrue(self.fs.wait_until_scrub_complete(sleep=5, timeout=30)) + + checked = self._check_task_status_na() + self.assertTrue(checked) + + def test_scrub_task_status_on_mds_failover(self): + (original_active, ) = self.fs.get_active_names() + original_standbys = self.mds_cluster.get_standby_daemons() + + test_dir = "scrub_control_test_path" + abs_test_path = "/{0}".format(test_dir) + + self.create_scrub_data(test_dir) + + out_json = self.fs.run_scrub(["start", abs_test_path, "recursive"]) + self.assertNotEqual(out_json, None) + + # pause and verify + self._pause_scrub(0) + out_json = self.fs.get_scrub_status() + self.assertTrue("PAUSED" in out_json['status']) + + checked = self._check_task_status("paused") + self.assertTrue(checked) + + # Kill the rank 0 + self.fs.mds_stop(original_active) + + def promoted(): + active = self.fs.get_active_names() + return active and active[0] in original_standbys + + log.info("Waiting for promotion of one of the original standbys {0}".format( + original_standbys)) + self.wait_until_true(promoted, timeout=self.fs.beacon_timeout) + + self._check_task_status_na() + +class TestScrubChecks(CephFSTestCase): + """ + Run flush and scrub commands on the specified files in the filesystem. This + task will run through a sequence of operations, but it is not comprehensive + on its own -- it doesn't manipulate the mds cache state to test on both + in- and out-of-memory parts of the hierarchy. So it's designed to be run + multiple times within a single test run, so that the test can manipulate + memory state. + + Usage: + mds_scrub_checks: + mds_rank: 0 + path: path/to/test/dir + client: 0 + run_seq: [0-9]+ + + Increment the run_seq on subsequent invocations within a single test run; + it uses that value to generate unique folder and file names. 
+ """ + + MDSS_REQUIRED = 1 + CLIENTS_REQUIRED = 1 + + def test_scrub_checks(self): + self._checks(0) + self._checks(1) + + def _checks(self, run_seq): + mds_rank = 0 + test_dir = "scrub_test_path" + + abs_test_path = "/{0}".format(test_dir) + + log.info("mountpoint: {0}".format(self.mount_a.mountpoint)) + client_path = os.path.join(self.mount_a.mountpoint, test_dir) + log.info("client_path: {0}".format(client_path)) + + log.info("Cloning repo into place") + repo_path = TestScrubChecks.clone_repo(self.mount_a, client_path) + + log.info("Initiating mds_scrub_checks on mds.{id_} test_path {path}, run_seq {seq}".format( + id_=mds_rank, path=abs_test_path, seq=run_seq) + ) + + + success_validator = lambda j, r: self.json_validator(j, r, "return_code", 0) + + nep = "{test_path}/i/dont/exist".format(test_path=abs_test_path) + self.asok_command(mds_rank, "flush_path {nep}".format(nep=nep), + lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT)) + self.tell_command(mds_rank, "scrub start {nep}".format(nep=nep), + lambda j, r: self.json_validator(j, r, "return_code", -errno.ENOENT)) + + test_repo_path = "{test_path}/ceph-qa-suite".format(test_path=abs_test_path) + dirpath = "{repo_path}/suites".format(repo_path=test_repo_path) + + if run_seq == 0: + log.info("First run: flushing {dirpath}".format(dirpath=dirpath)) + command = "flush_path {dirpath}".format(dirpath=dirpath) + self.asok_command(mds_rank, command, success_validator) + command = "scrub start {dirpath}".format(dirpath=dirpath) + self.tell_command(mds_rank, command, success_validator) + + filepath = "{repo_path}/suites/fs/verify/validater/valgrind.yaml".format( + repo_path=test_repo_path) + if run_seq == 0: + log.info("First run: flushing {filepath}".format(filepath=filepath)) + command = "flush_path {filepath}".format(filepath=filepath) + self.asok_command(mds_rank, command, success_validator) + command = "scrub start {filepath}".format(filepath=filepath) + self.tell_command(mds_rank, command, success_validator) + + if run_seq == 0: + log.info("First run: flushing base dir /") + command = "flush_path /" + self.asok_command(mds_rank, command, success_validator) + command = "scrub start /" + self.tell_command(mds_rank, command, success_validator) + + new_dir = "{repo_path}/new_dir_{i}".format(repo_path=repo_path, i=run_seq) + test_new_dir = "{repo_path}/new_dir_{i}".format(repo_path=test_repo_path, + i=run_seq) + self.mount_a.run_shell(["mkdir", new_dir]) + command = "flush_path {dir}".format(dir=test_new_dir) + self.asok_command(mds_rank, command, success_validator) + + new_file = "{repo_path}/new_file_{i}".format(repo_path=repo_path, + i=run_seq) + test_new_file = "{repo_path}/new_file_{i}".format(repo_path=test_repo_path, + i=run_seq) + self.mount_a.write_n_mb(new_file, 1) + + command = "flush_path {file}".format(file=test_new_file) + self.asok_command(mds_rank, command, success_validator) + + # check that scrub fails on errors + ino = self.mount_a.path_to_ino(new_file) + rados_obj_name = "{ino:x}.00000000".format(ino=ino) + command = "scrub start {file}".format(file=test_new_file) + + def _check_and_clear_damage(ino, dtype): + all_damage = self.fs.rank_tell(["damage", "ls"], mds_rank) + damage = [d for d in all_damage if d['ino'] == ino and d['damage_type'] == dtype] + for d in damage: + self.fs.mon_manager.raw_cluster_cmd( + 'tell', 'mds.{0}'.format(self.fs.get_active_names()[mds_rank]), + "damage", "rm", str(d['id'])) + return len(damage) > 0 + + # Missing parent xattr + self.assertFalse(_check_and_clear_damage(ino, 
"backtrace")); + self.fs.rados(["rmxattr", rados_obj_name, "parent"], pool=self.fs.get_data_pool_name()) + self.tell_command(mds_rank, command, success_validator) + self.fs.wait_until_scrub_complete(sleep=5, timeout=30) + self.assertTrue(_check_and_clear_damage(ino, "backtrace")); + + command = "flush_path /" + self.asok_command(mds_rank, command, success_validator) + + def scrub_with_stray_evaluation(self, fs, mnt, path, flag, files=2000, + _hard_links=3): + fs.set_allow_new_snaps(True) + + test_dir = "stray_eval_dir" + mnt.run_shell(["mkdir", test_dir]) + client_path = os.path.join(mnt.mountpoint, test_dir) + mnt.create_n_files(fs_path=f"{test_dir}/file", count=files, + hard_links=_hard_links) + mnt.run_shell(["mkdir", f"{client_path}/.snap/snap1-{test_dir}"]) + mnt.run_shell(f"find {client_path}/ -type f -delete") + mnt.run_shell(["rmdir", f"{client_path}/.snap/snap1-{test_dir}"]) + perf_dump = fs.rank_tell(["perf", "dump"], 0) + self.assertNotEqual(perf_dump.get('mds_cache').get('num_strays'), + 0, "mdcache.num_strays is zero") + + log.info( + f"num of strays: {perf_dump.get('mds_cache').get('num_strays')}") + + out_json = fs.run_scrub(["start", path, flag]) + self.assertNotEqual(out_json, None) + self.assertEqual(out_json["return_code"], 0) + + self.assertEqual( + fs.wait_until_scrub_complete(tag=out_json["scrub_tag"]), True) + + perf_dump = fs.rank_tell(["perf", "dump"], 0) + self.assertEqual(int(perf_dump.get('mds_cache').get('num_strays')), + 0, "mdcache.num_strays is non-zero") + + def test_scrub_repair(self): + mds_rank = 0 + test_dir = "scrub_repair_path" + + self.mount_a.run_shell(["mkdir", test_dir]) + self.mount_a.run_shell(["touch", "{0}/file".format(test_dir)]) + dir_objname = "{:x}.00000000".format(self.mount_a.path_to_ino(test_dir)) + + self.mount_a.umount_wait() + + # flush journal entries to dirfrag objects, and expire journal + self.fs.mds_asok(['flush', 'journal']) + self.fs.mds_stop() + + # remove the dentry from dirfrag, cause incorrect fragstat/rstat + self.fs.radosm(["rmomapkey", dir_objname, "file_head"]) + + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + + self.mount_a.mount_wait() + + # fragstat indicates the directory is not empty, rmdir should fail + with self.assertRaises(CommandFailedError) as ar: + self.mount_a.run_shell(["rmdir", test_dir]) + self.assertEqual(ar.exception.exitstatus, 1) + + self.tell_command(mds_rank, "scrub start /{0} repair".format(test_dir), + lambda j, r: self.json_validator(j, r, "return_code", 0)) + + # wait a few second for background repair + time.sleep(10) + + # fragstat should be fixed + self.mount_a.run_shell(["rmdir", test_dir]) + + def test_stray_evaluation_with_scrub(self): + """ + test that scrub can iterate over ~mdsdir and evaluate strays + """ + self.scrub_with_stray_evaluation(self.fs, self.mount_a, "~mdsdir", + "recursive") + + def test_flag_scrub_mdsdir(self): + """ + test flag scrub_mdsdir + """ + self.scrub_with_stray_evaluation(self.fs, self.mount_a, "/", + "recursive,scrub_mdsdir") + + @staticmethod + def json_validator(json_out, rc, element, expected_value): + if rc != 0: + return False, "asok command returned error {rc}".format(rc=rc) + element_value = json_out.get(element) + if element_value != expected_value: + return False, "unexpectedly got {jv} instead of {ev}!".format( + jv=element_value, ev=expected_value) + return True, "Succeeded" + + def tell_command(self, mds_rank, command, validator): + log.info("Running command '{command}'".format(command=command)) + + command_list = command.split() + 
jout = self.fs.rank_tell(command_list, mds_rank) + + log.info("command '{command}' returned '{jout}'".format( + command=command, jout=jout)) + + success, errstring = validator(jout, 0) + if not success: + raise AsokCommandFailedError(command, 0, jout, errstring) + return jout + + def asok_command(self, mds_rank, command, validator): + log.info("Running command '{command}'".format(command=command)) + + command_list = command.split() + + # we just assume there's an active mds for every rank + mds_id = self.fs.get_active_names()[mds_rank] + proc = self.fs.mon_manager.admin_socket('mds', mds_id, + command_list, check_status=False) + rout = proc.exitstatus + sout = proc.stdout.getvalue() + + if sout.strip(): + jout = json.loads(sout) + else: + jout = None + + log.info("command '{command}' got response code '{rout}' and stdout '{sout}'".format( + command=command, rout=rout, sout=sout)) + + success, errstring = validator(jout, rout) + + if not success: + raise AsokCommandFailedError(command, rout, jout, errstring) + + return jout + + @staticmethod + def clone_repo(client_mount, path): + repo = "ceph-qa-suite" + repo_path = os.path.join(path, repo) + client_mount.run_shell(["mkdir", "-p", path]) + + try: + client_mount.stat(repo_path) + except CommandFailedError: + client_mount.run_shell([ + "git", "clone", '--branch', 'giant', + "http://github.com/ceph/{repo}".format(repo=repo), + "{path}/{repo}".format(path=path, repo=repo) + ]) + + return repo_path + + +class AsokCommandFailedError(Exception): + """ + Exception thrown when we get an unexpected response + on an admin socket command + """ + + def __init__(self, command, rc, json_out, errstring): + self.command = command + self.rc = rc + self.json = json_out + self.errstring = errstring + + def __str__(self): + return "Admin socket: {command} failed with rc={rc} json output={json}, because '{es}'".format( + command=self.command, rc=self.rc, json=self.json, es=self.errstring) diff --git a/qa/tasks/cephfs/test_sessionmap.py b/qa/tasks/cephfs/test_sessionmap.py new file mode 100644 index 000000000..ad6fd1d60 --- /dev/null +++ b/qa/tasks/cephfs/test_sessionmap.py @@ -0,0 +1,232 @@ +import time +import json +import logging + +from tasks.cephfs.fuse_mount import FuseMount +from teuthology.exceptions import CommandFailedError +from tasks.cephfs.cephfs_test_case import CephFSTestCase + +log = logging.getLogger(__name__) + + +class TestSessionMap(CephFSTestCase): + CLIENTS_REQUIRED = 2 + MDSS_REQUIRED = 2 + + def test_tell_session_drop(self): + """ + That when a `tell` command is sent using the python CLI, + its MDS session is gone after it terminates + """ + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + status = self.fs.status() + self.fs.rank_tell(["session", "ls"], status=status) + + ls_data = self.fs.rank_asok(['session', 'ls'], status=status) + self.assertEqual(len(ls_data), 0) + + def _get_connection_count(self, status=None): + perf = self.fs.rank_asok(["perf", "dump"], status=status) + conn = 0 + for module, dump in perf.items(): + if "AsyncMessenger::Worker" in module: + conn += dump['msgr_active_connections'] + return conn + + def test_tell_conn_close(self): + """ + That when a `tell` command is sent using the python CLI, + the conn count goes back to where it started (i.e. 
we aren't + leaving connections open) + """ + self.config_set('mds', 'ms_async_reap_threshold', '1') + + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + status = self.fs.status() + s = self._get_connection_count(status=status) + self.fs.rank_tell(["session", "ls"], status=status) + self.wait_until_true( + lambda: self._get_connection_count(status=status) == s, + timeout=30 + ) + + def test_mount_conn_close(self): + """ + That when a client unmounts, the thread count on the MDS goes back + to what it was before the client mounted + """ + self.config_set('mds', 'ms_async_reap_threshold', '1') + + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + status = self.fs.status() + s = self._get_connection_count(status=status) + self.mount_a.mount_wait() + self.assertGreater(self._get_connection_count(status=status), s) + self.mount_a.umount_wait() + self.wait_until_true( + lambda: self._get_connection_count(status=status) == s, + timeout=30 + ) + + def test_version_splitting(self): + """ + That when many sessions are updated, they are correctly + split into multiple versions to obey mds_sessionmap_keys_per_op + """ + + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + # Configure MDS to write one OMAP key at once + self.set_conf('mds', 'mds_sessionmap_keys_per_op', 1) + self.fs.mds_fail_restart() + status = self.fs.wait_for_daemons() + + # Bring the clients back + self.mount_a.mount_wait() + self.mount_b.mount_wait() + + # See that they've got sessions + self.assert_session_count(2, mds_id=self.fs.get_rank(status=status)['name']) + + # See that we persist their sessions + self.fs.rank_asok(["flush", "journal"], rank=0, status=status) + table_json = json.loads(self.fs.table_tool(["0", "show", "session"])) + log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2))) + self.assertEqual(table_json['0']['result'], 0) + self.assertEqual(len(table_json['0']['data']['sessions']), 2) + + # Now, induce a "force_open_sessions" event by exporting a dir + self.mount_a.run_shell(["mkdir", "bravo"]) + self.mount_a.run_shell(["touch", "bravo/file_a"]) + self.mount_b.run_shell(["touch", "bravo/file_b"]) + + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + + def get_omap_wrs(): + return self.fs.rank_asok(['perf', 'dump', 'objecter'], rank=1, status=status)['objecter']['omap_wr'] + + # Flush so that there are no dirty sessions on rank 1 + self.fs.rank_asok(["flush", "journal"], rank=1, status=status) + + # Export so that we get a force_open to rank 1 for the two sessions from rank 0 + initial_omap_wrs = get_omap_wrs() + self.fs.rank_asok(['export', 'dir', '/bravo', '1'], rank=0, status=status) + + # This is the critical (if rather subtle) check: that in the process of doing an export dir, + # we hit force_open_sessions, and as a result we end up writing out the sessionmap. There + # will be two sessions dirtied here, and because we have set keys_per_op to 1, we should see + # a single session get written out (the first of the two, triggered by the second getting marked + # dirty) + # The number of writes is two per session, because the header (sessionmap version) update and + # KV write both count. Also, multiply by 2 for each openfile table update. 
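# Editor's note (illustrative arithmetic, not part of the upstream patch): with
# mds_sessionmap_keys_per_op = 1 a single dirty session is flushed per table write, and
# that flush costs one omap write for the header (sessionmap version) plus one for the
# key/value; doubling for the matching openfile table update gives the expected delta of
# 2 * 2 = 4 omap writes, which is what the objecter counter check below waits for.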
+ self.wait_until_true( + lambda: get_omap_wrs() - initial_omap_wrs == 2*2, + timeout=30 # Long enough for an export to get acked + ) + + # Now end our sessions and check the backing sessionmap is updated correctly + self.mount_a.umount_wait() + self.mount_b.umount_wait() + + # In-memory sessionmap check + self.assert_session_count(0, mds_id=self.fs.get_rank(status=status)['name']) + + # On-disk sessionmap check + self.fs.rank_asok(["flush", "journal"], rank=0, status=status) + table_json = json.loads(self.fs.table_tool(["0", "show", "session"])) + log.info("SessionMap: {0}".format(json.dumps(table_json, indent=2))) + self.assertEqual(table_json['0']['result'], 0) + self.assertEqual(len(table_json['0']['data']['sessions']), 0) + + def _configure_auth(self, mount, id_name, mds_caps, osd_caps=None, mon_caps=None): + """ + Set up auth credentials for a client mount, and write out the keyring + for the client to use. + """ + + if osd_caps is None: + osd_caps = "allow rw" + + if mon_caps is None: + mon_caps = "allow r" + + out = self.fs.mon_manager.raw_cluster_cmd( + "auth", "get-or-create", "client.{name}".format(name=id_name), + "mds", mds_caps, + "osd", osd_caps, + "mon", mon_caps + ) + mount.client_id = id_name + mount.client_remote.write_file(mount.get_keyring_path(), out, sudo=True) + self.set_conf("client.{name}".format(name=id_name), "keyring", mount.get_keyring_path()) + + def test_session_reject(self): + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Requires FUSE client to inject client metadata") + + self.mount_a.run_shell(["mkdir", "foo"]) + self.mount_a.run_shell(["mkdir", "foo/bar"]) + self.mount_a.umount_wait() + + # Mount B will be my rejected client + self.mount_b.umount_wait() + + # Configure a client that is limited to /foo/bar + self._configure_auth(self.mount_b, "badguy", "allow rw path=/foo/bar") + # Check he can mount that dir and do IO + self.mount_b.mount_wait(cephfs_mntpt="/foo/bar") + self.mount_b.create_destroy() + self.mount_b.umount_wait() + + # Configure the client to claim that its mount point metadata is /baz + self.set_conf("client.badguy", "client_metadata", "root=/baz") + # Try to mount the client, see that it fails + with self.assert_cluster_log("client session with non-allowable root '/baz' denied"): + with self.assertRaises(CommandFailedError): + self.mount_b.mount_wait(cephfs_mntpt="/foo/bar") + + def test_session_evict_blocklisted(self): + """ + Check that mds evicts blocklisted client + """ + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Requires FUSE client to use " + "mds_cluster.is_addr_blocklisted()") + + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + + self.mount_a.run_shell_payload("mkdir {d0,d1} && touch {d0,d1}/file") + self.mount_a.setfattr("d0", "ceph.dir.pin", "0") + self.mount_a.setfattr("d1", "ceph.dir.pin", "1") + self._wait_subtrees([('/d0', 0), ('/d1', 1)], status=status) + + self.mount_a.run_shell(["touch", "d0/f0"]) + self.mount_a.run_shell(["touch", "d1/f0"]) + self.mount_b.run_shell(["touch", "d0/f1"]) + self.mount_b.run_shell(["touch", "d1/f1"]) + + self.assert_session_count(2, mds_id=self.fs.get_rank(rank=0, status=status)['name']) + self.assert_session_count(2, mds_id=self.fs.get_rank(rank=1, status=status)['name']) + + mount_a_client_id = self.mount_a.get_global_id() + self.fs.mds_asok(['session', 'evict', "%s" % mount_a_client_id], + mds_id=self.fs.get_rank(rank=0, status=status)['name']) + self.wait_until_true(lambda: self.mds_cluster.is_addr_blocklisted( + 
self.mount_a.get_global_addr()), timeout=30) + + # 10 seconds should be enough for evicting client + time.sleep(10) + self.assert_session_count(1, mds_id=self.fs.get_rank(rank=0, status=status)['name']) + self.assert_session_count(1, mds_id=self.fs.get_rank(rank=1, status=status)['name']) + + self.mount_a.kill_cleanup() + self.mount_a.mount_wait() diff --git a/qa/tasks/cephfs/test_snap_schedules.py b/qa/tasks/cephfs/test_snap_schedules.py new file mode 100644 index 000000000..0264cac32 --- /dev/null +++ b/qa/tasks/cephfs/test_snap_schedules.py @@ -0,0 +1,607 @@ +import os +import json +import time +import errno +import logging + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.exceptions import CommandFailedError +from datetime import datetime, timedelta + +log = logging.getLogger(__name__) + +def extract_schedule_and_retention_spec(spec=[]): + schedule = set([s[0] for s in spec]) + retention = set([s[1] for s in spec]) + return (schedule, retention) + +def seconds_upto_next_schedule(time_from, timo): + ts = int(time_from) + return ((int(ts / 60) * 60) + timo) - ts + +class TestSnapSchedulesHelper(CephFSTestCase): + CLIENTS_REQUIRED = 1 + + TEST_VOLUME_NAME = 'snap_vol' + TEST_DIRECTORY = 'snap_test_dir1' + + # this should be in sync with snap_schedule format + SNAPSHOT_TS_FORMAT = '%Y-%m-%d-%H_%M_%S' + + def check_scheduled_snapshot(self, exec_time, timo): + now = time.time() + delta = now - exec_time + log.debug(f'exec={exec_time}, now = {now}, timo = {timo}') + # tolerate snapshot existance in the range [-5,+5] + self.assertTrue((delta <= timo + 5) and (delta >= timo - 5)) + + def _fs_cmd(self, *args): + return self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", *args) + + def fs_snap_schedule_cmd(self, *args, **kwargs): + if 'fs' in kwargs: + fs = kwargs.pop('fs') + args += ('--fs', fs) + if 'format' in kwargs: + fmt = kwargs.pop('format') + args += ('--format', fmt) + for name, val in kwargs.items(): + args += (str(val),) + res = self._fs_cmd('snap-schedule', *args) + log.debug(f'res={res}') + return res + + def _create_or_reuse_test_volume(self): + result = json.loads(self._fs_cmd("volume", "ls")) + if len(result) == 0: + self.vol_created = True + self.volname = TestSnapSchedulesHelper.TEST_VOLUME_NAME + self._fs_cmd("volume", "create", self.volname) + else: + self.volname = result[0]['name'] + + def _enable_snap_schedule(self): + return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "enable", "snap_schedule") + + def _disable_snap_schedule(self): + return self.mgr_cluster.mon_manager.raw_cluster_cmd("mgr", "module", "disable", "snap_schedule") + + def _allow_minute_granularity_snapshots(self): + self.config_set('mgr', 'mgr/snap_schedule/allow_m_granularity', True) + + def _dump_on_update(self): + self.config_set('mgr', 'mgr/snap_schedule/dump_on_update', True) + + def setUp(self): + super(TestSnapSchedulesHelper, self).setUp() + self.volname = None + self.vol_created = False + self._create_or_reuse_test_volume() + self.create_cbks = [] + self.remove_cbks = [] + # used to figure out which snapshots are created/deleted + self.snapshots = set() + self._enable_snap_schedule() + self._allow_minute_granularity_snapshots() + self._dump_on_update() + + def tearDown(self): + if self.vol_created: + self._delete_test_volume() + self._disable_snap_schedule() + super(TestSnapSchedulesHelper, self).tearDown() + + def _schedule_to_timeout(self, schedule): + mult = schedule[-1] + period = int(schedule[0:-1]) + if mult == 'M': + return period * 60 + elif mult 
== 'h': + return period * 60 * 60 + elif mult == 'd': + return period * 60 * 60 * 24 + elif mult == 'w': + return period * 60 * 60 * 24 * 7 + else: + raise RuntimeError('schedule multiplier not recognized') + + def add_snap_create_cbk(self, cbk): + self.create_cbks.append(cbk) + def remove_snap_create_cbk(self, cbk): + self.create_cbks.remove(cbk) + + def add_snap_remove_cbk(self, cbk): + self.remove_cbks.append(cbk) + def remove_snap_remove_cbk(self, cbk): + self.remove_cbks.remove(cbk) + + def assert_if_not_verified(self): + self.assertListEqual(self.create_cbks, []) + self.assertListEqual(self.remove_cbks, []) + + def verify(self, dir_path, max_trials): + trials = 0 + snap_path = f'{dir_path}/.snap' + while (len(self.create_cbks) or len(self.remove_cbks)) and trials < max_trials: + snapshots = set(self.mount_a.ls(path=snap_path)) + log.info(f'snapshots: {snapshots}') + added = snapshots - self.snapshots + log.info(f'added: {added}') + removed = self.snapshots - snapshots + log.info(f'removed: {removed}') + if added: + for cbk in list(self.create_cbks): + res = cbk(list(added)) + if res: + self.remove_snap_create_cbk(cbk) + break + if removed: + for cbk in list(self.remove_cbks): + res = cbk(list(removed)) + if res: + self.remove_snap_remove_cbk(cbk) + break + self.snapshots = snapshots + trials += 1 + time.sleep(1) + + def calc_wait_time_and_snap_name(self, snap_sched_exec_epoch, schedule): + timo = self._schedule_to_timeout(schedule) + # calculate wait time upto the next minute + wait_timo = seconds_upto_next_schedule(snap_sched_exec_epoch, timo) + + # expected "scheduled" snapshot name + ts_name = (datetime.utcfromtimestamp(snap_sched_exec_epoch) + + timedelta(seconds=wait_timo)).strftime(TestSnapSchedulesHelper.SNAPSHOT_TS_FORMAT) + return (wait_timo, ts_name) + + def verify_schedule(self, dir_path, schedules, retentions=[]): + log.debug(f'expected_schedule: {schedules}, expected_retention: {retentions}') + + result = self.fs_snap_schedule_cmd('list', path=dir_path, format='json') + json_res = json.loads(result) + log.debug(f'json_res: {json_res}') + + for schedule in schedules: + self.assertTrue(schedule in json_res['schedule']) + for retention in retentions: + self.assertTrue(retention in json_res['retention']) + +class TestSnapSchedules(TestSnapSchedulesHelper): + def remove_snapshots(self, dir_path): + snap_path = f'{dir_path}/.snap' + + snapshots = self.mount_a.ls(path=snap_path) + for snapshot in snapshots: + snapshot_path = os.path.join(snap_path, snapshot) + log.debug(f'removing snapshot: {snapshot_path}') + self.mount_a.run_shell(['rmdir', snapshot_path]) + + def test_non_existent_snap_schedule_list(self): + """Test listing snap schedules on a non-existing filesystem path failure""" + try: + self.fs_snap_schedule_cmd('list', path=TestSnapSchedules.TEST_DIRECTORY) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise RuntimeError('incorrect errno when listing a non-existing snap schedule') + else: + raise RuntimeError('expected "fs snap-schedule list" to fail') + + def test_non_existent_schedule(self): + """Test listing non-existing snap schedules failure""" + self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY]) + + try: + self.fs_snap_schedule_cmd('list', path=TestSnapSchedules.TEST_DIRECTORY) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise RuntimeError('incorrect errno when listing a non-existing snap schedule') + else: + raise RuntimeError('expected "fs snap-schedule list" returned fail') + + 
self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY]) + + def test_snap_schedule_list_post_schedule_remove(self): + """Test listing snap schedules post removal of a schedule""" + self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY]) + + self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1h') + + self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY) + + try: + self.fs_snap_schedule_cmd('list', path=TestSnapSchedules.TEST_DIRECTORY) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise RuntimeError('incorrect errno when listing a non-existing snap schedule') + else: + raise RuntimeError('"fs snap-schedule list" returned error') + + self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY]) + + def test_snap_schedule(self): + """Test existence of a scheduled snapshot""" + self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY]) + + # set a schedule on the dir + self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1M') + exec_time = time.time() + + timo, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M') + log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx} in ~{timo}s...') + to_wait = timo + 2 # some leeway to avoid false failures... + + # verify snapshot schedule + self.verify_schedule(TestSnapSchedules.TEST_DIRECTORY, ['1M']) + + def verify_added(snaps_added): + log.debug(f'snapshots added={snaps_added}') + self.assertEqual(len(snaps_added), 1) + snapname = snaps_added[0] + if snapname.startswith('scheduled-'): + if snapname[10:26] == snap_sfx[:16]: + self.check_scheduled_snapshot(exec_time, timo) + return True + return False + self.add_snap_create_cbk(verify_added) + self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait) + self.assert_if_not_verified() + + # remove snapshot schedule + self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY) + + # remove all scheduled snapshots + self.remove_snapshots(TestSnapSchedules.TEST_DIRECTORY) + + self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY]) + + def test_multi_snap_schedule(self): + """Test exisitence of multiple scheduled snapshots""" + self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY]) + + # set schedules on the dir + self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1M') + self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='2M') + exec_time = time.time() + + timo_1, snap_sfx_1 = self.calc_wait_time_and_snap_name(exec_time, '1M') + log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx_1} in ~{timo_1}s...') + timo_2, snap_sfx_2 = self.calc_wait_time_and_snap_name(exec_time, '2M') + log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx_2} in ~{timo_2}s...') + to_wait = timo_2 + 2 # use max timeout + + # verify snapshot schedule + self.verify_schedule(TestSnapSchedules.TEST_DIRECTORY, ['1M', '2M']) + + def verify_added_1(snaps_added): + log.debug(f'snapshots added={snaps_added}') + self.assertEqual(len(snaps_added), 1) + snapname = snaps_added[0] + if snapname.startswith('scheduled-'): + if snapname[10:26] == snap_sfx_1[:16]: + self.check_scheduled_snapshot(exec_time, timo_1) + return True + return False + def verify_added_2(snaps_added): + log.debug(f'snapshots added={snaps_added}') + self.assertEqual(len(snaps_added), 1) + 
snapname = snaps_added[0] + if snapname.startswith('scheduled-'): + if snapname[10:26] == snap_sfx_2[:16]: + self.check_scheduled_snapshot(exec_time, timo_2) + return True + return False + self.add_snap_create_cbk(verify_added_1) + self.add_snap_create_cbk(verify_added_2) + self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait) + self.assert_if_not_verified() + + # remove snapshot schedule + self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY) + + # remove all scheduled snapshots + self.remove_snapshots(TestSnapSchedules.TEST_DIRECTORY) + + self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY]) + + def test_snap_schedule_with_retention(self): + """Test scheduled snapshots along with rentention policy""" + self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY]) + + # set a schedule on the dir + self.fs_snap_schedule_cmd('add', path=TestSnapSchedules.TEST_DIRECTORY, snap_schedule='1M') + self.fs_snap_schedule_cmd('retention', 'add', path=TestSnapSchedules.TEST_DIRECTORY, retention_spec_or_period='1M') + exec_time = time.time() + + timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M') + log.debug(f'expecting snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx} in ~{timo_1}s...') + to_wait = timo_1 + 2 # some leeway to avoid false failures... + + # verify snapshot schedule + self.verify_schedule(TestSnapSchedules.TEST_DIRECTORY, ['1M'], retentions=[{'M':1}]) + + def verify_added(snaps_added): + log.debug(f'snapshots added={snaps_added}') + self.assertEqual(len(snaps_added), 1) + snapname = snaps_added[0] + if snapname.startswith('scheduled-'): + if snapname[10:26] == snap_sfx[:16]: + self.check_scheduled_snapshot(exec_time, timo_1) + return True + return False + self.add_snap_create_cbk(verify_added) + self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait) + self.assert_if_not_verified() + + timo_2 = timo_1 + 60 # expected snapshot removal timeout + def verify_removed(snaps_removed): + log.debug(f'snapshots removed={snaps_removed}') + self.assertEqual(len(snaps_removed), 1) + snapname = snaps_removed[0] + if snapname.startswith('scheduled-'): + if snapname[10:26] == snap_sfx[:16]: + self.check_scheduled_snapshot(exec_time, timo_2) + return True + return False + log.debug(f'expecting removal of snap {TestSnapSchedules.TEST_DIRECTORY}/.snap/scheduled-{snap_sfx} in ~{timo_2}s...') + to_wait = timo_2 + self.add_snap_remove_cbk(verify_removed) + self.verify(TestSnapSchedules.TEST_DIRECTORY, to_wait+2) + self.assert_if_not_verified() + + # remove snapshot schedule + self.fs_snap_schedule_cmd('remove', path=TestSnapSchedules.TEST_DIRECTORY) + + # remove all scheduled snapshots + self.remove_snapshots(TestSnapSchedules.TEST_DIRECTORY) + + self.mount_a.run_shell(['rmdir', TestSnapSchedules.TEST_DIRECTORY]) + + def get_snap_stats(self, dir_path): + snap_path = f"{dir_path}/.snap"[1:] + snapshots = self.mount_a.ls(path=snap_path) + fs_count = len(snapshots) + log.debug(f'snapshots: {snapshots}') + + result = self.fs_snap_schedule_cmd('status', path=dir_path, + format='json') + json_res = json.loads(result)[0] + db_count = int(json_res['created_count']) + log.debug(f'json_res: {json_res}') + + snap_stats = dict() + snap_stats['fs_count'] = fs_count + snap_stats['db_count'] = db_count + + log.debug(f'fs_count: {fs_count}') + log.debug(f'db_count: {db_count}') + + return snap_stats + + def verify_snap_stats(self, dir_path): + snap_stats = self.get_snap_stats(dir_path) + self.assertTrue(snap_stats['fs_count'] == 
snap_stats['db_count']) + + def test_concurrent_snap_creates(self): + """Test concurrent snap creates in same file-system without db issues""" + """ + Test snap creates at same cadence on same fs to verify correct stats. + A single SQLite DB Connection handle cannot be used to run concurrent + transactions and results transaction aborts. This test makes sure that + proper care has been taken in the code to avoid such situation by + verifying number of dirs created on the file system with the + created_count in the schedule_meta table for the specific path. + """ + self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY]) + + testdirs = [] + for d in range(10): + testdirs.append(os.path.join("/", TestSnapSchedules.TEST_DIRECTORY, "dir" + str(d))) + + for d in testdirs: + self.mount_a.run_shell(['mkdir', '-p', d[1:]]) + self.fs_snap_schedule_cmd('add', path=d, snap_schedule='1M') + + exec_time = time.time() + timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M') + + for d in testdirs: + self.fs_snap_schedule_cmd('activate', path=d, snap_schedule='1M') + + # we wait for 10 snaps to be taken + wait_time = timo_1 + 10 * 60 + 15 + time.sleep(wait_time) + + for d in testdirs: + self.fs_snap_schedule_cmd('deactivate', path=d, snap_schedule='1M') + + for d in testdirs: + self.verify_snap_stats(d) + + for d in testdirs: + self.fs_snap_schedule_cmd('remove', path=d, snap_schedule='1M') + self.remove_snapshots(d[1:]) + self.mount_a.run_shell(['rmdir', d[1:]]) + + def test_snap_schedule_with_mgr_restart(self): + """Test that snap schedule is resumed after mgr restart""" + self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedules.TEST_DIRECTORY]) + testdir = os.path.join("/", TestSnapSchedules.TEST_DIRECTORY, "test_restart") + self.mount_a.run_shell(['mkdir', '-p', testdir[1:]]) + self.fs_snap_schedule_cmd('add', path=testdir, snap_schedule='1M') + + exec_time = time.time() + timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M') + + self.fs_snap_schedule_cmd('activate', path=testdir, snap_schedule='1M') + + # we wait for 10 snaps to be taken + wait_time = timo_1 + 10 * 60 + 15 + time.sleep(wait_time) + + old_stats = self.get_snap_stats(testdir) + self.assertTrue(old_stats['fs_count'] == old_stats['db_count']) + self.assertTrue(old_stats['fs_count'] > 9) + + # restart mgr + active_mgr = self.mgr_cluster.mon_manager.get_mgr_dump()['active_name'] + log.debug(f'restarting active mgr: {active_mgr}') + self.mgr_cluster.mon_manager.revive_mgr(active_mgr) + time.sleep(300) # sleep for 5 minutes + self.fs_snap_schedule_cmd('deactivate', path=testdir, snap_schedule='1M') + + new_stats = self.get_snap_stats(testdir) + self.assertTrue(new_stats['fs_count'] == new_stats['db_count']) + self.assertTrue(new_stats['fs_count'] > old_stats['fs_count']) + self.assertTrue(new_stats['db_count'] > old_stats['db_count']) + + # cleanup + self.fs_snap_schedule_cmd('remove', path=testdir, snap_schedule='1M') + self.remove_snapshots(testdir[1:]) + self.mount_a.run_shell(['rmdir', testdir[1:]]) + + def test_schedule_auto_deactivation_for_non_existent_path(self): + """ + Test that a non-existent path leads to schedule deactivation after a few retries. 
+ """ + self.fs_snap_schedule_cmd('add', path="/bad-path", snap_schedule='1M') + start_time = time.time() + + while time.time() - start_time < 60.0: + s = self.fs_snap_schedule_cmd('status', path="/bad-path", format='json') + json_status = json.loads(s)[0] + + self.assertTrue(int(json_status['active']) == 1) + time.sleep(60) + + s = self.fs_snap_schedule_cmd('status', path="/bad-path", format='json') + json_status = json.loads(s)[0] + self.assertTrue(int(json_status['active']) == 0) + + # remove snapshot schedule + self.fs_snap_schedule_cmd('remove', path="/bad-path") + + def test_snap_schedule_for_number_of_snaps_retention(self): + """ + Test that number of snaps retained are as per user spec. + """ + total_snaps = 55 + test_dir = '/' + TestSnapSchedules.TEST_DIRECTORY + + self.mount_a.run_shell(['mkdir', '-p', test_dir[1:]]) + + # set a schedule on the dir + self.fs_snap_schedule_cmd('add', path=test_dir, snap_schedule='1M') + self.fs_snap_schedule_cmd('retention', 'add', path=test_dir, + retention_spec_or_period=f'{total_snaps}n') + exec_time = time.time() + + timo_1, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M') + + # verify snapshot schedule + self.verify_schedule(test_dir, ['1M']) + + # we wait for total_snaps snaps to be taken + wait_time = timo_1 + total_snaps * 60 + 15 + time.sleep(wait_time) + + snap_stats = self.get_snap_stats(test_dir) + self.assertTrue(snap_stats['fs_count'] == total_snaps) + self.assertTrue(snap_stats['db_count'] >= total_snaps) + + # remove snapshot schedule + self.fs_snap_schedule_cmd('remove', path=test_dir) + + # remove all scheduled snapshots + self.remove_snapshots(test_dir[1:]) + + self.mount_a.run_shell(['rmdir', test_dir[1:]]) + + +class TestSnapSchedulesSnapdir(TestSnapSchedulesHelper): + def remove_snapshots(self, dir_path, sdn): + snap_path = f'{dir_path}/{sdn}' + + snapshots = self.mount_a.ls(path=snap_path) + for snapshot in snapshots: + snapshot_path = os.path.join(snap_path, snapshot) + log.debug(f'removing snapshot: {snapshot_path}') + self.mount_a.run_shell(['rmdir', snapshot_path]) + + def get_snap_dir_name(self): + from tasks.cephfs.fuse_mount import FuseMount + from tasks.cephfs.kernel_mount import KernelMount + + if isinstance(self.mount_a, KernelMount): + sdn = self.mount_a.client_config.get('snapdirname', '.snap') + elif isinstance(self.mount_a, FuseMount): + sdn = self.mount_a.client_config.get('client_snapdir', '.snap') + self.fs.set_ceph_conf('client', 'client snapdir', sdn) + self.mount_a.remount() + return sdn + + def test_snap_dir_name(self): + """Test the correctness of snap directory name""" + self.mount_a.run_shell(['mkdir', '-p', TestSnapSchedulesSnapdir.TEST_DIRECTORY]) + + # set a schedule on the dir + self.fs_snap_schedule_cmd('add', path=TestSnapSchedulesSnapdir.TEST_DIRECTORY, snap_schedule='1M') + self.fs_snap_schedule_cmd('retention', 'add', path=TestSnapSchedulesSnapdir.TEST_DIRECTORY, retention_spec_or_period='1M') + exec_time = time.time() + + timo, snap_sfx = self.calc_wait_time_and_snap_name(exec_time, '1M') + sdn = self.get_snap_dir_name() + log.info(f'expecting snap {TestSnapSchedulesSnapdir.TEST_DIRECTORY}/{sdn}/scheduled-{snap_sfx} in ~{timo}s...') + + # verify snapshot schedule + self.verify_schedule(TestSnapSchedulesSnapdir.TEST_DIRECTORY, ['1M'], retentions=[{'M':1}]) + + # remove snapshot schedule + self.fs_snap_schedule_cmd('remove', path=TestSnapSchedulesSnapdir.TEST_DIRECTORY) + + # remove all scheduled snapshots + self.remove_snapshots(TestSnapSchedulesSnapdir.TEST_DIRECTORY, sdn) + + 
self.mount_a.run_shell(['rmdir', TestSnapSchedulesSnapdir.TEST_DIRECTORY]) + + +""" +Note that the class TestSnapSchedulesMandatoryFSArgument tests snap-schedule +commands only for multi-fs scenario. Commands for a single default fs should +pass for tests defined above or elsewhere. +""" + + +class TestSnapSchedulesMandatoryFSArgument(TestSnapSchedulesHelper): + REQUIRE_BACKUP_FILESYSTEM = True + TEST_DIRECTORY = 'mandatory_fs_argument_test_dir' + + def test_snap_schedule_without_fs_argument(self): + """Test command fails without --fs argument in presence of multiple fs""" + test_path = TestSnapSchedulesMandatoryFSArgument.TEST_DIRECTORY + self.mount_a.run_shell(['mkdir', '-p', test_path]) + + # try setting a schedule on the dir; this should fail now that we are + # working with mutliple fs; we need the --fs argument if there are more + # than one fs hosted by the same cluster + with self.assertRaises(CommandFailedError): + self.fs_snap_schedule_cmd('add', test_path, snap_schedule='1M') + + self.mount_a.run_shell(['rmdir', test_path]) + + def test_snap_schedule_for_non_default_fs(self): + """Test command succes with --fs argument for non-default fs""" + test_path = TestSnapSchedulesMandatoryFSArgument.TEST_DIRECTORY + self.mount_a.run_shell(['mkdir', '-p', test_path]) + + # use the backup fs as the second fs; all these commands must pass + self.fs_snap_schedule_cmd('add', test_path, snap_schedule='1M', fs='backup_fs') + self.fs_snap_schedule_cmd('activate', test_path, snap_schedule='1M', fs='backup_fs') + self.fs_snap_schedule_cmd('retention', 'add', test_path, retention_spec_or_period='1M', fs='backup_fs') + self.fs_snap_schedule_cmd('list', test_path, fs='backup_fs', format='json') + self.fs_snap_schedule_cmd('status', test_path, fs='backup_fs', format='json') + self.fs_snap_schedule_cmd('retention', 'remove', test_path, retention_spec_or_period='1M', fs='backup_fs') + self.fs_snap_schedule_cmd('deactivate', test_path, snap_schedule='1M', fs='backup_fs') + self.fs_snap_schedule_cmd('remove', test_path, snap_schedule='1M', fs='backup_fs') + + self.mount_a.run_shell(['rmdir', test_path]) diff --git a/qa/tasks/cephfs/test_snapshots.py b/qa/tasks/cephfs/test_snapshots.py new file mode 100644 index 000000000..608dcc81f --- /dev/null +++ b/qa/tasks/cephfs/test_snapshots.py @@ -0,0 +1,605 @@ +import errno +import logging +import signal +from textwrap import dedent +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.orchestra.run import Raw +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + +MDS_RESTART_GRACE = 60 + +class TestSnapshots(CephFSTestCase): + MDSS_REQUIRED = 3 + LOAD_SETTINGS = ["mds_max_snaps_per_dir"] + + def _check_subtree(self, rank, path, status=None): + got_subtrees = self.fs.rank_asok(["get", "subtrees"], rank=rank, status=status) + for s in got_subtrees: + if s['dir']['path'] == path and s['auth_first'] == rank: + return True + return False + + def _get_snapclient_dump(self, rank=0, status=None): + return self.fs.rank_asok(["dump", "snaps"], rank=rank, status=status) + + def _get_snapserver_dump(self, rank=0, status=None): + return self.fs.rank_asok(["dump", "snaps", "--server"], rank=rank, status=status) + + def _get_last_created_snap(self, rank=0, status=None): + return int(self._get_snapserver_dump(rank,status=status)["last_created"]) + + def _get_last_destroyed_snap(self, rank=0, status=None): + return 
int(self._get_snapserver_dump(rank,status=status)["last_destroyed"]) + + def _get_pending_snap_update(self, rank=0, status=None): + return self._get_snapserver_dump(rank,status=status)["pending_update"] + + def _get_pending_snap_destroy(self, rank=0, status=None): + return self._get_snapserver_dump(rank,status=status)["pending_destroy"] + + def test_allow_new_snaps_config(self): + """ + Check whether 'allow_new_snaps' setting works + """ + self.mount_a.run_shell(["mkdir", "test-allow-snaps"]) + + self.fs.set_allow_new_snaps(False); + try: + self.mount_a.run_shell(["mkdir", "test-allow-snaps/.snap/snap00"]) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM, "expected EPERM") + else: + self.fail("expected snap creation to fail") + + self.fs.set_allow_new_snaps(True); + self.mount_a.run_shell(["mkdir", "test-allow-snaps/.snap/snap00"]) + self.mount_a.run_shell(["rmdir", "test-allow-snaps/.snap/snap00"]) + self.mount_a.run_shell(["rmdir", "test-allow-snaps"]) + + def test_kill_mdstable(self): + """ + check snaptable transaction handling + """ + if not isinstance(self.mount_a, FuseMount): + self.skipTest("Requires FUSE client to forcibly kill mount") + + self.fs.set_allow_new_snaps(True); + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + + # setup subtrees + self.mount_a.run_shell(["mkdir", "-p", "d1/dir"]) + self.mount_a.setfattr("d1", "ceph.dir.pin", "1") + self._wait_subtrees([("/d1", 1)], rank=1, path="/d1") + + last_created = self._get_last_created_snap(rank=0,status=status) + + # mds_kill_mdstable_at: + # 1: MDSTableServer::handle_prepare + # 2: MDSTableServer::_prepare_logged + # 5: MDSTableServer::handle_commit + # 6: MDSTableServer::_commit_logged + for i in [1,2,5,6]: + log.info("testing snapserver mds_kill_mdstable_at={0}".format(i)) + + status = self.fs.status() + rank0 = self.fs.get_rank(rank=0, status=status) + self.fs.rank_freeze(True, rank=0) + self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status) + proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s1{0}".format(i)], wait=False) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout); + self.delete_mds_coredump(rank0['name']); + + self.fs.rank_fail(rank=0) + self.fs.mds_restart(rank0['name']) + self.wait_for_daemon_start([rank0['name']]) + status = self.fs.wait_for_daemons() + + proc.wait() + last_created += 1 + self.wait_until_true(lambda: self._get_last_created_snap(rank=0) == last_created, timeout=30) + + self.set_conf("mds", "mds_reconnect_timeout", "5") + + self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")]) + + # set mds_kill_mdstable_at, also kill snapclient + for i in [2,5,6]: + log.info("testing snapserver mds_kill_mdstable_at={0}, also kill snapclient".format(i)) + status = self.fs.status() + last_created = self._get_last_created_snap(rank=0, status=status) + + rank0 = self.fs.get_rank(rank=0, status=status) + rank1 = self.fs.get_rank(rank=1, status=status) + self.fs.rank_freeze(True, rank=0) # prevent failover... + self.fs.rank_freeze(True, rank=1) # prevent failover...
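# Editor's note (descriptive comment, not part of the upstream patch): each of these
# loops arms a mds_kill_mdstable_at injection point over the admin socket, issues a
# snapshot mkdir in the background so the snaptable prepare/commit path is exercised,
# waits for the targeted rank to be reported laggy after the injected crash, clears the
# expected coredump, and then restarts the daemons to verify that pending snaptable
# updates are either replayed or rolled back correctly.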
+ self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=0, status=status) + proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s2{0}".format(i)], wait=False) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout); + self.delete_mds_coredump(rank0['name']); + + self.fs.rank_signal(signal.SIGKILL, rank=1) + + self.mount_a.kill() + self.mount_a.kill_cleanup() + + self.fs.rank_fail(rank=0) + self.fs.mds_restart(rank0['name']) + self.wait_for_daemon_start([rank0['name']]) + + self.fs.wait_for_state('up:resolve', rank=0, timeout=MDS_RESTART_GRACE) + if i in [2,5]: + self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1) + elif i == 6: + self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0) + self.assertGreater(self._get_last_created_snap(rank=0), last_created) + + self.fs.rank_fail(rank=1) + self.fs.mds_restart(rank1['name']) + self.wait_for_daemon_start([rank1['name']]) + self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE) + + if i in [2,5]: + self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30) + if i == 2: + self.assertEqual(self._get_last_created_snap(rank=0), last_created) + else: + self.assertGreater(self._get_last_created_snap(rank=0), last_created) + + self.mount_a.mount_wait() + + self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")]) + + # mds_kill_mdstable_at: + # 3: MDSTableClient::handle_request (got agree) + # 4: MDSTableClient::commit + # 7: MDSTableClient::handle_request (got ack) + for i in [3,4,7]: + log.info("testing snapclient mds_kill_mdstable_at={0}".format(i)) + last_created = self._get_last_created_snap(rank=0) + + status = self.fs.status() + rank1 = self.fs.get_rank(rank=1, status=status) + self.fs.rank_freeze(True, rank=1) # prevent failover... 
+ self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "{0}".format(i)], rank=1, status=status) + proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s3{0}".format(i)], wait=False) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout); + self.delete_mds_coredump(rank1['name']); + + self.mount_a.kill() + self.mount_a.kill_cleanup() + + if i in [3,4]: + self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1) + elif i == 7: + self.assertEqual(len(self._get_pending_snap_update(rank=0)), 0) + self.assertGreater(self._get_last_created_snap(rank=0), last_created) + + self.fs.rank_fail(rank=1) + self.fs.mds_restart(rank1['name']) + self.wait_for_daemon_start([rank1['name']]) + status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE) + + if i in [3,4]: + self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30) + if i == 3: + self.assertEqual(self._get_last_created_snap(rank=0), last_created) + else: + self.assertGreater(self._get_last_created_snap(rank=0), last_created) + + self.mount_a.mount_wait() + + self.mount_a.run_shell(["rmdir", Raw("d1/dir/.snap/*")]) + + # mds_kill_mdstable_at: + # 3: MDSTableClient::handle_request (got agree) + # 8: MDSTableServer::handle_rollback + log.info("testing snapclient mds_kill_mdstable_at=3, snapserver mds_kill_mdstable_at=8") + last_created = self._get_last_created_snap(rank=0) + + status = self.fs.status() + rank0 = self.fs.get_rank(rank=0, status=status) + rank1 = self.fs.get_rank(rank=1, status=status) + self.fs.rank_freeze(True, rank=0) + self.fs.rank_freeze(True, rank=1) + self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "8"], rank=0, status=status) + self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "3"], rank=1, status=status) + proc = self.mount_a.run_shell(["mkdir", "d1/dir/.snap/s4"], wait=False) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=1), timeout=self.fs.beacon_timeout); + self.delete_mds_coredump(rank1['name']); + + self.mount_a.kill() + self.mount_a.kill_cleanup() + + self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1) + + self.fs.rank_fail(rank=1) + self.fs.mds_restart(rank1['name']) + self.wait_for_daemon_start([rank1['name']]) + + # rollback triggers assertion + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=0), timeout=self.fs.beacon_timeout); + self.delete_mds_coredump(rank0['name']); + self.fs.rank_fail(rank=0) + self.fs.mds_restart(rank0['name']) + self.wait_for_daemon_start([rank0['name']]) + self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE) + + # mds.1 should re-send rollback message + self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30) + self.assertEqual(self._get_last_created_snap(rank=0), last_created) + + self.mount_a.mount_wait() + + def test_snapclient_cache(self): + """ + check if snapclient cache gets synced properly + """ + self.fs.set_allow_new_snaps(True); + self.fs.set_max_mds(3) + status = self.fs.wait_for_daemons() + + self.mount_a.run_shell(["mkdir", "-p", "d0/d1/dir"]) + self.mount_a.run_shell(["mkdir", "-p", "d0/d2/dir"]) + self.mount_a.setfattr("d0", "ceph.dir.pin", "0") + self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1") + self.mount_a.setfattr("d0/d2", "ceph.dir.pin", "2") + self._wait_subtrees([("/d0", 0), ("/d0/d1", 1), ("/d0/d2", 2)], rank="all", status=status, path="/d0") + + def _check_snapclient_cache(snaps_dump, cache_dump=None, rank=0): + if cache_dump is None: + 
cache_dump = self._get_snapclient_dump(rank=rank) + for key, value in cache_dump.items(): + if value != snaps_dump[key]: + return False + return True; + + # sync after mksnap + last_created = self._get_last_created_snap(rank=0) + self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s1", "d0/d1/dir/.snap/s2"]) + self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30) + self.assertGreater(self._get_last_created_snap(rank=0), last_created) + + snaps_dump = self._get_snapserver_dump(rank=0) + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0)); + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1)); + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2)); + + # sync after rmsnap + last_destroyed = self._get_last_destroyed_snap(rank=0) + self.mount_a.run_shell(["rmdir", "d0/d1/dir/.snap/s1"]) + self.wait_until_true(lambda: len(self._get_pending_snap_destroy(rank=0)) == 0, timeout=30) + self.assertGreater(self._get_last_destroyed_snap(rank=0), last_destroyed) + + snaps_dump = self._get_snapserver_dump(rank=0) + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0)); + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1)); + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2)); + + # sync during mds recovers + self.fs.rank_fail(rank=2) + status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE) + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2)); + + self.fs.rank_fail(rank=0) + self.fs.rank_fail(rank=1) + status = self.fs.wait_for_daemons() + self.fs.wait_for_state('up:active', rank=0, timeout=MDS_RESTART_GRACE) + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=0)); + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=1)); + self.assertTrue(_check_snapclient_cache(snaps_dump, rank=2)); + + # kill at MDSTableClient::handle_notify_prep + status = self.fs.status() + rank2 = self.fs.get_rank(rank=2, status=status) + self.fs.rank_freeze(True, rank=2) + self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "9"], rank=2, status=status) + proc = self.mount_a.run_shell(["mkdir", "d0/d1/dir/.snap/s3"], wait=False) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout); + self.delete_mds_coredump(rank2['name']); + + # mksnap should wait for notify ack from mds.2 + self.assertFalse(proc.finished); + + # mksnap should proceed after mds.2 fails + self.fs.rank_fail(rank=2) + self.wait_until_true(lambda: proc.finished, timeout=30); + + self.fs.mds_restart(rank2['name']) + self.wait_for_daemon_start([rank2['name']]) + status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE) + + self.mount_a.run_shell(["rmdir", Raw("d0/d1/dir/.snap/*")]) + + # kill at MDSTableClient::commit + # the recovering mds should sync all mds' cache when it enters resolve stage + self.set_conf("mds", "mds_reconnect_timeout", "5") + for i in range(1, 4): + status = self.fs.status() + rank2 = self.fs.get_rank(rank=2, status=status) + self.fs.rank_freeze(True, rank=2) + self.fs.rank_asok(['config', 'set', "mds_kill_mdstable_at", "4"], rank=2, status=status) + last_created = self._get_last_created_snap(rank=0) + proc = self.mount_a.run_shell(["mkdir", "d0/d2/dir/.snap/s{0}".format(i)], wait=False) + self.wait_until_true(lambda: "laggy_since" in self.fs.get_rank(rank=2), timeout=self.fs.beacon_timeout); + self.delete_mds_coredump(rank2['name']); + + self.mount_a.kill() + self.mount_a.kill_cleanup() + + self.assertEqual(len(self._get_pending_snap_update(rank=0)), 1) + + if i in 
[2,4]: + self.fs.rank_fail(rank=0) + if i in [3,4]: + self.fs.rank_fail(rank=1) + + self.fs.rank_fail(rank=2) + self.fs.mds_restart(rank2['name']) + self.wait_for_daemon_start([rank2['name']]) + status = self.fs.wait_for_daemons(timeout=MDS_RESTART_GRACE) + + rank0_cache = self._get_snapclient_dump(rank=0) + rank1_cache = self._get_snapclient_dump(rank=1) + rank2_cache = self._get_snapclient_dump(rank=2) + + self.assertGreater(int(rank0_cache["last_created"]), last_created) + self.assertEqual(rank0_cache, rank1_cache); + self.assertEqual(rank0_cache, rank2_cache); + + self.wait_until_true(lambda: len(self._get_pending_snap_update(rank=0)) == 0, timeout=30) + + snaps_dump = self._get_snapserver_dump(rank=0) + self.assertEqual(snaps_dump["last_created"], rank0_cache["last_created"]) + self.assertTrue(_check_snapclient_cache(snaps_dump, cache_dump=rank0_cache)); + + self.mount_a.mount_wait() + + self.mount_a.run_shell(["rmdir", Raw("d0/d2/dir/.snap/*")]) + + def test_multimds_mksnap(self): + """ + check if snapshot takes effect across authority subtrees + """ + self.fs.set_allow_new_snaps(True); + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + + self.mount_a.run_shell(["mkdir", "-p", "d0/d1/empty"]) + self.mount_a.setfattr("d0", "ceph.dir.pin", "0") + self.mount_a.setfattr("d0/d1", "ceph.dir.pin", "1") + self._wait_subtrees([("/d0", 0), ("/d0/d1", 1)], rank="all", status=status, path="/d0") + + self.mount_a.write_test_pattern("d0/d1/file_a", 8 * 1024 * 1024) + self.mount_a.run_shell(["mkdir", "d0/.snap/s1"]) + self.mount_a.run_shell(["rm", "-f", "d0/d1/file_a"]) + self.mount_a.validate_test_pattern("d0/.snap/s1/d1/file_a", 8 * 1024 * 1024) + + self.mount_a.run_shell(["rmdir", "d0/.snap/s1"]) + self.mount_a.run_shell(["rm", "-rf", "d0"]) + + def test_multimds_past_parents(self): + """ + check if past parents are properly recorded during across authority rename + """ + self.fs.set_allow_new_snaps(True); + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + + self.mount_a.run_shell_payload("mkdir -p {d0,d1}/empty") + self.mount_a.setfattr("d0", "ceph.dir.pin", "0") + self.mount_a.setfattr("d1", "ceph.dir.pin", "1") + self._wait_subtrees([("/d0", 0), ("/d1", 1)], rank=0, status=status) + + self.mount_a.run_shell(["mkdir", "d0/d3"]) + self.mount_a.run_shell(["mkdir", "d0/.snap/s1"]) + snap_name = self.mount_a.run_shell(["ls", "d0/d3/.snap"]).stdout.getvalue() + + self.mount_a.run_shell(["mv", "d0/d3", "d1/d3"]) + snap_name1 = self.mount_a.run_shell(["ls", "d1/d3/.snap"]).stdout.getvalue() + self.assertEqual(snap_name1, snap_name); + + self.mount_a.run_shell(["rmdir", "d0/.snap/s1"]) + snap_name1 = self.mount_a.run_shell(["ls", "d1/d3/.snap"]).stdout.getvalue() + self.assertEqual(snap_name1, ""); + + self.mount_a.run_shell(["rm", "-rf", "d0", "d1"]) + + def test_multimds_hardlink(self): + """ + check if hardlink snapshot works in multimds setup + """ + self.fs.set_allow_new_snaps(True); + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + + self.mount_a.run_shell_payload("mkdir -p {d0,d1}/empty") + + self.mount_a.setfattr("d0", "ceph.dir.pin", "0") + self.mount_a.setfattr("d1", "ceph.dir.pin", "1") + self._wait_subtrees([("/d0", 0), ("/d1", 1)], rank=0, status=status) + + self.mount_a.run_python(dedent(""" + import os + open(os.path.join("{path}", "d0/file1"), 'w').write("asdf") + open(os.path.join("{path}", "d0/file2"), 'w').write("asdf") + """.format(path=self.mount_a.mountpoint) + )) + + self.mount_a.run_shell(["ln", "d0/file1", "d1/file1"]) + 
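# Editor's note (descriptive comment, not part of the upstream patch): file1 and file2
# live under d0 (pinned to rank 0) and are also hard-linked into d1 (pinned to rank 1);
# the snapshot taken on d1 below must keep serving the original "asdf" content even
# after the primaries under d0 are overwritten or unlinked, which is what the greps
# against d1/.snap/s1 verify.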
self.mount_a.run_shell(["ln", "d0/file2", "d1/file2"]) + + self.mount_a.run_shell(["mkdir", "d1/.snap/s1"]) + + self.mount_a.run_python(dedent(""" + import os + open(os.path.join("{path}", "d0/file1"), 'w').write("qwer") + """.format(path=self.mount_a.mountpoint) + )) + + self.mount_a.run_shell(["grep", "asdf", "d1/.snap/s1/file1"]) + + self.mount_a.run_shell(["rm", "-f", "d0/file2"]) + self.mount_a.run_shell(["grep", "asdf", "d1/.snap/s1/file2"]) + + self.mount_a.run_shell(["rm", "-f", "d1/file2"]) + self.mount_a.run_shell(["grep", "asdf", "d1/.snap/s1/file2"]) + + self.mount_a.run_shell(["rmdir", "d1/.snap/s1"]) + self.mount_a.run_shell(["rm", "-rf", "d0", "d1"]) + + class SnapLimitViolationException(Exception): + failed_snapshot_number = -1 + + def __init__(self, num): + self.failed_snapshot_number = num + + def get_snap_name(self, dir_name, sno): + sname = "{dir_name}/.snap/s_{sno}".format(dir_name=dir_name, sno=sno) + return sname + + def create_snap_dir(self, sname): + self.mount_a.run_shell(["mkdir", sname]) + + def delete_dir_and_snaps(self, dir_name, snaps): + for sno in range(1, snaps+1, 1): + sname = self.get_snap_name(dir_name, sno) + self.mount_a.run_shell(["rmdir", sname]) + self.mount_a.run_shell(["rmdir", dir_name]) + + def create_dir_and_snaps(self, dir_name, snaps): + self.mount_a.run_shell(["mkdir", dir_name]) + + for sno in range(1, snaps+1, 1): + sname = self.get_snap_name(dir_name, sno) + try: + self.create_snap_dir(sname) + except CommandFailedError as e: + # failing at the last mkdir beyond the limit is expected + if sno == snaps: + log.info("failed while creating snap #{}: {}".format(sno, repr(e))) + raise TestSnapshots.SnapLimitViolationException(sno) + + def test_mds_max_snaps_per_dir_default_limit(self): + """ + Test the newly introudced option named mds_max_snaps_per_dir + Default snaps limit is 100 + Test if the default number of snapshot directories can be created + """ + self.create_dir_and_snaps("accounts", int(self.mds_max_snaps_per_dir)) + self.delete_dir_and_snaps("accounts", int(self.mds_max_snaps_per_dir)) + + def test_mds_max_snaps_per_dir_with_increased_limit(self): + """ + Test the newly introudced option named mds_max_snaps_per_dir + First create 101 directories and ensure that the 101st directory + creation fails. Then increase the default by one and see if the + additional directory creation succeeds + """ + # first test the default limit + new_limit = int(self.mds_max_snaps_per_dir) + self.fs.rank_asok(['config', 'set', 'mds_max_snaps_per_dir', repr(new_limit)]) + try: + self.create_dir_and_snaps("accounts", new_limit + 1) + except TestSnapshots.SnapLimitViolationException as e: + if e.failed_snapshot_number == (new_limit + 1): + pass + # then increase the limit by one and test + new_limit = new_limit + 1 + self.fs.rank_asok(['config', 'set', 'mds_max_snaps_per_dir', repr(new_limit)]) + sname = self.get_snap_name("accounts", new_limit) + self.create_snap_dir(sname) + self.delete_dir_and_snaps("accounts", new_limit) + + def test_mds_max_snaps_per_dir_with_reduced_limit(self): + """ + Test the newly introudced option named mds_max_snaps_per_dir + First create 99 directories. Then reduce the limit to 98. Then try + creating another directory and ensure that additional directory + creation fails. 
+ """ + # first test the new limit + new_limit = int(self.mds_max_snaps_per_dir) - 1 + self.create_dir_and_snaps("accounts", new_limit) + sname = self.get_snap_name("accounts", new_limit + 1) + # then reduce the limit by one and test + new_limit = new_limit - 1 + self.fs.rank_asok(['config', 'set', 'mds_max_snaps_per_dir', repr(new_limit)]) + try: + self.create_snap_dir(sname) + except CommandFailedError: + # after reducing limit we expect the new snapshot creation to fail + pass + self.delete_dir_and_snaps("accounts", new_limit + 1) + + +class TestMonSnapsAndFsPools(CephFSTestCase): + MDSS_REQUIRED = 3 + + def test_disallow_monitor_managed_snaps_for_fs_pools(self): + """ + Test that creation of monitor managed snaps fails for pools attached + to any file-system + """ + with self.assertRaises(CommandFailedError): + self.fs.rados(["mksnap", "snap1"], pool=self.fs.get_data_pool_name()) + + with self.assertRaises(CommandFailedError): + self.fs.rados(["mksnap", "snap2"], pool=self.fs.get_metadata_pool_name()) + + with self.assertRaises(CommandFailedError): + test_pool_name = self.fs.get_data_pool_name() + base_cmd = f'osd pool mksnap {test_pool_name} snap3' + self.run_cluster_cmd(base_cmd) + + with self.assertRaises(CommandFailedError): + test_pool_name = self.fs.get_metadata_pool_name() + base_cmd = f'osd pool mksnap {test_pool_name} snap4' + self.run_cluster_cmd(base_cmd) + + def test_attaching_pools_with_snaps_to_fs_fails(self): + """ + Test that attempt to attach pool with snapshots to an fs fails + """ + test_pool_name = 'snap-test-pool' + base_cmd = f'osd pool create {test_pool_name}' + ret = self.run_cluster_cmd_result(base_cmd) + self.assertEqual(ret, 0) + + self.fs.rados(["mksnap", "snap3"], pool=test_pool_name) + + base_cmd = f'fs add_data_pool {self.fs.name} {test_pool_name}' + ret = self.run_cluster_cmd_result(base_cmd) + self.assertEqual(ret, errno.EOPNOTSUPP) + + # cleanup + self.fs.rados(["rmsnap", "snap3"], pool=test_pool_name) + base_cmd = f'osd pool delete {test_pool_name}' + ret = self.run_cluster_cmd_result(base_cmd) + + def test_using_pool_with_snap_fails_fs_creation(self): + """ + Test that using a pool with snaps for fs creation fails + """ + base_cmd = 'osd pool create test_data_pool' + ret = self.run_cluster_cmd_result(base_cmd) + self.assertEqual(ret, 0) + base_cmd = 'osd pool create test_metadata_pool' + ret = self.run_cluster_cmd_result(base_cmd) + self.assertEqual(ret, 0) + + self.fs.rados(["mksnap", "snap4"], pool='test_data_pool') + + base_cmd = 'fs new testfs test_metadata_pool test_data_pool' + ret = self.run_cluster_cmd_result(base_cmd) + self.assertEqual(ret, errno.EOPNOTSUPP) + + # cleanup + self.fs.rados(["rmsnap", "snap4"], pool='test_data_pool') + base_cmd = 'osd pool delete test_data_pool' + ret = self.run_cluster_cmd_result(base_cmd) + base_cmd = 'osd pool delete test_metadata_pool' + ret = self.run_cluster_cmd_result(base_cmd) diff --git a/qa/tasks/cephfs/test_strays.py b/qa/tasks/cephfs/test_strays.py new file mode 100644 index 000000000..8bdc126e2 --- /dev/null +++ b/qa/tasks/cephfs/test_strays.py @@ -0,0 +1,1027 @@ +import json +import time +import logging +from textwrap import dedent +import datetime +import gevent + +from teuthology.exceptions import CommandFailedError +from teuthology.orchestra.run import Raw +from tasks.cephfs.cephfs_test_case import CephFSTestCase, for_teuthology + +log = logging.getLogger(__name__) + + +class TestStrays(CephFSTestCase): + MDSS_REQUIRED = 2 + + OPS_THROTTLE = 1 + FILES_THROTTLE = 2 + + # Range of different file 
sizes used in throttle test's workload + throttle_workload_size_range = 16 + + @for_teuthology + def test_ops_throttle(self): + self._test_throttling(self.OPS_THROTTLE) + + @for_teuthology + def test_files_throttle(self): + self._test_throttling(self.FILES_THROTTLE) + + def test_dir_deletion(self): + """ + That when deleting a bunch of dentries and the containing + directory, everything gets purged. + Catches cases where the client might e.g. fail to trim + the unlinked dir from its cache. + """ + file_count = 1000 + create_script = dedent(""" + import os + + mountpoint = "{mountpoint}" + subdir = "delete_me" + size = {size} + file_count = {file_count} + os.mkdir(os.path.join(mountpoint, subdir)) + for i in range(0, file_count): + filename = "{{0}}_{{1}}.bin".format(i, size) + with open(os.path.join(mountpoint, subdir, filename), 'w') as f: + f.write(size * 'x') + """.format( + mountpoint=self.mount_a.mountpoint, + size=1024, + file_count=file_count + )) + + self.mount_a.run_python(create_script) + + # That the dirfrag object is created + self.fs.mds_asok(["flush", "journal"]) + dir_ino = self.mount_a.path_to_ino("delete_me") + self.assertTrue(self.fs.dirfrag_exists(dir_ino, 0)) + + # Remove everything + self.mount_a.run_shell(["rm", "-rf", "delete_me"]) + self.fs.mds_asok(["flush", "journal"]) + + # That all the removed files get created as strays + strays = self.get_mdc_stat("strays_created") + self.assertEqual(strays, file_count + 1) + + # That the strays all get enqueued for purge + self.wait_until_equal( + lambda: self.get_mdc_stat("strays_enqueued"), + strays, + timeout=600 + + ) + + # That all the purge operations execute + self.wait_until_equal( + lambda: self.get_stat("purge_queue", "pq_executed"), + strays, + timeout=600 + ) + + # That finally, the directory metadata object is gone + self.assertFalse(self.fs.dirfrag_exists(dir_ino, 0)) + + # That finally, the data objects are all gone + self.await_data_pool_empty() + + def _test_throttling(self, throttle_type): + self.data_log = [] + try: + return self._do_test_throttling(throttle_type) + except: + for l in self.data_log: + log.info(",".join([l_.__str__() for l_ in l])) + raise + + def _do_test_throttling(self, throttle_type): + """ + That the mds_max_purge_ops setting is respected + """ + + def set_throttles(files, ops): + """ + Helper for updating ops/files limits, and calculating effective + ops_per_pg setting to give the same ops limit. + """ + self.set_conf('mds', 'mds_max_purge_files', "%d" % files) + self.set_conf('mds', 'mds_max_purge_ops', "%d" % ops) + + pgs = self.fs.mon_manager.get_pool_int_property( + self.fs.get_data_pool_name(), + "pg_num" + ) + ops_per_pg = float(ops) / pgs + self.set_conf('mds', 'mds_max_purge_ops_per_pg', "%s" % ops_per_pg) + + # Test conditions depend on what we're going to be exercising. + # * Lift the threshold on whatever throttle we are *not* testing, so + # that the throttle of interest is the one that will be the bottleneck + # * Create either many small files (test file count throttling) or fewer + # large files (test op throttling) + if throttle_type == self.OPS_THROTTLE: + set_throttles(files=100000000, ops=16) + size_unit = 1024 * 1024 # big files, generate lots of ops + file_multiplier = 100 + elif throttle_type == self.FILES_THROTTLE: + # The default value of file limit is pretty permissive, so to avoid + # the test running too fast, create lots of files and set the limit + # pretty low. 
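# Editor's note (illustrative sizing arithmetic, not part of the upstream patch): with
# file_multiplier = 200 and throttle_workload_size_range = 16 the workload below creates
# 200 * 16 = 3200 small files plus the containing directory, i.e. total_inodes = 3201
# strays to purge, while mds_max_purge_files is pinned at 6 so that the file-count
# throttle is the bottleneck being measured.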
+ set_throttles(ops=100000000, files=6) + size_unit = 1024 # small, numerous files + file_multiplier = 200 + else: + raise NotImplementedError(throttle_type) + + # Pick up config changes + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + + create_script = dedent(""" + import os + + mountpoint = "{mountpoint}" + subdir = "delete_me" + size_unit = {size_unit} + file_multiplier = {file_multiplier} + os.mkdir(os.path.join(mountpoint, subdir)) + for i in range(0, file_multiplier): + for size in range(0, {size_range}*size_unit, size_unit): + filename = "{{0}}_{{1}}.bin".format(i, size // size_unit) + with open(os.path.join(mountpoint, subdir, filename), 'w') as f: + f.write(size * 'x') + """.format( + mountpoint=self.mount_a.mountpoint, + size_unit=size_unit, + file_multiplier=file_multiplier, + size_range=self.throttle_workload_size_range + )) + + self.mount_a.run_python(create_script) + + # We will run the deletion in the background, to reduce the risk of it completing before + # we have started monitoring the stray statistics. + def background(): + self.mount_a.run_shell(["rm", "-rf", "delete_me"]) + self.fs.mds_asok(["flush", "journal"]) + + background_thread = gevent.spawn(background) + + total_inodes = file_multiplier * self.throttle_workload_size_range + 1 + mds_max_purge_ops = int(self.fs.get_config("mds_max_purge_ops", 'mds')) + mds_max_purge_files = int(self.fs.get_config("mds_max_purge_files", 'mds')) + + # During this phase we look for the concurrent ops to exceed half + # the limit (a heuristic) and not exceed the limit (a correctness + # condition). + purge_timeout = 600 + elapsed = 0 + files_high_water = 0 + ops_high_water = 0 + + while True: + stats = self.fs.mds_asok(['perf', 'dump']) + mdc_stats = stats['mds_cache'] + pq_stats = stats['purge_queue'] + if elapsed >= purge_timeout: + raise RuntimeError("Timeout waiting for {0} inodes to purge, stats:{1}".format(total_inodes, mdc_stats)) + + num_strays = mdc_stats['num_strays'] + num_strays_purging = pq_stats['pq_executing'] + num_purge_ops = pq_stats['pq_executing_ops'] + files_high_water = pq_stats['pq_executing_high_water'] + ops_high_water = pq_stats['pq_executing_ops_high_water'] + + self.data_log.append([datetime.datetime.now(), num_strays, num_strays_purging, num_purge_ops, files_high_water, ops_high_water]) + + total_strays_created = mdc_stats['strays_created'] + total_strays_purged = pq_stats['pq_executed'] + + if total_strays_purged == total_inodes: + log.info("Complete purge in {0} seconds".format(elapsed)) + break + elif total_strays_purged > total_inodes: + raise RuntimeError("Saw more strays than expected, mdc stats: {0}".format(mdc_stats)) + else: + if throttle_type == self.OPS_THROTTLE: + # 11 is filer_max_purge_ops plus one for the backtrace: + # limit is allowed to be overshot by this much. + if num_purge_ops > mds_max_purge_ops + 11: + raise RuntimeError("num_purge_ops violates threshold {0}/{1}".format( + num_purge_ops, mds_max_purge_ops + )) + elif throttle_type == self.FILES_THROTTLE: + if num_strays_purging > mds_max_purge_files: + raise RuntimeError("num_strays_purging violates threshold {0}/{1}".format( + num_strays_purging, mds_max_purge_files + )) + else: + raise NotImplementedError(throttle_type) + + log.info("Waiting for purge to complete {0}/{1}, {2}/{3}".format( + num_strays_purging, num_strays, + total_strays_purged, total_strays_created + )) + time.sleep(1) + elapsed += 1 + + background_thread.join() + + # Check that we got up to a respectable rate during the purge. 
This is totally + # racy, but should be safeish unless the cluster is pathologically slow, or + # insanely fast such that the deletions all pass before we have polled the + # statistics. + if throttle_type == self.OPS_THROTTLE: + if ops_high_water < mds_max_purge_ops // 2: + raise RuntimeError("Ops in flight high water is unexpectedly low ({0} / {1})".format( + ops_high_water, mds_max_purge_ops + )) + # The MDS may go over mds_max_purge_ops for some items, like a + # heavily fragmented directory. The throttle does not kick in + # until *after* we reach or exceed the limit. This is expected + # because we don't want to starve the PQ or never purge a + # particularly large file/directory. + self.assertLessEqual(ops_high_water, mds_max_purge_ops+64) + elif throttle_type == self.FILES_THROTTLE: + if files_high_water < mds_max_purge_files // 2: + raise RuntimeError("Files in flight high water is unexpectedly low ({0} / {1})".format( + files_high_water, mds_max_purge_files + )) + self.assertLessEqual(files_high_water, mds_max_purge_files) + + # Sanity check all MDC stray stats + stats = self.fs.mds_asok(['perf', 'dump']) + mdc_stats = stats['mds_cache'] + pq_stats = stats['purge_queue'] + self.assertEqual(mdc_stats['num_strays'], 0) + self.assertEqual(mdc_stats['num_strays_delayed'], 0) + self.assertEqual(pq_stats['pq_executing'], 0) + self.assertEqual(pq_stats['pq_executing_ops'], 0) + self.assertEqual(mdc_stats['strays_created'], total_inodes) + self.assertEqual(mdc_stats['strays_enqueued'], total_inodes) + self.assertEqual(pq_stats['pq_executed'], total_inodes) + + def get_mdc_stat(self, name, mds_id=None): + return self.get_stat("mds_cache", name, mds_id) + + def get_stat(self, subsys, name, mds_id=None): + return self.fs.mds_asok(['perf', 'dump', subsys, name], + mds_id=mds_id)[subsys][name] + + def _wait_for_counter(self, subsys, counter, expect_val, timeout=60, + mds_id=None): + self.wait_until_equal( + lambda: self.get_stat(subsys, counter, mds_id), + expect_val=expect_val, timeout=timeout, + reject_fn=lambda x: x > expect_val + ) + + def test_open_inode(self): + """ + That the case of a dentry unlinked while a client holds an + inode open is handled correctly. + + The inode should be moved into a stray dentry, while the original + dentry and directory should be purged. + + The inode's data should be purged when the client eventually closes + it. 
+ """ + mount_a_client_id = self.mount_a.get_global_id() + + # Write some bytes to a file + size_mb = 8 + + # Hold the file open + p = self.mount_a.open_background("open_file") + self.mount_a.write_n_mb("open_file", size_mb) + open_file_ino = self.mount_a.path_to_ino("open_file") + + self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2) + + # Unlink the dentry + self.mount_a.run_shell(["rm", "-f", "open_file"]) + + # Wait to see the stray count increment + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=1, timeout=60, reject_fn=lambda x: x > 1) + + # See that while the stray count has incremented, none have passed + # on to the purge queue + self.assertEqual(self.get_mdc_stat("strays_created"), 1) + self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0) + + # See that the client still holds 2 caps + self.assertEqual(self.get_session(mount_a_client_id)['num_caps'], 2) + + # See that the data objects remain in the data pool + self.assertTrue(self.fs.data_objects_present(open_file_ino, size_mb * 1024 * 1024)) + + # Now close the file + self.mount_a.kill_background(p) + + # Wait to see the client cap count decrement + self.wait_until_equal( + lambda: self.get_session(mount_a_client_id)['num_caps'], + expect_val=1, timeout=60, reject_fn=lambda x: x > 2 or x < 1 + ) + # Wait to see the purge counter increment, stray count go to zero + self._wait_for_counter("mds_cache", "strays_enqueued", 1) + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, timeout=6, reject_fn=lambda x: x > 1 + ) + self._wait_for_counter("purge_queue", "pq_executed", 1) + + # See that the data objects no longer exist + self.assertTrue(self.fs.data_objects_absent(open_file_ino, size_mb * 1024 * 1024)) + + self.await_data_pool_empty() + + def test_reintegration_limit(self): + """ + That the reintegration is not blocked by full directories. + """ + + LOW_LIMIT = 50 + self.config_set('mds', 'mds_bal_fragment_size_max', str(LOW_LIMIT)) + time.sleep(10) # for config to reach MDS; async create is fast!! + + last_reintegrated = self.get_mdc_stat("strays_reintegrated") + self.mount_a.run_shell_payload(""" + mkdir a b + for i in `seq 1 50`; do + touch a/"$i" + ln a/"$i" b/"$i" + done + sync -f a b + rm a/* + """) + + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, + timeout=60 + ) + curr_reintegrated = self.get_mdc_stat("strays_reintegrated") + self.assertGreater(curr_reintegrated, last_reintegrated) + + + def test_hardlink_reintegration(self): + """ + That removal of primary dentry of hardlinked inode results + in reintegration of inode into the previously-remote dentry, + rather than lingering as a stray indefinitely. + """ + # Write some bytes to file_a + size_mb = 8 + self.mount_a.run_shell(["mkdir", "dir_1"]) + self.mount_a.write_n_mb("dir_1/file_a", size_mb) + ino = self.mount_a.path_to_ino("dir_1/file_a") + + # Create a hardlink named file_b + self.mount_a.run_shell(["mkdir", "dir_2"]) + self.mount_a.run_shell(["ln", "dir_1/file_a", "dir_2/file_b"]) + self.assertEqual(self.mount_a.path_to_ino("dir_2/file_b"), ino) + + # Flush journal + self.fs.mds_asok(['flush', 'journal']) + + # See that backtrace for the file points to the file_a path + pre_unlink_bt = self.fs.read_backtrace(ino) + self.assertEqual(pre_unlink_bt['ancestors'][0]['dname'], "file_a") + + # empty mds cache. 
otherwise mds reintegrates stray when unlink finishes + self.mount_a.umount_wait() + self.fs.mds_asok(['flush', 'journal']) + self.fs.mds_fail_restart() + self.fs.wait_for_daemons() + self.mount_a.mount_wait() + + # Unlink file_a + self.mount_a.run_shell(["rm", "-f", "dir_1/file_a"]) + + # See that a stray was created + self.assertEqual(self.get_mdc_stat("num_strays"), 1) + self.assertEqual(self.get_mdc_stat("strays_created"), 1) + + # Wait, see that data objects are still present (i.e. that the + # stray did not advance to purging given time) + time.sleep(30) + self.assertTrue(self.fs.data_objects_present(ino, size_mb * 1024 * 1024)) + self.assertEqual(self.get_mdc_stat("strays_enqueued"), 0) + + # See that before reintegration, the inode's backtrace points to a stray dir + self.fs.mds_asok(['flush', 'journal']) + self.assertTrue(self.get_backtrace_path(ino).startswith("stray")) + + last_reintegrated = self.get_mdc_stat("strays_reintegrated") + + # Do a metadata operation on the remaining link (mv is heavy handed, but + # others like touch may be satisfied from caps without poking MDS) + self.mount_a.run_shell(["mv", "dir_2/file_b", "dir_2/file_c"]) + + # Stray reintegration should happen as a result of the eval_remote call + # on responding to a client request. + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, + timeout=60 + ) + + # See the reintegration counter increment + curr_reintegrated = self.get_mdc_stat("strays_reintegrated") + self.assertGreater(curr_reintegrated, last_reintegrated) + last_reintegrated = curr_reintegrated + + # Flush the journal + self.fs.mds_asok(['flush', 'journal']) + + # See that the backtrace for the file points to the remaining link's path + post_reint_bt = self.fs.read_backtrace(ino) + self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_c") + + # mds should reintegrates stray when unlink finishes + self.mount_a.run_shell(["ln", "dir_2/file_c", "dir_2/file_d"]) + self.mount_a.run_shell(["rm", "-f", "dir_2/file_c"]) + + # Stray reintegration should happen as a result of the notify_stray call + # on completion of unlink + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, + timeout=60 + ) + + # See the reintegration counter increment + curr_reintegrated = self.get_mdc_stat("strays_reintegrated") + self.assertGreater(curr_reintegrated, last_reintegrated) + last_reintegrated = curr_reintegrated + + # Flush the journal + self.fs.mds_asok(['flush', 'journal']) + + # See that the backtrace for the file points to the newest link's path + post_reint_bt = self.fs.read_backtrace(ino) + self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "file_d") + + # Now really delete it + self.mount_a.run_shell(["rm", "-f", "dir_2/file_d"]) + self._wait_for_counter("mds_cache", "strays_enqueued", 1) + self._wait_for_counter("purge_queue", "pq_executed", 1) + + self.assert_purge_idle() + self.assertTrue(self.fs.data_objects_absent(ino, size_mb * 1024 * 1024)) + + # We caused the inode to go stray 3 times + self.assertEqual(self.get_mdc_stat("strays_created"), 3) + # We purged it at the last + self.assertEqual(self.get_mdc_stat("strays_enqueued"), 1) + + def test_reintegration_via_scrub(self): + """ + That reintegration is triggered via recursive scrub. + """ + + self.mount_a.run_shell_payload(""" + mkdir -p a b + for i in `seq 1 50`; do + touch a/"$i" + ln a/"$i" b/"$i" + done + sync -f . 
+ """) + + self.mount_a.remount() # drop caps/cache + self.fs.rank_tell(["flush", "journal"]) + self.fs.rank_fail() + self.fs.wait_for_daemons() + + # only / in cache, reintegration cannot happen + self.wait_until_equal( + lambda: len(self.fs.rank_tell(["dump", "tree", "/"])), + expect_val=3, + timeout=60 + ) + + last_reintegrated = self.get_mdc_stat("strays_reintegrated") + self.mount_a.run_shell_payload(""" + rm a/* + sync -f . + """) + self.wait_until_equal( + lambda: len(self.fs.rank_tell(["dump", "tree", "/"])), + expect_val=3, + timeout=60 + ) + self.assertEqual(self.get_mdc_stat("num_strays"), 50) + curr_reintegrated = self.get_mdc_stat("strays_reintegrated") + self.assertEqual(last_reintegrated, curr_reintegrated) + + self.fs.rank_tell(["scrub", "start", "/", "recursive,force"]) + + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, + timeout=60 + ) + curr_reintegrated = self.get_mdc_stat("strays_reintegrated") + # N.B.: reintegrate (rename RPC) may be tried multiple times from different code paths + self.assertGreaterEqual(curr_reintegrated, last_reintegrated+50) + + def test_mv_hardlink_cleanup(self): + """ + That when doing a rename from A to B, and B has hardlinks, + then we make a stray for B which is then reintegrated + into one of his hardlinks. + """ + # Create file_a, file_b, and a hardlink to file_b + size_mb = 8 + self.mount_a.write_n_mb("file_a", size_mb) + file_a_ino = self.mount_a.path_to_ino("file_a") + + self.mount_a.write_n_mb("file_b", size_mb) + file_b_ino = self.mount_a.path_to_ino("file_b") + + self.mount_a.run_shell(["ln", "file_b", "linkto_b"]) + self.assertEqual(self.mount_a.path_to_ino("linkto_b"), file_b_ino) + + # mv file_a file_b + self.mount_a.run_shell(["mv", "file_a", "file_b"]) + + # Stray reintegration should happen as a result of the notify_stray call on + # completion of rename + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays"), + expect_val=0, + timeout=60 + ) + + self.assertEqual(self.get_mdc_stat("strays_created"), 1) + self.assertGreaterEqual(self.get_mdc_stat("strays_reintegrated"), 1) + + # No data objects should have been deleted, as both files still have linkage. 
+ self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024)) + self.assertTrue(self.fs.data_objects_present(file_b_ino, size_mb * 1024 * 1024)) + + self.fs.mds_asok(['flush', 'journal']) + + post_reint_bt = self.fs.read_backtrace(file_b_ino) + self.assertEqual(post_reint_bt['ancestors'][0]['dname'], "linkto_b") + + def _setup_two_ranks(self): + # Set up two MDSs + self.fs.set_max_mds(2) + + # See that we have two active MDSs + self.wait_until_equal(lambda: len(self.fs.get_active_names()), 2, 30, + reject_fn=lambda v: v > 2 or v < 1) + + active_mds_names = self.fs.get_active_names() + rank_0_id = active_mds_names[0] + rank_1_id = active_mds_names[1] + log.info("Ranks 0 and 1 are {0} and {1}".format( + rank_0_id, rank_1_id)) + + # Get rid of other MDS daemons so that it's easier to know which + # daemons to expect in which ranks after restarts + for unneeded_mds in set(self.mds_cluster.mds_ids) - {rank_0_id, rank_1_id}: + self.mds_cluster.mds_stop(unneeded_mds) + self.mds_cluster.mds_fail(unneeded_mds) + + return rank_0_id, rank_1_id + + def _force_migrate(self, path, rank=1): + """ + :param to_id: MDS id to move it to + :param path: Filesystem path (string) to move + :return: None + """ + self.mount_a.run_shell(["setfattr", "-n", "ceph.dir.pin", "-v", str(rank), path]) + rpath = "/"+path + self._wait_subtrees([(rpath, rank)], rank=rank, path=rpath) + + def _is_stopped(self, rank): + mds_map = self.fs.get_mds_map() + return rank not in [i['rank'] for i in mds_map['info'].values()] + + def test_purge_on_shutdown(self): + """ + That when an MDS rank is shut down, its purge queue is + drained in the process. + """ + rank_0_id, rank_1_id = self._setup_two_ranks() + + self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0") + self.mds_cluster.mds_fail_restart(rank_1_id) + self.fs.wait_for_daemons() + + file_count = 5 + + self.mount_a.create_n_files("delete_me/file", file_count) + + self._force_migrate("delete_me") + + self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")]) + self.mount_a.umount_wait() + + # See all the strays go into purge queue + self._wait_for_counter("mds_cache", "strays_created", file_count, mds_id=rank_1_id) + self._wait_for_counter("mds_cache", "strays_enqueued", file_count, mds_id=rank_1_id) + self.assertEqual(self.get_stat("mds_cache", "num_strays", mds_id=rank_1_id), 0) + + # See nothing get purged from the purge queue (yet) + time.sleep(10) + self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0) + + # Shut down rank 1 + self.fs.set_max_mds(1) + + # It shouldn't proceed past stopping because its still not allowed + # to purge + time.sleep(10) + self.assertEqual(self.get_stat("purge_queue", "pq_executed", mds_id=rank_1_id), 0) + self.assertFalse(self._is_stopped(1)) + + # Permit the daemon to start purging again + self.fs.mon_manager.raw_cluster_cmd('tell', 'mds.{0}'.format(rank_1_id), + 'injectargs', + "--mds_max_purge_files 100") + + # It should now proceed through shutdown + self.fs.wait_for_daemons(timeout=120) + + # ...and in the process purge all that data + self.await_data_pool_empty() + + def test_migration_on_shutdown(self): + """ + That when an MDS rank is shut down, any non-purgeable strays + get migrated to another rank. 
+ """ + + rank_0_id, rank_1_id = self._setup_two_ranks() + + # Create a non-purgeable stray in a ~mds1 stray directory + # by doing a hard link and deleting the original file + self.mount_a.run_shell_payload(""" +mkdir dir_1 dir_2 +touch dir_1/original +ln dir_1/original dir_2/linkto +""") + + self._force_migrate("dir_1") + self._force_migrate("dir_2", rank=0) + + # empty mds cache. otherwise mds reintegrates stray when unlink finishes + self.mount_a.umount_wait() + self.fs.mds_asok(['flush', 'journal'], rank_1_id) + self.fs.mds_asok(['cache', 'drop'], rank_1_id) + + self.mount_a.mount_wait() + self.mount_a.run_shell(["rm", "-f", "dir_1/original"]) + self.mount_a.umount_wait() + + self._wait_for_counter("mds_cache", "strays_created", 1, + mds_id=rank_1_id) + + # Shut down rank 1 + self.fs.set_max_mds(1) + self.fs.wait_for_daemons(timeout=120) + + # See that the stray counter on rank 0 has incremented + self.assertEqual(self.get_mdc_stat("strays_created", rank_0_id), 1) + + def test_migrate_unlinked_dir(self): + """ + Reproduce https://tracker.ceph.com/issues/53597 + """ + rank_0_id, rank_1_id = self._setup_two_ranks() + + self.mount_a.run_shell_payload(""" +mkdir pin +touch pin/placeholder +""") + + self._force_migrate("pin") + + # Hold the dir open so it cannot be purged + p = self.mount_a.open_dir_background("pin/to-be-unlinked") + + # Unlink the dentry + self.mount_a.run_shell(["rmdir", "pin/to-be-unlinked"]) + + # Wait to see the stray count increment + self.wait_until_equal( + lambda: self.get_mdc_stat("num_strays", mds_id=rank_1_id), + expect_val=1, timeout=60, reject_fn=lambda x: x > 1) + # but not purged + self.assertEqual(self.get_mdc_stat("strays_created", mds_id=rank_1_id), 1) + self.assertEqual(self.get_mdc_stat("strays_enqueued", mds_id=rank_1_id), 0) + + # Test loading unlinked dir into cache + self.fs.mds_asok(['flush', 'journal'], rank_1_id) + self.fs.mds_asok(['cache', 'drop'], rank_1_id) + + # Shut down rank 1 + self.fs.set_max_mds(1) + self.fs.wait_for_daemons(timeout=120) + # Now the stray should be migrated to rank 0 + # self.assertEqual(self.get_mdc_stat("strays_created", mds_id=rank_0_id), 1) + # https://github.com/ceph/ceph/pull/44335#issuecomment-1125940158 + + self.mount_a.kill_background(p) + + def assert_backtrace(self, ino, expected_path): + """ + Assert that the backtrace in the data pool for an inode matches + an expected /foo/bar path. + """ + expected_elements = expected_path.strip("/").split("/") + bt = self.fs.read_backtrace(ino) + actual_elements = list(reversed([dn['dname'] for dn in bt['ancestors']])) + self.assertListEqual(expected_elements, actual_elements) + + def get_backtrace_path(self, ino): + bt = self.fs.read_backtrace(ino) + elements = reversed([dn['dname'] for dn in bt['ancestors']]) + return "/".join(elements) + + def assert_purge_idle(self): + """ + Assert that the MDS perf counters indicate no strays exist and + no ongoing purge activity. Sanity check for when PurgeQueue should + be idle. + """ + mdc_stats = self.fs.mds_asok(['perf', 'dump', "mds_cache"])['mds_cache'] + pq_stats = self.fs.mds_asok(['perf', 'dump', "purge_queue"])['purge_queue'] + self.assertEqual(mdc_stats["num_strays"], 0) + self.assertEqual(mdc_stats["num_strays_delayed"], 0) + self.assertEqual(pq_stats["pq_executing"], 0) + self.assertEqual(pq_stats["pq_executing_ops"], 0) + + def test_mv_cleanup(self): + """ + That when doing a rename from A to B, and B has no hardlinks, + then we make a stray for B and purge him. 
+ """ + # Create file_a and file_b, write some to both + size_mb = 8 + self.mount_a.write_n_mb("file_a", size_mb) + file_a_ino = self.mount_a.path_to_ino("file_a") + self.mount_a.write_n_mb("file_b", size_mb) + file_b_ino = self.mount_a.path_to_ino("file_b") + + self.fs.mds_asok(['flush', 'journal']) + self.assert_backtrace(file_a_ino, "file_a") + self.assert_backtrace(file_b_ino, "file_b") + + # mv file_a file_b + self.mount_a.run_shell(['mv', 'file_a', 'file_b']) + + # See that stray counter increments + self.assertEqual(self.get_mdc_stat("strays_created"), 1) + # Wait for purge counter to increment + self._wait_for_counter("mds_cache", "strays_enqueued", 1) + self._wait_for_counter("purge_queue", "pq_executed", 1) + + self.assert_purge_idle() + + # file_b should have been purged + self.assertTrue(self.fs.data_objects_absent(file_b_ino, size_mb * 1024 * 1024)) + + # Backtrace should have updated from file_a to file_b + self.fs.mds_asok(['flush', 'journal']) + self.assert_backtrace(file_a_ino, "file_b") + + # file_a's data should still exist + self.assertTrue(self.fs.data_objects_present(file_a_ino, size_mb * 1024 * 1024)) + + def _pool_df(self, pool_name): + """ + Return a dict like + { + "kb_used": 0, + "bytes_used": 0, + "max_avail": 19630292406, + "objects": 0 + } + + :param pool_name: Which pool (must exist) + """ + out = self.fs.mon_manager.raw_cluster_cmd("df", "--format=json-pretty") + for p in json.loads(out)['pools']: + if p['name'] == pool_name: + return p['stats'] + + raise RuntimeError("Pool '{0}' not found".format(pool_name)) + + def await_data_pool_empty(self): + self.wait_until_true( + lambda: self._pool_df( + self.fs.get_data_pool_name() + )['objects'] == 0, + timeout=60) + + def test_snapshot_remove(self): + """ + That removal of a snapshot that references a now-unlinked file results + in purging on the stray for the file. + """ + # Enable snapshots + self.fs.set_allow_new_snaps(True) + + # Create a dir with a file in it + size_mb = 8 + self.mount_a.run_shell(["mkdir", "snapdir"]) + self.mount_a.run_shell(["mkdir", "snapdir/subdir"]) + self.mount_a.write_test_pattern("snapdir/subdir/file_a", size_mb * 1024 * 1024) + file_a_ino = self.mount_a.path_to_ino("snapdir/subdir/file_a") + + # Snapshot the dir + self.mount_a.run_shell(["mkdir", "snapdir/.snap/snap1"]) + + # Cause the head revision to deviate from the snapshot + self.mount_a.write_n_mb("snapdir/subdir/file_a", size_mb) + + # Flush the journal so that backtraces, dirfrag objects will actually be written + self.fs.mds_asok(["flush", "journal"]) + + # Unlink the file + self.mount_a.run_shell(["rm", "-f", "snapdir/subdir/file_a"]) + self.mount_a.run_shell(["rmdir", "snapdir/subdir"]) + + # Unmount the client because when I come back to check the data is still + # in the file I don't want to just see what's in the page cache. 
+ self.mount_a.umount_wait() + + self.assertEqual(self.get_mdc_stat("strays_created"), 2) + + # FIXME: at this stage we see a purge and the stray count drops to + # zero, but there's actually still a stray, so at the very + # least the StrayManager stats code is slightly off + + self.mount_a.mount_wait() + + # See that the data from the snapshotted revision of the file is still present + # and correct + self.mount_a.validate_test_pattern("snapdir/.snap/snap1/subdir/file_a", size_mb * 1024 * 1024) + + # Remove the snapshot + self.mount_a.run_shell(["rmdir", "snapdir/.snap/snap1"]) + + # Purging file_a doesn't happen until after we've flushed the journal, because + # it is referenced by the snapshotted subdir, and the snapshot isn't really + # gone until the journal references to it are gone + self.fs.mds_asok(["flush", "journal"]) + + # Wait for purging to complete, which requires the OSDMap to propagate to the OSDs. + # See also: http://tracker.ceph.com/issues/20072 + self.wait_until_true( + lambda: self.fs.data_objects_absent(file_a_ino, size_mb * 1024 * 1024), + timeout=60 + ) + + # See that a purge happens now + self._wait_for_counter("mds_cache", "strays_enqueued", 2) + self._wait_for_counter("purge_queue", "pq_executed", 2) + + self.await_data_pool_empty() + + def test_fancy_layout(self): + """ + purge stray file with fancy layout + """ + + file_name = "fancy_layout_file" + self.mount_a.run_shell(["touch", file_name]) + + file_layout = "stripe_unit=1048576 stripe_count=4 object_size=8388608" + self.mount_a.setfattr(file_name, "ceph.file.layout", file_layout) + + # 35MB requires 7 objects + size_mb = 35 + self.mount_a.write_n_mb(file_name, size_mb) + + self.mount_a.run_shell(["rm", "-f", file_name]) + self.fs.mds_asok(["flush", "journal"]) + + # can't use self.fs.data_objects_absent here, it does not support fancy layout + self.await_data_pool_empty() + + def test_dirfrag_limit(self): + """ + That the directory fragment size cannot exceed mds_bal_fragment_size_max (using a limit of 50 in all configurations). + """ + + LOW_LIMIT = 50 + self.config_set('mds', 'mds_bal_fragment_size_max', str(LOW_LIMIT)) + time.sleep(10) # for config to reach MDS; async create is fast!! + + try: + self.mount_a.create_n_files("subdir/file", LOW_LIMIT+1, finaldirsync=True) + except CommandFailedError: + pass # ENOSPC + else: + self.fail("fragment size exceeded") + + + def test_dirfrag_limit_fragmented(self): + """ + That fragmentation (forced) will allow more entries to be created. + """ + + LOW_LIMIT = 50 + self.config_set('mds', 'mds_bal_fragment_size_max', str(LOW_LIMIT)) + self.config_set('mds', 'mds_bal_merge_size', 1) # disable merging + time.sleep(10) # for config to reach MDS; async create is fast!! + + # Test that we can go beyond the limit if we fragment the directory + self.mount_a.create_n_files("subdir/file", LOW_LIMIT, finaldirsync=True) + self.mount_a.umount_wait() # release client caps + + # Ensure that subdir is fragmented + self.fs.rank_asok(["dirfrag", "split", "/subdir", "0/0", "1"]) + self.fs.rank_asok(["flush", "journal"]) + + # Create 50% more files than the current fragment limit + self.mount_a.mount_wait() + self.mount_a.create_n_files("subdir/file", (LOW_LIMIT*3)//2, finaldirsync=True) + + def test_dirfrag_limit_strays(self): + """ + That unlinking fails when the stray directory fragment becomes too + large and that unlinking may continue once those strays are purged. + """ + + LOW_LIMIT = 10 + # N.B. 
this test is inherently racy because stray removal may be faster + # than slow(er) file creation. + self.config_set('mds', 'mds_bal_fragment_size_max', LOW_LIMIT) + time.sleep(10) # for config to reach MDS; async create is fast!! + + # Now test the stray directory size is limited and recovers + strays_before = self.get_mdc_stat("strays_created") + try: + # 10 stray directories: expect collisions + self.mount_a.create_n_files("subdir/file", LOW_LIMIT*10, finaldirsync=True, unlink=True) + except CommandFailedError: + pass # ENOSPC + else: + self.fail("fragment size exceeded") + strays_after = self.get_mdc_stat("strays_created") + self.assertGreaterEqual(strays_after-strays_before, LOW_LIMIT) + + self._wait_for_counter("mds_cache", "strays_enqueued", strays_after) + self._wait_for_counter("purge_queue", "pq_executed", strays_after) + + # verify new files can be created and unlinked + self.mount_a.create_n_files("subdir/file", LOW_LIMIT, dirsync=True, unlink=True) + + def test_purge_queue_upgrade(self): + """ + That when starting on a system with no purge queue in the metadata + pool, we silently create one. + :return: + """ + + self.mds_cluster.mds_stop() + self.mds_cluster.mds_fail() + self.fs.radosm(["rm", "500.00000000"]) + self.mds_cluster.mds_restart() + self.fs.wait_for_daemons() + + def test_replicated_delete_speed(self): + """ + That deletions of replicated metadata are not pathologically slow + """ + rank_0_id, rank_1_id = self._setup_two_ranks() + + self.set_conf("mds.{0}".format(rank_1_id), 'mds_max_purge_files', "0") + self.mds_cluster.mds_fail_restart(rank_1_id) + self.fs.wait_for_daemons() + + file_count = 10 + + self.mount_a.create_n_files("delete_me/file", file_count) + + self._force_migrate("delete_me") + + begin = datetime.datetime.now() + self.mount_a.run_shell(["rm", "-rf", Raw("delete_me/*")]) + end = datetime.datetime.now() + + # What we're really checking here is that we are completing client + # operations immediately rather than delaying until the next tick. 
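+ # The 0.25 factor in the assertion below leaves ample headroom: if every
+ # unlink reply were deferred to the next tick, deleting file_count files
+ # would take on the order of file_count * tick_period seconds.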
+ tick_period = float(self.fs.get_config("mds_tick_interval", + service_type="mds")) + + duration = (end - begin).total_seconds() + self.assertLess(duration, (file_count * tick_period) * 0.25) diff --git a/qa/tasks/cephfs/test_subvolume.py b/qa/tasks/cephfs/test_subvolume.py new file mode 100644 index 000000000..1ebb137dd --- /dev/null +++ b/qa/tasks/cephfs/test_subvolume.py @@ -0,0 +1,170 @@ +import logging + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + + +class TestSubvolume(CephFSTestCase): + CLIENTS_REQUIRED = 1 + MDSS_REQUIRED = 1 + + def setUp(self): + super().setUp() + self.setup_test() + + def tearDown(self): + # clean up + self.cleanup_test() + super().tearDown() + + def setup_test(self): + self.mount_a.run_shell(['mkdir', 'group']) + self.mount_a.run_shell(['mkdir', 'group/subvol1']) + self.mount_a.run_shell(['setfattr', '-n', 'ceph.dir.subvolume', + '-v', '1', 'group/subvol1']) + self.mount_a.run_shell(['mv', 'group/subvol1', 'group/subvol2']) + + def cleanup_test(self): + self.mount_a.run_shell(['rm', '-rf', 'group']) + + def test_subvolume_move_out_file(self): + """ + To verify that file can't be moved out of subvolume + """ + self.mount_a.run_shell(['touch', 'group/subvol2/file1']) + + # file can't be moved out of a subvolume + with self.assertRaises(CommandFailedError): + self.mount_a.run_shell(['rename', 'group/subvol2/file1', + 'group/file1', 'group/subvol2/file1']) + + + def test_subvolume_move_in_file(self): + """ + To verify that file can't be moved into subvolume + """ + # file can't be moved into a subvolume + self.mount_a.run_shell(['touch', 'group/file2']) + with self.assertRaises(CommandFailedError): + self.mount_a.run_shell(['rename', 'group/file2', + 'group/subvol2/file2', 'group/file2']) + + def test_subvolume_hardlink_to_outside(self): + """ + To verify that file can't be hardlinked to outside subvolume + """ + self.mount_a.run_shell(['touch', 'group/subvol2/file1']) + + # create hard link within subvolume + self.mount_a.run_shell(['ln', + 'group/subvol2/file1', 'group/subvol2/file1_']) + + # hard link can't be created out of subvolume + with self.assertRaises(CommandFailedError): + self.mount_a.run_shell(['ln', + 'group/subvol2/file1', 'group/file1_']) + + def test_subvolume_hardlink_to_inside(self): + """ + To verify that file can't be hardlinked to inside subvolume + """ + self.mount_a.run_shell(['touch', 'group/subvol2/file1']) + + # create hard link within subvolume + self.mount_a.run_shell(['ln', + 'group/subvol2/file1', 'group/subvol2/file1_']) + + # hard link can't be created inside subvolume + self.mount_a.run_shell(['touch', 'group/file2']) + with self.assertRaises(CommandFailedError): + self.mount_a.run_shell(['ln', + 'group/file2', 'group/subvol2/file2_']) + + def test_subvolume_snapshot_inside_subvolume_subdir(self): + """ + To verify that snapshot can't be taken for a subvolume subdir + """ + self.mount_a.run_shell(['touch', 'group/subvol2/file1']) + + # create snapshot at subvolume root + self.mount_a.run_shell(['mkdir', 'group/subvol2/.snap/s1']) + + # can't create snapshot in a descendent dir of subvolume + self.mount_a.run_shell(['mkdir', 'group/subvol2/dir']) + with self.assertRaises(CommandFailedError): + self.mount_a.run_shell(['mkdir', 'group/subvol2/dir/.snap/s2']) + + # clean up + self.mount_a.run_shell(['rmdir', 'group/subvol2/.snap/s1']) + + def test_subvolume_file_move_across_subvolumes(self): + """ + To verify that file can't be 
moved across subvolumes + """ + self.mount_a.run_shell(['touch', 'group/subvol2/file1']) + + # create another subvol + self.mount_a.run_shell(['mkdir', 'group/subvol3']) + self.mount_a.run_shell(['setfattr', '-n', 'ceph.dir.subvolume', + '-v', '1', 'group/subvol3']) + + # can't move file across subvolumes + with self.assertRaises(CommandFailedError): + self.mount_a.run_shell(['rename', 'group/subvol2/file1', + 'group/subvol3/file1', + 'group/subvol2/file1']) + + def test_subvolume_hardlink_across_subvolumes(self): + """ + To verify that hardlink can't be created across subvolumes + """ + self.mount_a.run_shell(['touch', 'group/subvol2/file1']) + + # create another subvol + self.mount_a.run_shell(['mkdir', 'group/subvol3']) + self.mount_a.run_shell(['setfattr', '-n', 'ceph.dir.subvolume', + '-v', '1', 'group/subvol3']) + + # can't create hard link across subvolumes + with self.assertRaises(CommandFailedError): + self.mount_a.run_shell(['ln', 'group/subvol2/file1', + 'group/subvol3/file1']) + + def test_subvolume_create_subvolume_inside_subvolume(self): + """ + To verify that subvolume can't be created inside a subvolume + """ + # can't create subvolume inside a subvolume + self.mount_a.run_shell(['mkdir', 'group/subvol2/dir']) + with self.assertRaises(CommandFailedError): + self.mount_a.run_shell(['setfattr', '-n', 'ceph.dir.subvolume', + '-v', '1', 'group/subvol2/dir']) + + def test_subvolume_create_snapshot_inside_new_subvolume_parent(self): + """ + To verify that subvolume can't be created inside a new subvolume parent + """ + self.mount_a.run_shell(['touch', 'group/subvol2/file1']) + + # clear subvolume flag + self.mount_a.run_shell(['setfattr', '-n', 'ceph.dir.subvolume', + '-v', '0', 'group/subvol2']) + + # create a snap + self.mount_a.run_shell(['mkdir', 'group/subvol2/dir']) + self.mount_a.run_shell(['mkdir', 'group/subvol2/dir/.snap/s2']) + + # override subdir subvolume with parent subvolume + self.mount_a.run_shell(['setfattr', '-n', 'ceph.dir.subvolume', + '-v', '1', 'group/subvol2/dir']) + self.mount_a.run_shell(['setfattr', '-n', 'ceph.dir.subvolume', + '-v', '1', 'group/subvol2']) + + # can't create a snap in a subdir of a subvol parent + with self.assertRaises(CommandFailedError): + self.mount_a.run_shell(['mkdir', 'group/subvol2/dir/.snap/s3']) + + # clean up + self.mount_a.run_shell(['rmdir', 'group/subvol2/dir/.snap/s2']) diff --git a/qa/tasks/cephfs/test_volumes.py b/qa/tasks/cephfs/test_volumes.py new file mode 100644 index 000000000..2ecfeb327 --- /dev/null +++ b/qa/tasks/cephfs/test_volumes.py @@ -0,0 +1,7946 @@ +import os +import json +import time +import errno +import random +import logging +import collections +import uuid +import unittest +from hashlib import md5 +from textwrap import dedent +from io import StringIO + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from tasks.cephfs.fuse_mount import FuseMount +from teuthology.exceptions import CommandFailedError + +log = logging.getLogger(__name__) + +class TestVolumesHelper(CephFSTestCase): + """Helper class for testing FS volume, subvolume group and subvolume operations.""" + TEST_VOLUME_PREFIX = "volume" + TEST_SUBVOLUME_PREFIX="subvolume" + TEST_GROUP_PREFIX="group" + TEST_SNAPSHOT_PREFIX="snapshot" + TEST_CLONE_PREFIX="clone" + TEST_FILE_NAME_PREFIX="subvolume_file" + + # for filling subvolume with data + CLIENTS_REQUIRED = 2 + MDSS_REQUIRED = 2 + + # io defaults + DEFAULT_FILE_SIZE = 1 # MB + DEFAULT_NUMBER_OF_FILES = 1024 + + def _fs_cmd(self, *args): + return 
self.mgr_cluster.mon_manager.raw_cluster_cmd("fs", *args) + + def _raw_cmd(self, *args): + return self.mgr_cluster.mon_manager.raw_cluster_cmd(*args) + + def __check_clone_state(self, state, clone, clone_group=None, timo=120): + check = 0 + args = ["clone", "status", self.volname, clone] + if clone_group: + args.append(clone_group) + args = tuple(args) + while check < timo: + result = json.loads(self._fs_cmd(*args)) + if result["status"]["state"] == state: + break + check += 1 + time.sleep(1) + self.assertTrue(check < timo) + + def _get_clone_status(self, clone, clone_group=None): + args = ["clone", "status", self.volname, clone] + if clone_group: + args.append(clone_group) + args = tuple(args) + result = json.loads(self._fs_cmd(*args)) + return result + + def _wait_for_clone_to_complete(self, clone, clone_group=None, timo=120): + self.__check_clone_state("complete", clone, clone_group, timo) + + def _wait_for_clone_to_fail(self, clone, clone_group=None, timo=120): + self.__check_clone_state("failed", clone, clone_group, timo) + + def _wait_for_clone_to_be_in_progress(self, clone, clone_group=None, timo=120): + self.__check_clone_state("in-progress", clone, clone_group, timo) + + def _check_clone_canceled(self, clone, clone_group=None): + self.__check_clone_state("canceled", clone, clone_group, timo=1) + + def _get_subvolume_snapshot_path(self, subvolume, snapshot, source_group, subvol_path, source_version): + if source_version == 2: + # v2 + if subvol_path is not None: + (base_path, uuid_str) = os.path.split(subvol_path) + else: + (base_path, uuid_str) = os.path.split(self._get_subvolume_path(self.volname, subvolume, group_name=source_group)) + return os.path.join(base_path, ".snap", snapshot, uuid_str) + + # v1 + base_path = self._get_subvolume_path(self.volname, subvolume, group_name=source_group) + return os.path.join(base_path, ".snap", snapshot) + + def _verify_clone_attrs(self, source_path, clone_path): + path1 = source_path + path2 = clone_path + + p = self.mount_a.run_shell(["find", path1]) + paths = p.stdout.getvalue().strip().split() + + # for each entry in source and clone (sink) verify certain inode attributes: + # inode type, mode, ownership, [am]time. + for source_path in paths: + sink_entry = source_path[len(path1)+1:] + sink_path = os.path.join(path2, sink_entry) + + # mode+type + sval = int(self.mount_a.run_shell(['stat', '-c' '%f', source_path]).stdout.getvalue().strip(), 16) + cval = int(self.mount_a.run_shell(['stat', '-c' '%f', sink_path]).stdout.getvalue().strip(), 16) + self.assertEqual(sval, cval) + + # ownership + sval = int(self.mount_a.run_shell(['stat', '-c' '%u', source_path]).stdout.getvalue().strip()) + cval = int(self.mount_a.run_shell(['stat', '-c' '%u', sink_path]).stdout.getvalue().strip()) + self.assertEqual(sval, cval) + + sval = int(self.mount_a.run_shell(['stat', '-c' '%g', source_path]).stdout.getvalue().strip()) + cval = int(self.mount_a.run_shell(['stat', '-c' '%g', sink_path]).stdout.getvalue().strip()) + self.assertEqual(sval, cval) + + # inode timestamps + # do not check access as kclient will generally not update this like ceph-fuse will. 
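+ # '%Y' is the last-modification time (mtime) in seconds since the epoch,
+ # so only mtime is compared for each entry.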
+ sval = int(self.mount_a.run_shell(['stat', '-c' '%Y', source_path]).stdout.getvalue().strip()) + cval = int(self.mount_a.run_shell(['stat', '-c' '%Y', sink_path]).stdout.getvalue().strip()) + self.assertEqual(sval, cval) + + def _verify_clone_root(self, source_path, clone_path, clone, clone_group, clone_pool): + # verifies following clone root attrs quota, data_pool and pool_namespace + # remaining attributes of clone root are validated in _verify_clone_attrs + + clone_info = json.loads(self._get_subvolume_info(self.volname, clone, clone_group)) + + # verify quota is inherited from source snapshot + src_quota = self.mount_a.getfattr(source_path, "ceph.quota.max_bytes") + # FIXME: kclient fails to get this quota value: https://tracker.ceph.com/issues/48075 + if isinstance(self.mount_a, FuseMount): + self.assertEqual(clone_info["bytes_quota"], "infinite" if src_quota is None else int(src_quota)) + + if clone_pool: + # verify pool is set as per request + self.assertEqual(clone_info["data_pool"], clone_pool) + else: + # verify pool and pool namespace are inherited from snapshot + self.assertEqual(clone_info["data_pool"], + self.mount_a.getfattr(source_path, "ceph.dir.layout.pool")) + self.assertEqual(clone_info["pool_namespace"], + self.mount_a.getfattr(source_path, "ceph.dir.layout.pool_namespace")) + + def _verify_clone(self, subvolume, snapshot, clone, + source_group=None, clone_group=None, clone_pool=None, + subvol_path=None, source_version=2, timo=120): + # pass in subvol_path (subvolume path when snapshot was taken) when subvolume is removed + # but snapshots are retained for clone verification + path1 = self._get_subvolume_snapshot_path(subvolume, snapshot, source_group, subvol_path, source_version) + path2 = self._get_subvolume_path(self.volname, clone, group_name=clone_group) + + check = 0 + # TODO: currently snapshot rentries are not stable if snapshot source entries + # are removed, https://tracker.ceph.com/issues/46747 + while check < timo and subvol_path is None: + val1 = int(self.mount_a.getfattr(path1, "ceph.dir.rentries")) + val2 = int(self.mount_a.getfattr(path2, "ceph.dir.rentries")) + if val1 == val2: + break + check += 1 + time.sleep(1) + self.assertTrue(check < timo) + + self._verify_clone_root(path1, path2, clone, clone_group, clone_pool) + self._verify_clone_attrs(path1, path2) + + def _generate_random_volume_name(self, count=1): + n = self.volume_start + volumes = [f"{TestVolumes.TEST_VOLUME_PREFIX}_{i:016}" for i in range(n, n+count)] + self.volume_start += count + return volumes[0] if count == 1 else volumes + + def _generate_random_subvolume_name(self, count=1): + n = self.subvolume_start + subvolumes = [f"{TestVolumes.TEST_SUBVOLUME_PREFIX}_{i:016}" for i in range(n, n+count)] + self.subvolume_start += count + return subvolumes[0] if count == 1 else subvolumes + + def _generate_random_group_name(self, count=1): + n = self.group_start + groups = [f"{TestVolumes.TEST_GROUP_PREFIX}_{i:016}" for i in range(n, n+count)] + self.group_start += count + return groups[0] if count == 1 else groups + + def _generate_random_snapshot_name(self, count=1): + n = self.snapshot_start + snaps = [f"{TestVolumes.TEST_SNAPSHOT_PREFIX}_{i:016}" for i in range(n, n+count)] + self.snapshot_start += count + return snaps[0] if count == 1 else snaps + + def _generate_random_clone_name(self, count=1): + n = self.clone_start + clones = [f"{TestVolumes.TEST_CLONE_PREFIX}_{i:016}" for i in range(n, n+count)] + self.clone_start += count + return clones[0] if count == 1 else clones + + def 
_enable_multi_fs(self): + self._fs_cmd("flag", "set", "enable_multiple", "true", "--yes-i-really-mean-it") + + def _create_or_reuse_test_volume(self): + result = json.loads(self._fs_cmd("volume", "ls")) + if len(result) == 0: + self.vol_created = True + self.volname = self._generate_random_volume_name() + self._fs_cmd("volume", "create", self.volname) + else: + self.volname = result[0]['name'] + + def _get_volume_info(self, vol_name, human_readable=False): + if human_readable: + args = ["volume", "info", vol_name, human_readable] + else: + args = ["volume", "info", vol_name] + args = tuple(args) + vol_md = self._fs_cmd(*args) + return vol_md + + def _get_subvolume_group_path(self, vol_name, group_name): + args = ("subvolumegroup", "getpath", vol_name, group_name) + path = self._fs_cmd(*args) + # remove the leading '/', and trailing whitespaces + return path[1:].rstrip() + + def _get_subvolume_group_info(self, vol_name, group_name): + args = ["subvolumegroup", "info", vol_name, group_name] + args = tuple(args) + group_md = self._fs_cmd(*args) + return group_md + + def _get_subvolume_path(self, vol_name, subvol_name, group_name=None): + args = ["subvolume", "getpath", vol_name, subvol_name] + if group_name: + args.append(group_name) + args = tuple(args) + path = self._fs_cmd(*args) + # remove the leading '/', and trailing whitespaces + return path[1:].rstrip() + + def _get_subvolume_info(self, vol_name, subvol_name, group_name=None): + args = ["subvolume", "info", vol_name, subvol_name] + if group_name: + args.append(group_name) + args = tuple(args) + subvol_md = self._fs_cmd(*args) + return subvol_md + + def _get_subvolume_snapshot_info(self, vol_name, subvol_name, snapname, group_name=None): + args = ["subvolume", "snapshot", "info", vol_name, subvol_name, snapname] + if group_name: + args.append(group_name) + args = tuple(args) + snap_md = self._fs_cmd(*args) + return snap_md + + def _delete_test_volume(self): + self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it") + + def _do_subvolume_pool_and_namespace_update(self, subvolume, pool=None, pool_namespace=None, subvolume_group=None): + subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group) + + if pool is not None: + self.mount_a.setfattr(subvolpath, 'ceph.dir.layout.pool', pool, sudo=True) + + if pool_namespace is not None: + self.mount_a.setfattr(subvolpath, 'ceph.dir.layout.pool_namespace', pool_namespace, sudo=True) + + def _do_subvolume_attr_update(self, subvolume, uid, gid, mode, subvolume_group=None): + subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group) + + # mode + self.mount_a.run_shell(['sudo', 'chmod', mode, subvolpath], omit_sudo=False) + + # ownership + self.mount_a.run_shell(['sudo', 'chown', uid, subvolpath], omit_sudo=False) + self.mount_a.run_shell(['sudo', 'chgrp', gid, subvolpath], omit_sudo=False) + + def _do_subvolume_io(self, subvolume, subvolume_group=None, create_dir=None, + number_of_files=DEFAULT_NUMBER_OF_FILES, file_size=DEFAULT_FILE_SIZE): + # get subvolume path for IO + args = ["subvolume", "getpath", self.volname, subvolume] + if subvolume_group: + args.append(subvolume_group) + args = tuple(args) + subvolpath = self._fs_cmd(*args) + self.assertNotEqual(subvolpath, None) + subvolpath = subvolpath[1:].rstrip() # remove "/" prefix and any trailing newline + + io_path = subvolpath + if create_dir: + io_path = os.path.join(subvolpath, create_dir) + self.mount_a.run_shell_payload(f"mkdir -p {io_path}") + + log.debug("filling 
subvolume {0} with {1} files each {2}MB size under directory {3}".format(subvolume, number_of_files, file_size, io_path)) + for i in range(number_of_files): + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i) + self.mount_a.write_n_mb(os.path.join(io_path, filename), file_size) + + def _do_subvolume_io_mixed(self, subvolume, subvolume_group=None): + subvolpath = self._get_subvolume_path(self.volname, subvolume, group_name=subvolume_group) + + reg_file = "regfile.0" + dir_path = os.path.join(subvolpath, "dir.0") + sym_path1 = os.path.join(subvolpath, "sym.0") + # this symlink's ownership would be changed + sym_path2 = os.path.join(dir_path, "sym.0") + + self.mount_a.run_shell(["mkdir", dir_path]) + self.mount_a.run_shell(["ln", "-s", "./{}".format(reg_file), sym_path1]) + self.mount_a.run_shell(["ln", "-s", "./{}".format(reg_file), sym_path2]) + # flip ownership to nobody. assumption: nobody's id is 65534 + self.mount_a.run_shell(["sudo", "chown", "-h", "65534:65534", sym_path2], omit_sudo=False) + + def _wait_for_trash_empty(self, timeout=60): + # XXX: construct the trash dir path (note that there is no mgr + # [sub]volume interface for this). + trashdir = os.path.join("./", "volumes", "_deleting") + self.mount_a.wait_for_dir_empty(trashdir, timeout=timeout) + + def _wait_for_subvol_trash_empty(self, subvol, group="_nogroup", timeout=30): + trashdir = os.path.join("./", "volumes", group, subvol, ".trash") + try: + self.mount_a.wait_for_dir_empty(trashdir, timeout=timeout) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + pass + else: + raise + + def _assert_meta_location_and_version(self, vol_name, subvol_name, subvol_group=None, version=2, legacy=False): + if legacy: + subvol_path = self._get_subvolume_path(vol_name, subvol_name, group_name=subvol_group) + m = md5() + m.update(("/"+subvol_path).encode('utf-8')) + meta_filename = "{0}.meta".format(m.digest().hex()) + metapath = os.path.join(".", "volumes", "_legacy", meta_filename) + else: + group = subvol_group if subvol_group is not None else '_nogroup' + metapath = os.path.join(".", "volumes", group, subvol_name, ".meta") + + out = self.mount_a.run_shell(['sudo', 'cat', metapath], omit_sudo=False) + lines = out.stdout.getvalue().strip().split('\n') + sv_version = -1 + for line in lines: + if line == "version = " + str(version): + sv_version = version + break + self.assertEqual(sv_version, version, "version expected was '{0}' but got '{1}' from meta file at '{2}'".format( + version, sv_version, metapath)) + + def _create_v1_subvolume(self, subvol_name, subvol_group=None, has_snapshot=True, subvol_type='subvolume', state='complete'): + group = subvol_group if subvol_group is not None else '_nogroup' + basepath = os.path.join("volumes", group, subvol_name) + uuid_str = str(uuid.uuid4()) + createpath = os.path.join(basepath, uuid_str) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath], omit_sudo=False) + + # create a v1 snapshot, to prevent auto upgrades + if has_snapshot: + snappath = os.path.join(createpath, ".snap", "fake") + self.mount_a.run_shell(['sudo', 'mkdir', '-p', snappath], omit_sudo=False) + + # add required xattrs to subvolume + default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool") + self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True) + + # create a v1 .meta file + meta_contents = "[GLOBAL]\nversion = 1\ntype = {0}\npath = {1}\nstate = {2}\n".format(subvol_type, "/" + createpath, state) + if state == 'pending': + # add a fake clone 
source + meta_contents = meta_contents + '[source]\nvolume = fake\nsubvolume = fake\nsnapshot = fake\n' + meta_filepath1 = os.path.join(self.mount_a.mountpoint, basepath, ".meta") + self.mount_a.client_remote.write_file(meta_filepath1, meta_contents, sudo=True) + return createpath + + def _update_fake_trash(self, subvol_name, subvol_group=None, trash_name='fake', create=True): + group = subvol_group if subvol_group is not None else '_nogroup' + trashpath = os.path.join("volumes", group, subvol_name, '.trash', trash_name) + if create: + self.mount_a.run_shell(['sudo', 'mkdir', '-p', trashpath], omit_sudo=False) + else: + self.mount_a.run_shell(['sudo', 'rmdir', trashpath], omit_sudo=False) + + def _configure_guest_auth(self, guest_mount, authid, key): + """ + Set up auth credentials for a guest client. + """ + # Create keyring file for the guest client. + keyring_txt = dedent(""" + [client.{authid}] + key = {key} + + """.format(authid=authid,key=key)) + + guest_mount.client_id = authid + guest_mount.client_remote.write_file(guest_mount.get_keyring_path(), + keyring_txt, sudo=True) + # Add a guest client section to the ceph config file. + self.config_set("client.{0}".format(authid), "debug client", 20) + self.config_set("client.{0}".format(authid), "debug objecter", 20) + self.set_conf("client.{0}".format(authid), + "keyring", guest_mount.get_keyring_path()) + + def _auth_metadata_get(self, filedata): + """ + Return a deserialized JSON object, or None + """ + try: + data = json.loads(filedata) + except json.decoder.JSONDecodeError: + data = None + return data + + def setUp(self): + super(TestVolumesHelper, self).setUp() + self.volname = None + self.vol_created = False + self._enable_multi_fs() + self._create_or_reuse_test_volume() + self.config_set('mon', 'mon_allow_pool_delete', True) + self.volume_start = random.randint(1, (1<<20)) + self.subvolume_start = random.randint(1, (1<<20)) + self.group_start = random.randint(1, (1<<20)) + self.snapshot_start = random.randint(1, (1<<20)) + self.clone_start = random.randint(1, (1<<20)) + + def tearDown(self): + if self.vol_created: + self._delete_test_volume() + super(TestVolumesHelper, self).tearDown() + + +class TestVolumes(TestVolumesHelper): + """Tests for FS volume operations.""" + def test_volume_create(self): + """ + That the volume can be created and then cleans up + """ + volname = self._generate_random_volume_name() + self._fs_cmd("volume", "create", volname) + volumels = json.loads(self._fs_cmd("volume", "ls")) + + if not (volname in ([volume['name'] for volume in volumels])): + raise RuntimeError("Error creating volume '{0}'".format(volname)) + + # check that the pools were created with the correct config + pool_details = json.loads(self._raw_cmd("osd", "pool", "ls", "detail", "--format=json")) + pool_flags = {} + for pool in pool_details: + pool_flags[pool["pool_id"]] = pool["flags_names"].split(",") + + volume_details = json.loads(self._fs_cmd("get", volname, "--format=json")) + for data_pool_id in volume_details['mdsmap']['data_pools']: + self.assertIn("bulk", pool_flags[data_pool_id]) + meta_pool_id = volume_details['mdsmap']['metadata_pool'] + self.assertNotIn("bulk", pool_flags[meta_pool_id]) + + # clean up + self._fs_cmd("volume", "rm", volname, "--yes-i-really-mean-it") + + def test_volume_ls(self): + """ + That the existing and the newly created volumes can be listed and + finally cleans up. 
+ """ + vls = json.loads(self._fs_cmd("volume", "ls")) + volumes = [volume['name'] for volume in vls] + + #create new volumes and add it to the existing list of volumes + volumenames = self._generate_random_volume_name(2) + for volumename in volumenames: + self._fs_cmd("volume", "create", volumename) + volumes.extend(volumenames) + + # list volumes + try: + volumels = json.loads(self._fs_cmd('volume', 'ls')) + if len(volumels) == 0: + raise RuntimeError("Expected the 'fs volume ls' command to list the created volumes.") + else: + volnames = [volume['name'] for volume in volumels] + if collections.Counter(volnames) != collections.Counter(volumes): + raise RuntimeError("Error creating or listing volumes") + finally: + # clean up + for volume in volumenames: + self._fs_cmd("volume", "rm", volume, "--yes-i-really-mean-it") + + def test_volume_rm(self): + """ + That the volume can only be removed when --yes-i-really-mean-it is used + and verify that the deleted volume is not listed anymore. + """ + for m in self.mounts: + m.umount_wait() + try: + self._fs_cmd("volume", "rm", self.volname) + except CommandFailedError as ce: + if ce.exitstatus != errno.EPERM: + raise RuntimeError("expected the 'fs volume rm' command to fail with EPERM, " + "but it failed with {0}".format(ce.exitstatus)) + else: + self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it") + + #check if it's gone + volumes = json.loads(self._fs_cmd("volume", "ls", "--format=json-pretty")) + if (self.volname in [volume['name'] for volume in volumes]): + raise RuntimeError("Expected the 'fs volume rm' command to succeed. " + "The volume {0} not removed.".format(self.volname)) + else: + raise RuntimeError("expected the 'fs volume rm' command to fail.") + + def test_volume_rm_arbitrary_pool_removal(self): + """ + That the arbitrary pool added to the volume out of band is removed + successfully on volume removal. + """ + for m in self.mounts: + m.umount_wait() + new_pool = "new_pool" + # add arbitrary data pool + self.fs.add_data_pool(new_pool) + vol_status = json.loads(self._fs_cmd("status", self.volname, "--format=json-pretty")) + self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it") + + #check if fs is gone + volumes = json.loads(self._fs_cmd("volume", "ls", "--format=json-pretty")) + volnames = [volume['name'] for volume in volumes] + self.assertNotIn(self.volname, volnames) + + #check if osd pools are gone + pools = json.loads(self._raw_cmd("osd", "pool", "ls", "--format=json-pretty")) + for pool in vol_status["pools"]: + self.assertNotIn(pool["name"], pools) + + def test_volume_rm_when_mon_delete_pool_false(self): + """ + That the volume can only be removed when mon_allowd_pool_delete is set + to true and verify that the pools are removed after volume deletion. 
+ """ + for m in self.mounts: + m.umount_wait() + self.config_set('mon', 'mon_allow_pool_delete', False) + try: + self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM, + "expected the 'fs volume rm' command to fail with EPERM, " + "but it failed with {0}".format(ce.exitstatus)) + vol_status = json.loads(self._fs_cmd("status", self.volname, "--format=json-pretty")) + self.config_set('mon', 'mon_allow_pool_delete', True) + self._fs_cmd("volume", "rm", self.volname, "--yes-i-really-mean-it") + + #check if fs is gone + volumes = json.loads(self._fs_cmd("volume", "ls", "--format=json-pretty")) + volnames = [volume['name'] for volume in volumes] + self.assertNotIn(self.volname, volnames, + "volume {0} exists after removal".format(self.volname)) + #check if pools are gone + pools = json.loads(self._raw_cmd("osd", "pool", "ls", "--format=json-pretty")) + for pool in vol_status["pools"]: + self.assertNotIn(pool["name"], pools, + "pool {0} exists after volume removal".format(pool["name"])) + + def test_volume_rename(self): + """ + That volume, its file system and pools, can be renamed. + """ + for m in self.mounts: + m.umount_wait() + oldvolname = self.volname + newvolname = self._generate_random_volume_name() + new_data_pool, new_metadata_pool = f"cephfs.{newvolname}.data", f"cephfs.{newvolname}.meta" + self._fs_cmd("volume", "rename", oldvolname, newvolname, + "--yes-i-really-mean-it") + volumels = json.loads(self._fs_cmd('volume', 'ls')) + volnames = [volume['name'] for volume in volumels] + # volume name changed + self.assertIn(newvolname, volnames) + self.assertNotIn(oldvolname, volnames) + # pool names changed + self.fs.get_pool_names(refresh=True) + self.assertEqual(new_metadata_pool, self.fs.get_metadata_pool_name()) + self.assertEqual(new_data_pool, self.fs.get_data_pool_name()) + + def test_volume_rename_idempotency(self): + """ + That volume rename is idempotent. + """ + for m in self.mounts: + m.umount_wait() + oldvolname = self.volname + newvolname = self._generate_random_volume_name() + new_data_pool, new_metadata_pool = f"cephfs.{newvolname}.data", f"cephfs.{newvolname}.meta" + self._fs_cmd("volume", "rename", oldvolname, newvolname, + "--yes-i-really-mean-it") + self._fs_cmd("volume", "rename", oldvolname, newvolname, + "--yes-i-really-mean-it") + volumels = json.loads(self._fs_cmd('volume', 'ls')) + volnames = [volume['name'] for volume in volumels] + self.assertIn(newvolname, volnames) + self.assertNotIn(oldvolname, volnames) + self.fs.get_pool_names(refresh=True) + self.assertEqual(new_metadata_pool, self.fs.get_metadata_pool_name()) + self.assertEqual(new_data_pool, self.fs.get_data_pool_name()) + + def test_volume_rename_fails_without_confirmation_flag(self): + """ + That renaming volume fails without --yes-i-really-mean-it flag. + """ + newvolname = self._generate_random_volume_name() + try: + self._fs_cmd("volume", "rename", self.volname, newvolname) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM, + "invalid error code on renaming a FS volume without the " + "'--yes-i-really-mean-it' flag") + else: + self.fail("expected renaming of FS volume to fail without the " + "'--yes-i-really-mean-it' flag") + + def test_volume_rename_for_more_than_one_data_pool(self): + """ + That renaming a volume with more than one data pool does not change + the name of the data pools. 
+ """ + for m in self.mounts: + m.umount_wait() + self.fs.add_data_pool('another-data-pool') + oldvolname = self.volname + newvolname = self._generate_random_volume_name() + self.fs.get_pool_names(refresh=True) + orig_data_pool_names = list(self.fs.data_pools.values()) + new_metadata_pool = f"cephfs.{newvolname}.meta" + self._fs_cmd("volume", "rename", self.volname, newvolname, + "--yes-i-really-mean-it") + volumels = json.loads(self._fs_cmd('volume', 'ls')) + volnames = [volume['name'] for volume in volumels] + # volume name changed + self.assertIn(newvolname, volnames) + self.assertNotIn(oldvolname, volnames) + self.fs.get_pool_names(refresh=True) + # metadata pool name changed + self.assertEqual(new_metadata_pool, self.fs.get_metadata_pool_name()) + # data pool names unchanged + self.assertCountEqual(orig_data_pool_names, list(self.fs.data_pools.values())) + + def test_volume_info(self): + """ + Tests the 'fs volume info' command + """ + vol_fields = ["pools", "used_size", "pending_subvolume_deletions", "mon_addrs"] + group = self._generate_random_group_name() + # create subvolumegroup + self._fs_cmd("subvolumegroup", "create", self.volname, group) + # get volume metadata + vol_info = json.loads(self._get_volume_info(self.volname)) + for md in vol_fields: + self.assertIn(md, vol_info, + f"'{md}' key not present in metadata of volume") + self.assertEqual(vol_info["used_size"], 0, + "Size should be zero when volumes directory is empty") + + def test_volume_info_pending_subvol_deletions(self): + """ + Tests the pending_subvolume_deletions in 'fs volume info' command + """ + subvolname = self._generate_random_subvolume_name() + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--mode=777") + # create 3K zero byte files + self._do_subvolume_io(subvolname, number_of_files=3000, file_size=0) + # Delete the subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + # get volume metadata + vol_info = json.loads(self._get_volume_info(self.volname)) + self.assertNotEqual(vol_info['pending_subvolume_deletions'], 0, + "pending_subvolume_deletions should be 1") + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_volume_info_without_subvolumegroup(self): + """ + Tests the 'fs volume info' command without subvolume group + """ + vol_fields = ["pools", "mon_addrs"] + # get volume metadata + vol_info = json.loads(self._get_volume_info(self.volname)) + for md in vol_fields: + self.assertIn(md, vol_info, + f"'{md}' key not present in metadata of volume") + self.assertNotIn("used_size", vol_info, + "'used_size' should not be present in absence of subvolumegroup") + self.assertNotIn("pending_subvolume_deletions", vol_info, + "'pending_subvolume_deletions' should not be present in absence" + " of subvolumegroup") + + def test_volume_info_with_human_readable_flag(self): + """ + Tests the 'fs volume info --human_readable' command + """ + vol_fields = ["pools", "used_size", "pending_subvolume_deletions", "mon_addrs"] + group = self._generate_random_group_name() + # create subvolumegroup + self._fs_cmd("subvolumegroup", "create", self.volname, group) + # get volume metadata + vol_info = json.loads(self._get_volume_info(self.volname, "--human_readable")) + for md in vol_fields: + self.assertIn(md, vol_info, + f"'{md}' key not present in metadata of volume") + units = [' ', 'k', 'M', 'G', 'T', 'P', 'E'] + assert vol_info["used_size"][-1] in units, "unit suffix in used_size is absent" + assert vol_info["pools"]["data"][0]["avail"][-1] in units, 
"unit suffix in avail data is absent" + assert vol_info["pools"]["data"][0]["used"][-1] in units, "unit suffix in used data is absent" + assert vol_info["pools"]["metadata"][0]["avail"][-1] in units, "unit suffix in avail metadata is absent" + assert vol_info["pools"]["metadata"][0]["used"][-1] in units, "unit suffix in used metadata is absent" + self.assertEqual(int(vol_info["used_size"]), 0, + "Size should be zero when volumes directory is empty") + + def test_volume_info_with_human_readable_flag_without_subvolumegroup(self): + """ + Tests the 'fs volume info --human_readable' command without subvolume group + """ + vol_fields = ["pools", "mon_addrs"] + # get volume metadata + vol_info = json.loads(self._get_volume_info(self.volname, "--human_readable")) + for md in vol_fields: + self.assertIn(md, vol_info, + f"'{md}' key not present in metadata of volume") + units = [' ', 'k', 'M', 'G', 'T', 'P', 'E'] + assert vol_info["pools"]["data"][0]["avail"][-1] in units, "unit suffix in avail data is absent" + assert vol_info["pools"]["data"][0]["used"][-1] in units, "unit suffix in used data is absent" + assert vol_info["pools"]["metadata"][0]["avail"][-1] in units, "unit suffix in avail metadata is absent" + assert vol_info["pools"]["metadata"][0]["used"][-1] in units, "unit suffix in used metadata is absent" + self.assertNotIn("used_size", vol_info, + "'used_size' should not be present in absence of subvolumegroup") + self.assertNotIn("pending_subvolume_deletions", vol_info, + "'pending_subvolume_deletions' should not be present in absence" + " of subvolumegroup") + + +class TestSubvolumeGroups(TestVolumesHelper): + """Tests for FS subvolume group operations.""" + def test_default_uid_gid_subvolume_group(self): + group = self._generate_random_group_name() + expected_uid = 0 + expected_gid = 0 + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + group_path = self._get_subvolume_group_path(self.volname, group) + + # check group's uid and gid + stat = self.mount_a.stat(group_path) + self.assertEqual(stat['st_uid'], expected_uid) + self.assertEqual(stat['st_gid'], expected_gid) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_nonexistent_subvolume_group_create(self): + subvolume = self._generate_random_subvolume_name() + group = "non_existent_group" + + # try, creating subvolume in a nonexistent group + try: + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise + else: + raise RuntimeError("expected the 'fs subvolume create' command to fail") + + def test_nonexistent_subvolume_group_rm(self): + group = "non_existent_group" + + # try, remove subvolume group + try: + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise + else: + raise RuntimeError("expected the 'fs subvolumegroup rm' command to fail") + + def test_subvolume_group_create_with_auto_cleanup_on_fail(self): + group = self._generate_random_group_name() + data_pool = "invalid_pool" + # create group with invalid data pool layout + with self.assertRaises(CommandFailedError): + self._fs_cmd("subvolumegroup", "create", self.volname, group, "--pool_layout", data_pool) + + # check whether group path is cleaned up + try: + self._fs_cmd("subvolumegroup", "getpath", self.volname, group) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise + else: + raise 
RuntimeError("expected the 'fs subvolumegroup getpath' command to fail") + + def test_subvolume_group_create_with_desired_data_pool_layout(self): + group1, group2 = self._generate_random_group_name(2) + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group1) + group1_path = self._get_subvolume_group_path(self.volname, group1) + + default_pool = self.mount_a.getfattr(group1_path, "ceph.dir.layout.pool") + new_pool = "new_pool" + self.assertNotEqual(default_pool, new_pool) + + # add data pool + newid = self.fs.add_data_pool(new_pool) + + # create group specifying the new data pool as its pool layout + self._fs_cmd("subvolumegroup", "create", self.volname, group2, + "--pool_layout", new_pool) + group2_path = self._get_subvolume_group_path(self.volname, group2) + + desired_pool = self.mount_a.getfattr(group2_path, "ceph.dir.layout.pool") + try: + self.assertEqual(desired_pool, new_pool) + except AssertionError: + self.assertEqual(int(desired_pool), newid) # old kernel returns id + + self._fs_cmd("subvolumegroup", "rm", self.volname, group1) + self._fs_cmd("subvolumegroup", "rm", self.volname, group2) + + def test_subvolume_group_create_with_desired_mode(self): + group1, group2 = self._generate_random_group_name(2) + # default mode + expected_mode1 = "755" + # desired mode + expected_mode2 = "777" + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group2, f"--mode={expected_mode2}") + self._fs_cmd("subvolumegroup", "create", self.volname, group1) + + group1_path = self._get_subvolume_group_path(self.volname, group1) + group2_path = self._get_subvolume_group_path(self.volname, group2) + volumes_path = os.path.dirname(group1_path) + + # check group's mode + actual_mode1 = self.mount_a.run_shell(['stat', '-c' '%a', group1_path]).stdout.getvalue().strip() + actual_mode2 = self.mount_a.run_shell(['stat', '-c' '%a', group2_path]).stdout.getvalue().strip() + actual_mode3 = self.mount_a.run_shell(['stat', '-c' '%a', volumes_path]).stdout.getvalue().strip() + self.assertEqual(actual_mode1, expected_mode1) + self.assertEqual(actual_mode2, expected_mode2) + self.assertEqual(actual_mode3, expected_mode1) + + self._fs_cmd("subvolumegroup", "rm", self.volname, group1) + self._fs_cmd("subvolumegroup", "rm", self.volname, group2) + + def test_subvolume_group_create_with_desired_uid_gid(self): + """ + That the subvolume group can be created with the desired uid and gid and its uid and gid matches the + expected values. 
+ """ + uid = 1000 + gid = 1000 + + # create subvolume group + subvolgroupname = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, subvolgroupname, "--uid", str(uid), "--gid", str(gid)) + + # make sure it exists + subvolgrouppath = self._get_subvolume_group_path(self.volname, subvolgroupname) + self.assertNotEqual(subvolgrouppath, None) + + # verify the uid and gid + suid = int(self.mount_a.run_shell(['stat', '-c' '%u', subvolgrouppath]).stdout.getvalue().strip()) + sgid = int(self.mount_a.run_shell(['stat', '-c' '%g', subvolgrouppath]).stdout.getvalue().strip()) + self.assertEqual(uid, suid) + self.assertEqual(gid, sgid) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, subvolgroupname) + + def test_subvolume_group_create_with_invalid_data_pool_layout(self): + group = self._generate_random_group_name() + data_pool = "invalid_pool" + # create group with invalid data pool layout + try: + self._fs_cmd("subvolumegroup", "create", self.volname, group, "--pool_layout", data_pool) + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise + else: + raise RuntimeError("expected the 'fs subvolumegroup create' command to fail") + + def test_subvolume_group_create_with_size(self): + # create group with size -- should set quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000") + + # get group metadata + group_info = json.loads(self._get_subvolume_group_info(self.volname, group)) + self.assertEqual(group_info["bytes_quota"], 1000000000) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_info(self): + # tests the 'fs subvolumegroup info' command + + group_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime", + "data_pool", "gid", "mode", "mon_addrs", "mtime", "uid"] + + # create group + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # get group metadata + group_info = json.loads(self._get_subvolume_group_info(self.volname, group)) + for md in group_md: + self.assertIn(md, group_info, "'{0}' key not present in metadata of group".format(md)) + + self.assertEqual(group_info["bytes_pcent"], "undefined", "bytes_pcent should be set to undefined if quota is not set") + self.assertEqual(group_info["bytes_quota"], "infinite", "bytes_quota should be set to infinite if quota is not set") + self.assertEqual(group_info["uid"], 0) + self.assertEqual(group_info["gid"], 0) + + nsize = self.DEFAULT_FILE_SIZE*1024*1024 + self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize)) + + # get group metadata after quota set + group_info = json.loads(self._get_subvolume_group_info(self.volname, group)) + for md in group_md: + self.assertIn(md, group_info, "'{0}' key not present in metadata of subvolume".format(md)) + + self.assertNotEqual(group_info["bytes_pcent"], "undefined", "bytes_pcent should not be set to undefined if quota is set") + self.assertEqual(group_info["bytes_quota"], nsize, "bytes_quota should be set to '{0}'".format(nsize)) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_create_idempotence(self): + # create group + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # try creating w/ same subvolume group name -- should be idempotent + self._fs_cmd("subvolumegroup", "create", 
self.volname, group) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_create_idempotence_mode(self): + # create group + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # try creating w/ same subvolume group name with mode -- should set mode + self._fs_cmd("subvolumegroup", "create", self.volname, group, "--mode=766") + + group_path = self._get_subvolume_group_path(self.volname, group) + + # check subvolumegroup's mode + mode = self.mount_a.run_shell(['stat', '-c' '%a', group_path]).stdout.getvalue().strip() + self.assertEqual(mode, "766") + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_create_idempotence_uid_gid(self): + desired_uid = 1000 + desired_gid = 1000 + + # create group + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # try creating w/ same subvolume group name with uid/gid -- should set uid/gid + self._fs_cmd("subvolumegroup", "create", self.volname, group, "--uid", str(desired_uid), "--gid", str(desired_gid)) + + group_path = self._get_subvolume_group_path(self.volname, group) + + # verify the uid and gid + actual_uid = int(self.mount_a.run_shell(['stat', '-c' '%u', group_path]).stdout.getvalue().strip()) + actual_gid = int(self.mount_a.run_shell(['stat', '-c' '%g', group_path]).stdout.getvalue().strip()) + self.assertEqual(desired_uid, actual_uid) + self.assertEqual(desired_gid, actual_gid) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_create_idempotence_data_pool(self): + # create group + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + group_path = self._get_subvolume_group_path(self.volname, group) + + default_pool = self.mount_a.getfattr(group_path, "ceph.dir.layout.pool") + new_pool = "new_pool" + self.assertNotEqual(default_pool, new_pool) + + # add data pool + newid = self.fs.add_data_pool(new_pool) + + # try creating w/ same subvolume group name with new data pool -- should set pool + self._fs_cmd("subvolumegroup", "create", self.volname, group, "--pool_layout", new_pool) + desired_pool = self.mount_a.getfattr(group_path, "ceph.dir.layout.pool") + try: + self.assertEqual(desired_pool, new_pool) + except AssertionError: + self.assertEqual(int(desired_pool), newid) # old kernel returns id + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_create_idempotence_resize(self): + # create group + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # try creating w/ same subvolume name with size -- should set quota + self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000") + + # get group metadata + group_info = json.loads(self._get_subvolume_group_info(self.volname, group)) + self.assertEqual(group_info["bytes_quota"], 1000000000) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_quota_mds_path_restriction_to_group_path(self): + """ + Tests subvolumegroup quota enforcement with mds path restriction set to group. + For quota to be enforced, read permission needs to be provided to the parent + of the directory on which quota is set. 
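# Illustrative sketch (not part of the patch): a subvolumegroup "size" is
# enforced as a CephFS quota, i.e. the ceph.quota.max_bytes vxattr on the group
# directory (the resize tests further below read it back with getfattr). The
# commands below show the equivalent manual operations; the mount path is an
# assumed example.
group_path = "/mnt/cephfs/volumes/mygroup"
set_quota = ["setfattr", "-n", "ceph.quota.max_bytes", "-v", "1000000000", group_path]
get_quota = ["getfattr", "-n", "ceph.quota.max_bytes", group_path]
print(" ".join(set_quota))
print(" ".join(get_quota))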
Please see the tracker comment [1] + [1] https://tracker.ceph.com/issues/55090#note-8 + """ + osize = self.DEFAULT_FILE_SIZE*1024*1024*100 + # create group with 100MB quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, + "--size", str(osize), "--mode=777") + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # create subvolume under the group + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, + "--group_name", group, "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group) + self.assertNotEqual(subvolpath, None) + + # Create auth_id + authid = "client.guest1" + user = json.loads(self.fs.mon_manager.raw_cluster_cmd( + "auth", "get-or-create", authid, + "mds", "allow rw path=/volumes", + "mgr", "allow rw", + "osd", "allow rw tag cephfs *=*", + "mon", "allow r", + "--format=json-pretty" + )) + + # Prepare guest_mount with new authid + guest_mount = self.mount_b + guest_mount.umount_wait() + + # configure credentials for guest client + self._configure_guest_auth(guest_mount, "guest1", user[0]["key"]) + + # mount the subvolume + mount_path = os.path.join("/", subvolpath) + guest_mount.mount_wait(cephfs_mntpt=mount_path) + + # create 99 files of 1MB + guest_mount.run_shell_payload("mkdir -p dir1") + for i in range(99): + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i) + guest_mount.write_n_mb(os.path.join("dir1", filename), self.DEFAULT_FILE_SIZE) + try: + # write two files of 1MB file to exceed the quota + guest_mount.run_shell_payload("mkdir -p dir2") + for i in range(2): + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i) + guest_mount.write_n_mb(os.path.join("dir2", filename), self.DEFAULT_FILE_SIZE) + # For quota to be enforced + time.sleep(60) + # create 400 files of 1MB to exceed quota + for i in range(400): + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i) + guest_mount.write_n_mb(os.path.join("dir2", filename), self.DEFAULT_FILE_SIZE) + # Sometimes quota enforcement takes time. 
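# Illustrative sketch (not part of the patch): roughly the CLI equivalent of the
# raw_cluster_cmd call above. The guest is restricted to /volumes, the *parent*
# of the group directory carrying the quota, which is why the client can see
# the quota inode and enforce it in this test. The cap strings are the test's
# own values; assembling them via a helper is just for clarity.
def mds_cap(path):
    return f"allow rw path={path}"

auth_cmd = [
    "ceph", "auth", "get-or-create", "client.guest1",
    "mds", mds_cap("/volumes"),
    "mgr", "allow rw",
    "osd", "allow rw tag cephfs *=*",
    "mon", "allow r",
]
print(" ".join(auth_cmd))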
+ if i == 200: + time.sleep(60) + except CommandFailedError: + pass + else: + self.fail(f"expected filling subvolume {subvolname} with 400 files of size 1MB to fail") + + # clean up + guest_mount.umount_wait() + + # Delete the subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_quota_mds_path_restriction_to_subvolume_path(self): + """ + Tests subvolumegroup quota enforcement with mds path restriction set to subvolume path + The quota should not be enforced because of the fourth limitation mentioned at + https://docs.ceph.com/en/latest/cephfs/quota/#limitations + """ + osize = self.DEFAULT_FILE_SIZE*1024*1024*100 + # create group with 100MB quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, + "--size", str(osize), "--mode=777") + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # create subvolume under the group + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, + "--group_name", group, "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group) + self.assertNotEqual(subvolpath, None) + + mount_path = os.path.join("/", subvolpath) + + # Create auth_id + authid = "client.guest1" + user = json.loads(self.fs.mon_manager.raw_cluster_cmd( + "auth", "get-or-create", authid, + "mds", f"allow rw path={mount_path}", + "mgr", "allow rw", + "osd", "allow rw tag cephfs *=*", + "mon", "allow r", + "--format=json-pretty" + )) + + # Prepare guest_mount with new authid + guest_mount = self.mount_b + guest_mount.umount_wait() + + # configure credentials for guest client + self._configure_guest_auth(guest_mount, "guest1", user[0]["key"]) + + # mount the subvolume + guest_mount.mount_wait(cephfs_mntpt=mount_path) + + # create 99 files of 1MB to exceed quota + guest_mount.run_shell_payload("mkdir -p dir1") + for i in range(99): + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i) + guest_mount.write_n_mb(os.path.join("dir1", filename), self.DEFAULT_FILE_SIZE) + try: + # write two files of 1MB file to exceed the quota + guest_mount.run_shell_payload("mkdir -p dir2") + for i in range(2): + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i) + guest_mount.write_n_mb(os.path.join("dir2", filename), self.DEFAULT_FILE_SIZE) + # For quota to be enforced + time.sleep(60) + # create 400 files of 1MB to exceed quota + for i in range(400): + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, i) + guest_mount.write_n_mb(os.path.join("dir2", filename), self.DEFAULT_FILE_SIZE) + # Sometimes quota enforcement takes time. 
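# Illustrative sketch (not part of the patch): in the next test the guest is
# confined to the subvolume path, which lies *below* the group directory that
# carries the quota, so the client never has access to the quota inode and
# cannot enforce it (the documented quota limitation the docstring links to).
# The paths below model the layout the test relies on.
import os

group_dir  = "/volumes/mygroup"                    # quota (group size) lives here
subvol_dir = os.path.join(group_dir, "mysubvol")   # guest's mds cap / mount root
assert os.path.commonpath([group_dir, subvol_dir]) == group_dir  # quota dir is an ancestor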
+ if i == 200: + time.sleep(60) + except CommandFailedError: + self.fail(f"Quota should not be enforced, expected filling subvolume {subvolname} with 400 files of size 1MB to succeed") + + # clean up + guest_mount.umount_wait() + + # Delete the subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_quota_exceeded_subvolume_removal(self): + """ + Tests subvolume removal if it's group quota is exceeded + """ + osize = self.DEFAULT_FILE_SIZE*1024*1024*100 + # create group with 100MB quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, + "--size", str(osize), "--mode=777") + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # create subvolume under the group + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, + "--group_name", group, "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group) + self.assertNotEqual(subvolpath, None) + + # create 99 files of 1MB to exceed quota + self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=99) + + try: + # write two files of 1MB file to exceed the quota + self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=2) + # For quota to be enforced + time.sleep(20) + # create 400 files of 1MB to exceed quota + self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=400) + except CommandFailedError: + # Delete subvolume when group quota is exceeded + self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group) + else: + self.fail(f"expected filling subvolume {subvolname} with 400 files of size 1MB to fail") + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_quota_exceeded_subvolume_removal_retained_snaps(self): + """ + Tests retained snapshot subvolume removal if it's group quota is exceeded + """ + group = self._generate_random_group_name() + subvolname = self._generate_random_subvolume_name() + snapshot1, snapshot2 = self._generate_random_snapshot_name(2) + + osize = self.DEFAULT_FILE_SIZE*1024*1024*100 + # create group with 100MB quota + self._fs_cmd("subvolumegroup", "create", self.volname, group, + "--size", str(osize), "--mode=777") + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # create subvolume under the group + self._fs_cmd("subvolume", "create", self.volname, subvolname, + "--group_name", group, "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group) + self.assertNotEqual(subvolpath, None) + + # create 99 files of 1MB to exceed quota + self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=99) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot1, "--group_name", group) + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot2, "--group_name", group) + + try: + # write two files of 1MB file to 
exceed the quota + self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=2) + # For quota to be enforced + time.sleep(20) + # create 400 files of 1MB to exceed quota + self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=400) + except CommandFailedError: + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group, "--retain-snapshots") + # remove snapshot1 + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot1, "--group_name", group) + # remove snapshot2 (should remove volume) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot2, "--group_name", group) + # verify subvolume trash is clean + self._wait_for_subvol_trash_empty(subvolname, group=group) + else: + self.fail(f"expected filling subvolume {subvolname} with 400 files of size 1MB to fail") + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_quota_subvolume_removal(self): + """ + Tests subvolume removal if it's group quota is set. + """ + # create group with size -- should set quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000") + + # create subvolume under the group + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # remove subvolume + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume rm' command to succeed if group quota is set") + + # remove subvolumegroup + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_quota_legacy_subvolume_removal(self): + """ + Tests legacy subvolume removal if it's group quota is set. + """ + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # emulate a old-fashioned subvolume -- in a custom group + createpath1 = os.path.join(".", "volumes", group, subvolume) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath1], omit_sudo=False) + + # this would auto-upgrade on access without anyone noticing + subvolpath1 = self._fs_cmd("subvolume", "getpath", self.volname, subvolume, "--group-name", group) + self.assertNotEqual(subvolpath1, None) + subvolpath1 = subvolpath1.rstrip() # remove "/" prefix and any trailing newline + + # and... the subvolume path returned should be what we created behind the scene + self.assertEqual(createpath1[1:], subvolpath1) + + # Set subvolumegroup quota on idempotent subvolumegroup creation + self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000") + + # remove subvolume + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume rm' command to succeed if group quota is set") + + # remove subvolumegroup + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_quota_v1_subvolume_removal(self): + """ + Tests v1 subvolume removal if it's group quota is set. 
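# Illustrative sketch (not part of the patch): the legacy-subvolume check below
# relies on "subvolume getpath" returning an absolute in-filesystem path, while
# the directory was created relative to the mount root as "./volumes/...", so
# stripping the leading "." makes the two comparable. Names are hypothetical.
import os

createpath = os.path.join(".", "volumes", "mygroup", "mysubvol")   # "./volumes/mygroup/mysubvol"
getpath    = "/volumes/mygroup/mysubvol"                           # what getpath would report
assert createpath[1:] == getpath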
+ """ + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # emulate a v1 subvolume -- in a custom group + self._create_v1_subvolume(subvolume, subvol_group=group, has_snapshot=False) + + # Set subvolumegroup quota on idempotent subvolumegroup creation + self._fs_cmd("subvolumegroup", "create", self.volname, group, "1000000000") + + # remove subvolume + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume rm' command to succeed if group quota is set") + + # remove subvolumegroup + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_resize_fail_invalid_size(self): + """ + That a subvolume group cannot be resized to an invalid size and the quota did not change + """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024 + # create group with 1MB quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, "--size", str(osize)) + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # try to resize the subvolume with an invalid size -10 + nsize = -10 + try: + self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize)) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, + "invalid error code on resize of subvolume group with invalid size") + else: + self.fail("expected the 'fs subvolumegroup resize' command to fail") + + # verify the quota did not change + size = int(self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes")) + self.assertEqual(size, osize) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_resize_fail_zero_size(self): + """ + That a subvolume group cannot be resized to a zero size and the quota did not change + """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024 + # create group with 1MB quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, "--size", str(osize)) + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # try to resize the subvolume group with size 0 + nsize = 0 + try: + self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize)) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, + "invalid error code on resize of subvolume group with invalid size") + else: + self.fail("expected the 'fs subvolumegroup resize' command to fail") + + # verify the quota did not change + size = int(self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes")) + self.assertEqual(size, osize) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_resize_quota_lt_used_size(self): + """ + That a subvolume group can be resized to a size smaller than the current used size + and the resulting quota matches the expected size. 
+ """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024*20 + # create group with 20MB quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, + "--size", str(osize), "--mode=777") + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # create subvolume under the group + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, + "--group_name", group, "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group) + self.assertNotEqual(subvolpath, None) + + # create one file of 10MB + file_size=self.DEFAULT_FILE_SIZE*10 + number_of_files=1 + log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname, + number_of_files, + file_size)) + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+1) + self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size) + + usedsize = int(self.mount_a.getfattr(subvolpath, "ceph.dir.rbytes")) + + # shrink the subvolume group + nsize = usedsize // 2 + try: + self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize)) + except CommandFailedError: + self.fail("expected the 'fs subvolumegroup resize' command to succeed") + + # verify the quota + size = int(self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes")) + self.assertEqual(size, nsize) + + # remove subvolume and group + self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_resize_fail_quota_lt_used_size_no_shrink(self): + """ + That a subvolume group cannot be resized to a size smaller than the current used size + when --no_shrink is given and the quota did not change. 
+ """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024*20 + # create group with 20MB quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, + "--size", str(osize), "--mode=777") + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # create subvolume under the group + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, + "--group_name", group, "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group) + self.assertNotEqual(subvolpath, None) + + # create one file of 10MB + file_size=self.DEFAULT_FILE_SIZE*10 + number_of_files=1 + log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname, + number_of_files, + file_size)) + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+2) + self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size) + + usedsize = int(self.mount_a.getfattr(grouppath, "ceph.dir.rbytes")) + + # shrink the subvolume group + nsize = usedsize // 2 + try: + self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize), "--no_shrink") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolumegroup with quota less than used") + else: + self.fail("expected the 'fs subvolumegroup resize' command to fail") + + # verify the quota did not change + size = int(self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes")) + self.assertEqual(size, osize) + + # remove subvolume and group + self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_resize_expand_on_full_subvolume(self): + """ + That the subvolume group can be expanded after it is full and future write succeed + """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024*100 + # create group with 100MB quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, + "--size", str(osize), "--mode=777") + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # create subvolume under the group + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, + "--group_name", group, "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group) + self.assertNotEqual(subvolpath, None) + + # create 99 files of 1MB + self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=99) + + try: + # write two files of 1MB file to exceed the quota + self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=2) + # For quota to be enforced + time.sleep(20) + # create 500 files of 1MB + self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=500) + except CommandFailedError: + # Not able to write. 
So expand the subvolumegroup more and try writing the files again + nsize = osize*7 + self._fs_cmd("subvolumegroup", "resize", self.volname, group, str(nsize)) + try: + self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=500) + except CommandFailedError: + self.fail("expected filling subvolume {0} with 500 files of size 1MB " + "to succeed".format(subvolname)) + else: + self.fail("expected filling subvolume {0} with 500 files of size 1MB " + "to fail".format(subvolname)) + + # remove subvolume and group + self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_resize_infinite_size(self): + """ + That a subvolume group can be resized to an infinite size by unsetting its quota. + """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024 + # create group + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, + "--size", str(osize)) + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # resize inf + self._fs_cmd("subvolumegroup", "resize", self.volname, group, "inf") + + # verify that the quota is None + size = self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes") + self.assertEqual(size, None) + + # remove subvolume group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_group_resize_infinite_size_future_writes(self): + """ + That a subvolume group can be resized to an infinite size and the future writes succeed. + """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024*5 + # create group with 5MB quota + group = self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group, + "--size", str(osize), "--mode=777") + + # make sure it exists + grouppath = self._get_subvolume_group_path(self.volname, group) + self.assertNotEqual(grouppath, None) + + # create subvolume under the group + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, + "--group_name", group, "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname, group_name=group) + self.assertNotEqual(subvolpath, None) + + # create 4 files of 1MB + self._do_subvolume_io(subvolname, subvolume_group=group, number_of_files=4) + + try: + # write two files of 1MB file to exceed the quota + self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=2) + # For quota to be enforced + time.sleep(20) + # create 500 files of 1MB + self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=500) + except CommandFailedError: + # Not able to write. 
So resize subvolumegroup to 'inf' and try writing the files again + # resize inf + self._fs_cmd("subvolumegroup", "resize", self.volname, group, "inf") + try: + self._do_subvolume_io(subvolname, subvolume_group=group, create_dir='dir1', number_of_files=500) + except CommandFailedError: + self.fail("expected filling subvolume {0} with 500 files of size 1MB " + "to succeed".format(subvolname)) + else: + self.fail("expected filling subvolume {0} with 500 files of size 1MB " + "to fail".format(subvolname)) + + + # verify that the quota is None + size = self.mount_a.getfattr(grouppath, "ceph.quota.max_bytes") + self.assertEqual(size, None) + + # remove subvolume and group + self._fs_cmd("subvolume", "rm", self.volname, subvolname, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_ls(self): + # tests the 'fs subvolumegroup ls' command + + subvolumegroups = [] + + #create subvolumegroups + subvolumegroups = self._generate_random_group_name(3) + for groupname in subvolumegroups: + self._fs_cmd("subvolumegroup", "create", self.volname, groupname) + + subvolumegroupls = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname)) + if len(subvolumegroupls) == 0: + raise RuntimeError("Expected the 'fs subvolumegroup ls' command to list the created subvolume groups") + else: + subvolgroupnames = [subvolumegroup['name'] for subvolumegroup in subvolumegroupls] + if collections.Counter(subvolgroupnames) != collections.Counter(subvolumegroups): + raise RuntimeError("Error creating or listing subvolume groups") + + def test_subvolume_group_ls_filter(self): + # tests the 'fs subvolumegroup ls' command filters '_deleting' directory + + subvolumegroups = [] + + #create subvolumegroup + subvolumegroups = self._generate_random_group_name(3) + for groupname in subvolumegroups: + self._fs_cmd("subvolumegroup", "create", self.volname, groupname) + + # create subvolume and remove. This creates '_deleting' directory. 
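# Illustrative sketch (not part of the patch): "subvolumegroup ls" must hide the
# volume's internal directories; the names below are the ones the filter tests
# enumerate ('_deleting', '_nogroup', '_index', '_legacy').
INTERNAL_DIRS = {"_deleting", "_nogroup", "_index", "_legacy"}

def visible_groups(entries):
    return [name for name in entries if name not in INTERNAL_DIRS]

assert visible_groups(["group1", "_deleting", "_nogroup", "group2"]) == ["group1", "group2"]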
+ subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + subvolumegroupls = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname)) + subvolgroupnames = [subvolumegroup['name'] for subvolumegroup in subvolumegroupls] + if "_deleting" in subvolgroupnames: + self.fail("Listing subvolume groups listed '_deleting' directory") + + def test_subvolume_group_ls_filter_internal_directories(self): + # tests the 'fs subvolumegroup ls' command filters internal directories + # eg: '_deleting', '_nogroup', '_index', "_legacy" + + subvolumegroups = self._generate_random_group_name(3) + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + #create subvolumegroups + for groupname in subvolumegroups: + self._fs_cmd("subvolumegroup", "create", self.volname, groupname) + + # create subvolume which will create '_nogroup' directory + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # create snapshot + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # clone snapshot which will create '_index' directory + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # wait for clone to complete + self._wait_for_clone_to_complete(clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume which will create '_deleting' directory + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # list subvolumegroups + ret = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname)) + self.assertEqual(len(ret), len(subvolumegroups)) + + ret_list = [subvolumegroup['name'] for subvolumegroup in ret] + self.assertEqual(len(ret_list), len(subvolumegroups)) + + self.assertEqual(all(elem in subvolumegroups for elem in ret_list), True) + + # cleanup + self._fs_cmd("subvolume", "rm", self.volname, clone) + for groupname in subvolumegroups: + self._fs_cmd("subvolumegroup", "rm", self.volname, groupname) + + def test_subvolume_group_ls_for_nonexistent_volume(self): + # tests the 'fs subvolumegroup ls' command when /volume doesn't exist + # prerequisite: we expect that the test volume is created and a subvolumegroup is NOT created + + # list subvolume groups + subvolumegroupls = json.loads(self._fs_cmd('subvolumegroup', 'ls', self.volname)) + if len(subvolumegroupls) > 0: + raise RuntimeError("Expected the 'fs subvolumegroup ls' command to output an empty list") + + def test_subvolumegroup_pin_distributed(self): + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + self.config_set('mds', 'mds_export_ephemeral_distributed', True) + + group = "pinme" + self._fs_cmd("subvolumegroup", "create", self.volname, group) + self._fs_cmd("subvolumegroup", "pin", self.volname, group, "distributed", "True") + subvolumes = self._generate_random_subvolume_name(50) + for subvolume in subvolumes: + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + self._wait_distributed_subtrees(2 * 2, status=status, rank="all") + + # remove subvolumes + for subvolume in subvolumes: + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_group_rm_force(self): + # test removing non-existing subvolume group with --force + group = 
self._generate_random_group_name() + try: + self._fs_cmd("subvolumegroup", "rm", self.volname, group, "--force") + except CommandFailedError: + raise RuntimeError("expected the 'fs subvolumegroup rm --force' command to succeed") + + def test_subvolume_group_exists_with_subvolumegroup_and_no_subvolume(self): + """Test the presence of any subvolumegroup when only subvolumegroup is present""" + + group = self._generate_random_group_name() + # create subvolumegroup + self._fs_cmd("subvolumegroup", "create", self.volname, group) + ret = self._fs_cmd("subvolumegroup", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "subvolumegroup exists") + # delete subvolumegroup + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + ret = self._fs_cmd("subvolumegroup", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "no subvolumegroup exists") + + def test_subvolume_group_exists_with_no_subvolumegroup_and_subvolume(self): + """Test the presence of any subvolumegroup when no subvolumegroup is present""" + + ret = self._fs_cmd("subvolumegroup", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "no subvolumegroup exists") + + def test_subvolume_group_exists_with_subvolumegroup_and_subvolume(self): + """Test the presence of any subvolume when subvolumegroup + and subvolume both are present""" + + group = self._generate_random_group_name() + subvolume = self._generate_random_subvolume_name(2) + # create subvolumegroup + self._fs_cmd("subvolumegroup", "create", self.volname, group) + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume[0], "--group_name", group) + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume[1]) + ret = self._fs_cmd("subvolumegroup", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "subvolumegroup exists") + # delete subvolume in group + self._fs_cmd("subvolume", "rm", self.volname, subvolume[0], "--group_name", group) + ret = self._fs_cmd("subvolumegroup", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "subvolumegroup exists") + # delete subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume[1]) + ret = self._fs_cmd("subvolumegroup", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "subvolumegroup exists") + # delete subvolumegroup + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + ret = self._fs_cmd("subvolumegroup", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "no subvolumegroup exists") + + def test_subvolume_group_exists_without_subvolumegroup_and_with_subvolume(self): + """Test the presence of any subvolume when subvolume is present + but no subvolumegroup is present""" + + subvolume = self._generate_random_subvolume_name() + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + ret = self._fs_cmd("subvolumegroup", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "no subvolumegroup exists") + # delete subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + ret = self._fs_cmd("subvolumegroup", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "no subvolumegroup exists") + + +class TestSubvolumes(TestVolumesHelper): + """Tests for FS subvolume operations, except snapshot and snapshot clone.""" + def test_async_subvolume_rm(self): + subvolumes = self._generate_random_subvolume_name(100) + + # create subvolumes + for subvolume in subvolumes: + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + self._do_subvolume_io(subvolume, 
number_of_files=10) + + self.mount_a.umount_wait() + + # remove subvolumes + for subvolume in subvolumes: + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + self.mount_a.mount_wait() + + # verify trash dir is clean + self._wait_for_trash_empty(timeout=300) + + def test_default_uid_gid_subvolume(self): + subvolume = self._generate_random_subvolume_name() + expected_uid = 0 + expected_gid = 0 + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + subvol_path = self._get_subvolume_path(self.volname, subvolume) + + # check subvolume's uid and gid + stat = self.mount_a.stat(subvol_path) + self.assertEqual(stat['st_uid'], expected_uid) + self.assertEqual(stat['st_gid'], expected_gid) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_nonexistent_subvolume_rm(self): + # remove non-existing subvolume + subvolume = "non_existent_subvolume" + + # try, remove subvolume + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise + else: + raise RuntimeError("expected the 'fs subvolume rm' command to fail") + + def test_subvolume_create_and_rm(self): + # create subvolume + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # make sure it exists + subvolpath = self._fs_cmd("subvolume", "getpath", self.volname, subvolume) + self.assertNotEqual(subvolpath, None) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + # make sure its gone + try: + self._fs_cmd("subvolume", "getpath", self.volname, subvolume) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise + else: + raise RuntimeError("expected the 'fs subvolume getpath' command to fail. 
Subvolume not removed.") + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_and_rm_in_group(self): + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_create_idempotence(self): + # create subvolume + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # try creating w/ same subvolume name -- should be idempotent + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_idempotence_resize(self): + # create subvolume + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # try creating w/ same subvolume name with size -- should set quota + self._fs_cmd("subvolume", "create", self.volname, subvolume, "1000000000") + + # get subvolume metadata + subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) + self.assertEqual(subvol_info["bytes_quota"], 1000000000) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_idempotence_mode(self): + # default mode + default_mode = "755" + + # create subvolume + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + subvol_path = self._get_subvolume_path(self.volname, subvolume) + + actual_mode_1 = self.mount_a.run_shell(['stat', '-c' '%a', subvol_path]).stdout.getvalue().strip() + self.assertEqual(actual_mode_1, default_mode) + + # try creating w/ same subvolume name with --mode 777 + new_mode = "777" + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", new_mode) + + actual_mode_2 = self.mount_a.run_shell(['stat', '-c' '%a', subvol_path]).stdout.getvalue().strip() + self.assertEqual(actual_mode_2, new_mode) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_idempotence_without_passing_mode(self): + # create subvolume + desired_mode = "777" + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", desired_mode) + + subvol_path = self._get_subvolume_path(self.volname, subvolume) + + actual_mode_1 = self.mount_a.run_shell(['stat', '-c' '%a', subvol_path]).stdout.getvalue().strip() + self.assertEqual(actual_mode_1, desired_mode) + + # default mode + default_mode = "755" + + # try creating w/ same subvolume name without passing --mode argument + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + actual_mode_2 = self.mount_a.run_shell(['stat', '-c' '%a', subvol_path]).stdout.getvalue().strip() + self.assertEqual(actual_mode_2, default_mode) + + # remove subvolume + self._fs_cmd("subvolume", "rm", 
self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_isolated_namespace(self): + """ + Create subvolume in separate rados namespace + """ + + # create subvolume + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--namespace-isolated") + + # get subvolume metadata + subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) + self.assertNotEqual(len(subvol_info), 0) + self.assertEqual(subvol_info["pool_namespace"], "fsvolumens_" + subvolume) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_with_auto_cleanup_on_fail(self): + subvolume = self._generate_random_subvolume_name() + data_pool = "invalid_pool" + # create subvolume with invalid data pool layout fails + with self.assertRaises(CommandFailedError): + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--pool_layout", data_pool) + + # check whether subvol path is cleaned up + try: + self._fs_cmd("subvolume", "getpath", self.volname, subvolume) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on getpath of non-existent subvolume") + else: + self.fail("expected the 'fs subvolume getpath' command to fail") + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_with_desired_data_pool_layout_in_group(self): + subvol1, subvol2 = self._generate_random_subvolume_name(2) + group = self._generate_random_group_name() + + # create group. this also helps set default pool layout for subvolumes + # created within the group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. 
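(Aside: the data pool requested via "--pool_layout" in the calls that follow is surfaced as the "ceph.dir.layout.pool" extended attribute on the subvolume directory, which is what the getfattr() calls below read. A minimal standalone check outside the teuthology harness might look like this sketch; the volume, subvolume, group and mount-point names are placeholders, not values used by this test.)

    import subprocess

    # Resolve the subvolume path, then read the layout xattr that records its data pool.
    # "cephfs", "subvol2", "group1" and /mnt/cephfs are hypothetical names/paths.
    path = subprocess.check_output(
        ["ceph", "fs", "subvolume", "getpath", "cephfs", "subvol2",
         "--group_name", "group1"]).decode().strip()
    pool = subprocess.check_output(
        ["getfattr", "--only-values", "-n", "ceph.dir.layout.pool",
         "/mnt/cephfs" + path]).decode().strip()
    print(pool)  # pool name (older kernels may report the pool id instead, as this test allows for)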
+ self._fs_cmd("subvolume", "create", self.volname, subvol1, "--group_name", group) + subvol1_path = self._get_subvolume_path(self.volname, subvol1, group_name=group) + + default_pool = self.mount_a.getfattr(subvol1_path, "ceph.dir.layout.pool") + new_pool = "new_pool" + self.assertNotEqual(default_pool, new_pool) + + # add data pool + newid = self.fs.add_data_pool(new_pool) + + # create subvolume specifying the new data pool as its pool layout + self._fs_cmd("subvolume", "create", self.volname, subvol2, "--group_name", group, + "--pool_layout", new_pool) + subvol2_path = self._get_subvolume_path(self.volname, subvol2, group_name=group) + + desired_pool = self.mount_a.getfattr(subvol2_path, "ceph.dir.layout.pool") + try: + self.assertEqual(desired_pool, new_pool) + except AssertionError: + self.assertEqual(int(desired_pool), newid) # old kernel returns id + + self._fs_cmd("subvolume", "rm", self.volname, subvol2, group) + self._fs_cmd("subvolume", "rm", self.volname, subvol1, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_with_desired_mode(self): + subvol1 = self._generate_random_subvolume_name() + + # default mode + default_mode = "755" + # desired mode + desired_mode = "777" + + self._fs_cmd("subvolume", "create", self.volname, subvol1, "--mode", "777") + + subvol1_path = self._get_subvolume_path(self.volname, subvol1) + + # check subvolumegroup's mode + subvol_par_path = os.path.dirname(subvol1_path) + group_path = os.path.dirname(subvol_par_path) + actual_mode1 = self.mount_a.run_shell(['stat', '-c' '%a', group_path]).stdout.getvalue().strip() + self.assertEqual(actual_mode1, default_mode) + # check /volumes mode + volumes_path = os.path.dirname(group_path) + actual_mode2 = self.mount_a.run_shell(['stat', '-c' '%a', volumes_path]).stdout.getvalue().strip() + self.assertEqual(actual_mode2, default_mode) + # check subvolume's mode + actual_mode3 = self.mount_a.run_shell(['stat', '-c' '%a', subvol1_path]).stdout.getvalue().strip() + self.assertEqual(actual_mode3, desired_mode) + + self._fs_cmd("subvolume", "rm", self.volname, subvol1) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_with_desired_mode_in_group(self): + subvol1, subvol2, subvol3 = self._generate_random_subvolume_name(3) + + group = self._generate_random_group_name() + # default mode + expected_mode1 = "755" + # desired mode + expected_mode2 = "777" + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvol1, "--group_name", group) + self._fs_cmd("subvolume", "create", self.volname, subvol2, "--group_name", group, "--mode", "777") + # check whether mode 0777 also works + self._fs_cmd("subvolume", "create", self.volname, subvol3, "--group_name", group, "--mode", "0777") + + subvol1_path = self._get_subvolume_path(self.volname, subvol1, group_name=group) + subvol2_path = self._get_subvolume_path(self.volname, subvol2, group_name=group) + subvol3_path = self._get_subvolume_path(self.volname, subvol3, group_name=group) + + # check subvolume's mode + actual_mode1 = self.mount_a.run_shell(['stat', '-c' '%a', subvol1_path]).stdout.getvalue().strip() + actual_mode2 = self.mount_a.run_shell(['stat', '-c' '%a', subvol2_path]).stdout.getvalue().strip() + actual_mode3 = self.mount_a.run_shell(['stat', '-c' '%a', subvol3_path]).stdout.getvalue().strip() + 
self.assertEqual(actual_mode1, expected_mode1) + self.assertEqual(actual_mode2, expected_mode2) + self.assertEqual(actual_mode3, expected_mode2) + + self._fs_cmd("subvolume", "rm", self.volname, subvol1, group) + self._fs_cmd("subvolume", "rm", self.volname, subvol2, group) + self._fs_cmd("subvolume", "rm", self.volname, subvol3, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_with_desired_uid_gid(self): + """ + That the subvolume can be created with the desired uid and gid and its uid and gid matches the + expected values. + """ + uid = 1000 + gid = 1000 + + # create subvolume + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--uid", str(uid), "--gid", str(gid)) + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # verify the uid and gid + suid = int(self.mount_a.run_shell(['stat', '-c' '%u', subvolpath]).stdout.getvalue().strip()) + sgid = int(self.mount_a.run_shell(['stat', '-c' '%g', subvolpath]).stdout.getvalue().strip()) + self.assertEqual(uid, suid) + self.assertEqual(gid, sgid) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_with_invalid_data_pool_layout(self): + subvolume = self._generate_random_subvolume_name() + data_pool = "invalid_pool" + # create subvolume with invalid data pool layout + try: + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--pool_layout", data_pool) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on create of subvolume with invalid pool layout") + else: + self.fail("expected the 'fs subvolume create' command to fail") + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_with_invalid_size(self): + # create subvolume with an invalid size -1 + subvolume = self._generate_random_subvolume_name() + try: + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--size", "-1") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on create of subvolume with invalid size") + else: + self.fail("expected the 'fs subvolume create' command to fail") + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_create_and_ls_providing_group_as_nogroup(self): + """ + That a 'subvolume create' and 'subvolume ls' should throw + permission denied error if option --group=_nogroup is provided. 
+ """ + + subvolname = self._generate_random_subvolume_name() + + # try to create subvolume providing --group_name=_nogroup option + try: + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", "_nogroup") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM) + else: + self.fail("expected the 'fs subvolume create' command to fail") + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolname) + + # try to list subvolumes providing --group_name=_nogroup option + try: + self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_nogroup") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM) + else: + self.fail("expected the 'fs subvolume ls' command to fail") + + # list subvolumes + self._fs_cmd("subvolume", "ls", self.volname) + + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_expand(self): + """ + That a subvolume can be expanded in size and its quota matches the expected size. + """ + + # create subvolume + subvolname = self._generate_random_subvolume_name() + osize = self.DEFAULT_FILE_SIZE*1024*1024 + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize)) + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # expand the subvolume + nsize = osize*2 + self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize)) + + # verify the quota + size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")) + self.assertEqual(size, nsize) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_info(self): + # tests the 'fs subvolume info' command + + subvol_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime", + "data_pool", "gid", "mode", "mon_addrs", "mtime", "path", "pool_namespace", + "type", "uid", "features", "state"] + + # create subvolume + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # get subvolume metadata + subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) + for md in subvol_md: + self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md)) + + self.assertEqual(subvol_info["bytes_pcent"], "undefined", "bytes_pcent should be set to undefined if quota is not set") + self.assertEqual(subvol_info["bytes_quota"], "infinite", "bytes_quota should be set to infinite if quota is not set") + self.assertEqual(subvol_info["pool_namespace"], "", "expected pool namespace to be empty") + self.assertEqual(subvol_info["state"], "complete", "expected state to be complete") + + self.assertEqual(len(subvol_info["features"]), 3, + msg="expected 3 features, found '{0}' ({1})".format(len(subvol_info["features"]), subvol_info["features"])) + for feature in ['snapshot-clone', 'snapshot-autoprotect', 'snapshot-retention']: + self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature)) + + nsize = self.DEFAULT_FILE_SIZE*1024*1024 + self._fs_cmd("subvolume", "resize", self.volname, subvolume, str(nsize)) + + # get subvolume metadata after quota set + subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) + for md in subvol_md: + self.assertIn(md, 
subvol_info, "'{0}' key not present in metadata of subvolume".format(md)) + + self.assertNotEqual(subvol_info["bytes_pcent"], "undefined", "bytes_pcent should not be set to undefined if quota is set") + self.assertEqual(subvol_info["bytes_quota"], nsize, "bytes_quota should be set to '{0}'".format(nsize)) + self.assertEqual(subvol_info["type"], "subvolume", "type should be set to subvolume") + self.assertEqual(subvol_info["state"], "complete", "expected state to be complete") + + self.assertEqual(len(subvol_info["features"]), 3, + msg="expected 3 features, found '{0}' ({1})".format(len(subvol_info["features"]), subvol_info["features"])) + for feature in ['snapshot-clone', 'snapshot-autoprotect', 'snapshot-retention']: + self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature)) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_ls(self): + # tests the 'fs subvolume ls' command + + subvolumes = [] + + # create subvolumes + subvolumes = self._generate_random_subvolume_name(3) + for subvolume in subvolumes: + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # list subvolumes + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + if len(subvolumels) == 0: + self.fail("Expected the 'fs subvolume ls' command to list the created subvolumes.") + else: + subvolnames = [subvolume['name'] for subvolume in subvolumels] + if collections.Counter(subvolnames) != collections.Counter(subvolumes): + self.fail("Error creating or listing subvolumes") + + # remove subvolumes + for subvolume in subvolumes: + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_ls_with_groupname_as_internal_directory(self): + # tests the 'fs subvolume ls' command when the group name given is one of the internal directories + # Eg: '_nogroup', '_legacy', '_deleting', '_index'. 
+ # Expecting 'fs subvolume ls' to fail with errno EINVAL for '_legacy', '_deleting', '_index' + # Expecting 'fs subvolume ls' to fail with errno EPERM for '_nogroup' + + # try to list subvolumes providing --group_name=_nogroup option + try: + self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_nogroup") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM) + else: + self.fail("expected the 'fs subvolume ls' command to fail with error 'EPERM' for _nogroup") + + # try to list subvolumes providing --group_name=_legacy option + try: + self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_legacy") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL) + else: + self.fail("expected the 'fs subvolume ls' command to fail with error 'EINVAL' for _legacy") + + # try to list subvolumes providing --group_name=_deleting option + try: + self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_deleting") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL) + else: + self.fail("expected the 'fs subvolume ls' command to fail with error 'EINVAL' for _deleting") + + # try to list subvolumes providing --group_name=_index option + try: + self._fs_cmd("subvolume", "ls", self.volname, "--group_name", "_index") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL) + else: + self.fail("expected the 'fs subvolume ls' command to fail with error 'EINVAL' for _index") + + def test_subvolume_ls_for_notexistent_default_group(self): + # tests the 'fs subvolume ls' command when the default group '_nogroup' doesn't exist + # prerequisite: we expect that the volume is created and the default group _nogroup is + # NOT created (i.e. 
a subvolume without group is not created) + + # list subvolumes + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + if len(subvolumels) > 0: + raise RuntimeError("Expected the 'fs subvolume ls' command to output an empty list.") + + def test_subvolume_marked(self): + """ + ensure a subvolume is marked with the ceph.dir.subvolume xattr + """ + subvolume = self._generate_random_subvolume_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # getpath + subvolpath = self._get_subvolume_path(self.volname, subvolume) + + # subdirectory of a subvolume cannot be moved outside the subvolume once marked with + # the xattr ceph.dir.subvolume, hence test by attempting to rename subvol path (incarnation) + # outside the subvolume + dstpath = os.path.join(self.mount_a.mountpoint, 'volumes', '_nogroup', 'new_subvol_location') + srcpath = os.path.join(self.mount_a.mountpoint, subvolpath) + rename_script = dedent(""" + import os + import errno + try: + os.rename("{src}", "{dst}") + except OSError as e: + if e.errno != errno.EXDEV: + raise RuntimeError("invalid error code on renaming subvolume incarnation out of subvolume directory") + else: + raise RuntimeError("expected renaming subvolume incarnation out of subvolume directory to fail") + """) + self.mount_a.run_python(rename_script.format(src=srcpath, dst=dstpath), sudo=True) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_pin_export(self): + self.fs.set_max_mds(2) + status = self.fs.wait_for_daemons() + + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + self._fs_cmd("subvolume", "pin", self.volname, subvolume, "export", "1") + path = self._fs_cmd("subvolume", "getpath", self.volname, subvolume) + path = os.path.dirname(path) # get subvolume path + + self._get_subtrees(status=status, rank=1) + self._wait_subtrees([(path, 1)], status=status) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + ### authorize operations + + def test_authorize_deauthorize_legacy_subvolume(self): + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + authid = "alice" + + guest_mount = self.mount_b + guest_mount.umount_wait() + + # emulate a old-fashioned subvolume in a custom group + createpath = os.path.join(".", "volumes", group, subvolume) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath], omit_sudo=False) + + # add required xattrs to subvolume + default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool") + self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True) + + mount_path = os.path.join("/", "volumes", group, subvolume) + + # authorize guest authID read-write access to subvolume + key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid, + "--group_name", group, "--tenant_id", "tenant_id") + + # guest authID should exist + existing_ids = [a['entity'] for a in self.auth_list()] + self.assertIn("client.{0}".format(authid), existing_ids) + + # configure credentials for guest client + self._configure_guest_auth(guest_mount, authid, key) + + # mount the subvolume, and write to it + guest_mount.mount_wait(cephfs_mntpt=mount_path) + guest_mount.write_n_mb("data.bin", 1) + + # authorize guest authID read access to subvolume 
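(Aside: the authorize call below reuses the same auth ID with "--access_level r". Re-authorizing an existing ID rewrites its caps and, as the test's reuse of the key suggests, prints the same cephx secret back, which is why the guest only notices the downgrade after the remount that follows. A rough standalone equivalent, with placeholder names:)

    import subprocess

    # Downgrade an already-authorized guest to read-only on the subvolume.
    # "cephfs", "sub0", "group0" and "alice" are hypothetical names.
    key = subprocess.check_output(
        ["ceph", "fs", "subvolume", "authorize", "cephfs", "sub0", "alice",
         "--group_name", "group0", "--access_level", "r"]).decode().strip()
    print(key)  # the client's existing secret is printed back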
+ key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid, + "--group_name", group, "--tenant_id", "tenant_id", "--access_level", "r") + + # guest client sees the change in access level to read only after a + # remount of the subvolume. + guest_mount.umount_wait() + guest_mount.mount_wait(cephfs_mntpt=mount_path) + + # read existing content of the subvolume + self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"]) + # cannot write into read-only subvolume + with self.assertRaises(CommandFailedError): + guest_mount.write_n_mb("rogue.bin", 1) + + # cleanup + guest_mount.umount_wait() + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid, + "--group_name", group) + # guest authID should no longer exist + existing_ids = [a['entity'] for a in self.auth_list()] + self.assertNotIn("client.{0}".format(authid), existing_ids) + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_authorize_deauthorize_subvolume(self): + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + authid = "alice" + + guest_mount = self.mount_b + guest_mount.umount_wait() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group, "--mode=777") + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + mount_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolume, + "--group_name", group).rstrip() + + # authorize guest authID read-write access to subvolume + key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid, + "--group_name", group, "--tenant_id", "tenant_id") + + # guest authID should exist + existing_ids = [a['entity'] for a in self.auth_list()] + self.assertIn("client.{0}".format(authid), existing_ids) + + # configure credentials for guest client + self._configure_guest_auth(guest_mount, authid, key) + + # mount the subvolume, and write to it + guest_mount.mount_wait(cephfs_mntpt=mount_path) + guest_mount.write_n_mb("data.bin", 1) + + # authorize guest authID read access to subvolume + key = self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid, + "--group_name", group, "--tenant_id", "tenant_id", "--access_level", "r") + + # guest client sees the change in access level to read only after a + # remount of the subvolume. + guest_mount.umount_wait() + guest_mount.mount_wait(cephfs_mntpt=mount_path) + + # read existing content of the subvolume + self.assertListEqual(guest_mount.ls(guest_mount.mountpoint), ["data.bin"]) + # cannot write into read-only subvolume + with self.assertRaises(CommandFailedError): + guest_mount.write_n_mb("rogue.bin", 1) + + # cleanup + guest_mount.umount_wait() + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid, + "--group_name", group) + # guest authID should no longer exist + existing_ids = [a['entity'] for a in self.auth_list()] + self.assertNotIn("client.{0}".format(authid), existing_ids) + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_multitenant_subvolumes(self): + """ + That subvolume access can be restricted to a tenant. + + That metadata used to enforce tenant isolation of + subvolumes is stored as a two-way mapping between auth + IDs and subvolumes that they're authorized to access. 
+ """ + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + guest_mount = self.mount_b + + # Guest clients belonging to different tenants, but using the same + # auth ID. + auth_id = "alice" + guestclient_1 = { + "auth_id": auth_id, + "tenant_id": "tenant1", + } + guestclient_2 = { + "auth_id": auth_id, + "tenant_id": "tenant2", + } + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # Check that subvolume metadata file is created on subvolume creation. + subvol_metadata_filename = "_{0}:{1}.meta".format(group, subvolume) + self.assertIn(subvol_metadata_filename, guest_mount.ls("volumes")) + + # Authorize 'guestclient_1', using auth ID 'alice' and belonging to + # 'tenant1', with 'rw' access to the volume. + self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + # Check that auth metadata file for auth ID 'alice', is + # created on authorizing 'alice' access to the subvolume. + auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"]) + self.assertIn(auth_metadata_filename, guest_mount.ls("volumes")) + + # Verify that the auth metadata file stores the tenant ID that the + # auth ID belongs to, the auth ID's authorized access levels + # for different subvolumes, versioning details, etc. + expected_auth_metadata = { + "version": 5, + "compat_version": 6, + "dirty": False, + "tenant_id": "tenant1", + "subvolumes": { + "{0}/{1}".format(group,subvolume): { + "dirty": False, + "access_level": "rw" + } + } + } + + auth_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(auth_metadata_filename))) + self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"]) + del expected_auth_metadata["version"] + del auth_metadata["version"] + self.assertEqual(expected_auth_metadata, auth_metadata) + + # Verify that the subvolume metadata file stores info about auth IDs + # and their access levels to the subvolume, versioning details, etc. + expected_subvol_metadata = { + "version": 1, + "compat_version": 1, + "auths": { + "alice": { + "dirty": False, + "access_level": "rw" + } + } + } + subvol_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(subvol_metadata_filename))) + + self.assertGreaterEqual(subvol_metadata["version"], expected_subvol_metadata["version"]) + del expected_subvol_metadata["version"] + del subvol_metadata["version"] + self.assertEqual(expected_subvol_metadata, subvol_metadata) + + # Cannot authorize 'guestclient_2' to access the volume. + # It uses auth ID 'alice', which has already been used by a + # 'guestclient_1' belonging to an another tenant for accessing + # the volume. + + try: + self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_2["auth_id"], + "--group_name", group, "--tenant_id", guestclient_2["tenant_id"]) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM, + "Invalid error code returned on authorize of subvolume with same auth_id but different tenant_id") + else: + self.fail("expected the 'fs subvolume authorize' command to fail") + + # Check that auth metadata file is cleaned up on removing + # auth ID's only access to a volume. 
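(Aside: the "$<auth_id>.meta" and "_<group>:<subvolume>.meta" files exercised above live directly under the volume's "volumes" directory. Judging by the expected dictionaries in this test they are plain JSON, so a quick manual inspection could be done roughly as below, with a placeholder mount point and auth ID.)

    import json

    # Hypothetical path: auth metadata for auth ID "alice" on a volume mounted at /mnt/cephfs.
    with open("/mnt/cephfs/volumes/$alice.meta") as f:
        meta = json.load(f)

    # Print which subvolumes this auth ID can reach and at what access level.
    for subvol, info in meta.get("subvolumes", {}).items():
        print(subvol, info["access_level"])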
+ + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id, + "--group_name", group) + self.assertNotIn(auth_metadata_filename, guest_mount.ls("volumes")) + + # Check that subvolume metadata file is cleaned up on subvolume deletion. + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + self.assertNotIn(subvol_metadata_filename, guest_mount.ls("volumes")) + + # clean up + guest_mount.umount_wait() + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_authorized_list(self): + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + authid1 = "alice" + authid2 = "guest1" + authid3 = "guest2" + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # authorize alice authID read-write access to subvolume + self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid1, + "--group_name", group) + # authorize guest1 authID read-write access to subvolume + self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid2, + "--group_name", group) + # authorize guest2 authID read access to subvolume + self._fs_cmd("subvolume", "authorize", self.volname, subvolume, authid3, + "--group_name", group, "--access_level", "r") + + # list authorized-ids of the subvolume + expected_auth_list = [{'alice': 'rw'}, {'guest1': 'rw'}, {'guest2': 'r'}] + auth_list = json.loads(self._fs_cmd('subvolume', 'authorized_list', self.volname, subvolume, "--group_name", group)) + self.assertCountEqual(expected_auth_list, auth_list) + + # cleanup + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid1, + "--group_name", group) + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid2, + "--group_name", group) + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, authid3, + "--group_name", group) + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_authorize_auth_id_not_created_by_mgr_volumes(self): + """ + If the auth_id already exists and is not created by mgr plugin, + it's not allowed to authorize the auth-id by default. 
+ """ + + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # Create auth_id + self.fs.mon_manager.raw_cluster_cmd( + "auth", "get-or-create", "client.guest1", + "mds", "allow *", + "osd", "allow rw", + "mon", "allow *" + ) + + auth_id = "guest1" + guestclient_1 = { + "auth_id": auth_id, + "tenant_id": "tenant1", + } + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + try: + self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EPERM, + "Invalid error code returned on authorize of subvolume for auth_id created out of band") + else: + self.fail("expected the 'fs subvolume authorize' command to fail") + + # clean up + self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1") + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_authorize_allow_existing_id_option(self): + """ + If the auth_id already exists and is not created by mgr volumes, + it's not allowed to authorize the auth-id by default but is + allowed with option allow_existing_id. + """ + + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # Create auth_id + self.fs.mon_manager.raw_cluster_cmd( + "auth", "get-or-create", "client.guest1", + "mds", "allow *", + "osd", "allow rw", + "mon", "allow *" + ) + + auth_id = "guest1" + guestclient_1 = { + "auth_id": auth_id, + "tenant_id": "tenant1", + } + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # Cannot authorize 'guestclient_1' to access the volume by default, + # which already exists and not created by mgr volumes but is allowed + # with option 'allow_existing_id'. + self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"], "--allow-existing-id") + + # clean up + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id, + "--group_name", group) + self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1") + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_deauthorize_auth_id_after_out_of_band_update(self): + """ + If the auth_id authorized by mgr/volumes plugin is updated + out of band, the auth_id should not be deleted after a + deauthorize. It should only remove caps associated with it. + """ + + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + auth_id = "guest1" + guestclient_1 = { + "auth_id": auth_id, + "tenant_id": "tenant1", + } + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # Authorize 'guestclient_1' to access the subvolume. 
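(Aside: the cephx entity that "fs subvolume authorize" creates can be inspected directly, which is what this test does further down with "auth get" after changing the caps out of band. A rough standalone version, assuming the auth ID "guest1" used here:)

    import json
    import subprocess

    # Dump the caps of the client entity created by "fs subvolume authorize".
    raw = subprocess.check_output(
        ["ceph", "auth", "get", "client.guest1", "--format=json"])
    entity = json.loads(raw)[0]
    print(entity["entity"])           # "client.guest1"
    print(entity["caps"].get("mds"))  # an MDS cap restricted to the subvolume path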
+ self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + subvol_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolume, + "--group_name", group).rstrip() + + # Update caps for guestclient_1 out of band + out = self.fs.mon_manager.raw_cluster_cmd( + "auth", "caps", "client.guest1", + "mds", "allow rw path=/volumes/{0}, allow rw path={1}".format(group, subvol_path), + "osd", "allow rw pool=cephfs_data", + "mon", "allow r", + "mgr", "allow *" + ) + + # Deauthorize guestclient_1 + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id, "--group_name", group) + + # Validate the caps of guestclient_1 after deauthorize. It should not have deleted + # guestclient_1. The mgr and mds caps should be present which was updated out of band. + out = json.loads(self.fs.mon_manager.raw_cluster_cmd("auth", "get", "client.guest1", "--format=json-pretty")) + + self.assertEqual("client.guest1", out[0]["entity"]) + self.assertEqual("allow rw path=/volumes/{0}".format(group), out[0]["caps"]["mds"]) + self.assertEqual("allow *", out[0]["caps"]["mgr"]) + self.assertNotIn("osd", out[0]["caps"]) + + # clean up + out = self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1") + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_recover_auth_metadata_during_authorize(self): + """ + That auth metadata manager can recover from partial auth updates using + metadata files, which store auth info and its update status info. This + test validates the recovery during authorize. + """ + + guest_mount = self.mount_b + + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + auth_id = "guest1" + guestclient_1 = { + "auth_id": auth_id, + "tenant_id": "tenant1", + } + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # Authorize 'guestclient_1' to access the subvolume. + self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + # Check that auth metadata file for auth ID 'guest1', is + # created on authorizing 'guest1' access to the subvolume. + auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"]) + self.assertIn(auth_metadata_filename, guest_mount.ls("volumes")) + expected_auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename))) + + # Induce partial auth update state by modifying the auth metadata file, + # and then run authorize again. + guest_mount.run_shell(['sudo', 'sed', '-i', 's/false/true/g', 'volumes/{0}'.format(auth_metadata_filename)], omit_sudo=False) + + # Authorize 'guestclient_1' to access the subvolume. 
+ self._fs_cmd("subvolume", "authorize", self.volname, subvolume, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename))) + self.assertEqual(auth_metadata_content, expected_auth_metadata_content) + + # clean up + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume, auth_id, "--group_name", group) + guest_mount.umount_wait() + self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1") + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_recover_auth_metadata_during_deauthorize(self): + """ + That auth metadata manager can recover from partial auth updates using + metadata files, which store auth info and its update status info. This + test validates the recovery during deauthorize. + """ + + guest_mount = self.mount_b + + subvolume1, subvolume2 = self._generate_random_subvolume_name(2) + group = self._generate_random_group_name() + + guestclient_1 = { + "auth_id": "guest1", + "tenant_id": "tenant1", + } + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolumes in group + self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group) + self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--group_name", group) + + # Authorize 'guestclient_1' to access the subvolume1. + self._fs_cmd("subvolume", "authorize", self.volname, subvolume1, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + # Check that auth metadata file for auth ID 'guest1', is + # created on authorizing 'guest1' access to the subvolume1. + auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"]) + self.assertIn(auth_metadata_filename, guest_mount.ls("volumes")) + expected_auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename))) + + # Authorize 'guestclient_1' to access the subvolume2. + self._fs_cmd("subvolume", "authorize", self.volname, subvolume2, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + # Induce partial auth update state by modifying the auth metadata file, + # and then run de-authorize. + guest_mount.run_shell(['sudo', 'sed', '-i', 's/false/true/g', 'volumes/{0}'.format(auth_metadata_filename)], omit_sudo=False) + + # Deauthorize 'guestclient_1' to access the subvolume2. 
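(Aside: the sed call above rewrites every "false" in the stored auth metadata to "true", so the per-entry "dirty" markers that track in-flight updates all look set; the deauthorize below is expected to notice this and repair the file. The snippet here only illustrates the textual substitution on hypothetical, simplified content.)

    import json

    # Simplified stand-in for the auth metadata file's content before the edit.
    before = {"dirty": False,
              "subvolumes": {"group/sub1": {"dirty": False, "access_level": "rw"}}}
    # The sed performs exactly this substitution on the serialized JSON.
    after = json.loads(json.dumps(before).replace("false", "true"))
    print(after["dirty"], after["subvolumes"]["group/sub1"]["dirty"])  # True True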
+ self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume2, guestclient_1["auth_id"], + "--group_name", group) + + auth_metadata_content = self._auth_metadata_get(self.mount_a.read_file("volumes/{0}".format(auth_metadata_filename))) + self.assertEqual(auth_metadata_content, expected_auth_metadata_content) + + # clean up + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume1, "guest1", "--group_name", group) + guest_mount.umount_wait() + self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1") + self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group) + self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_update_old_style_auth_metadata_to_new_during_authorize(self): + """ + CephVolumeClient stores the subvolume data in auth metadata file with + 'volumes' key as there was no subvolume namespace. It doesn't makes sense + with mgr/volumes. This test validates the transparent update of 'volumes' + key to 'subvolumes' key in auth metadata file during authorize. + """ + + guest_mount = self.mount_b + + subvolume1, subvolume2 = self._generate_random_subvolume_name(2) + group = self._generate_random_group_name() + + auth_id = "guest1" + guestclient_1 = { + "auth_id": auth_id, + "tenant_id": "tenant1", + } + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolumes in group + self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group) + self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--group_name", group) + + # Authorize 'guestclient_1' to access the subvolume1. + self._fs_cmd("subvolume", "authorize", self.volname, subvolume1, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + # Check that auth metadata file for auth ID 'guest1', is + # created on authorizing 'guest1' access to the subvolume1. + auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"]) + self.assertIn(auth_metadata_filename, guest_mount.ls("volumes")) + + # Replace 'subvolumes' to 'volumes', old style auth-metadata file + guest_mount.run_shell(['sudo', 'sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)], omit_sudo=False) + + # Authorize 'guestclient_1' to access the subvolume2. 
This should transparently update 'volumes' to 'subvolumes' + self._fs_cmd("subvolume", "authorize", self.volname, subvolume2, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + expected_auth_metadata = { + "version": 5, + "compat_version": 6, + "dirty": False, + "tenant_id": "tenant1", + "subvolumes": { + "{0}/{1}".format(group,subvolume1): { + "dirty": False, + "access_level": "rw" + }, + "{0}/{1}".format(group,subvolume2): { + "dirty": False, + "access_level": "rw" + } + } + } + + auth_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(auth_metadata_filename))) + + self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"]) + del expected_auth_metadata["version"] + del auth_metadata["version"] + self.assertEqual(expected_auth_metadata, auth_metadata) + + # clean up + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume1, auth_id, "--group_name", group) + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume2, auth_id, "--group_name", group) + guest_mount.umount_wait() + self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1") + self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group) + self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_update_old_style_auth_metadata_to_new_during_deauthorize(self): + """ + CephVolumeClient stores the subvolume data in auth metadata file with + 'volumes' key as there was no subvolume namespace. It doesn't makes sense + with mgr/volumes. This test validates the transparent update of 'volumes' + key to 'subvolumes' key in auth metadata file during deauthorize. + """ + + guest_mount = self.mount_b + + subvolume1, subvolume2 = self._generate_random_subvolume_name(2) + group = self._generate_random_group_name() + + auth_id = "guest1" + guestclient_1 = { + "auth_id": auth_id, + "tenant_id": "tenant1", + } + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolumes in group + self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group) + self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--group_name", group) + + # Authorize 'guestclient_1' to access the subvolume1. + self._fs_cmd("subvolume", "authorize", self.volname, subvolume1, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + # Authorize 'guestclient_1' to access the subvolume2. + self._fs_cmd("subvolume", "authorize", self.volname, subvolume2, guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + # Check that auth metadata file for auth ID 'guest1', is created. + auth_metadata_filename = "${0}.meta".format(guestclient_1["auth_id"]) + self.assertIn(auth_metadata_filename, guest_mount.ls("volumes")) + + # Replace 'subvolumes' to 'volumes', old style auth-metadata file + guest_mount.run_shell(['sudo', 'sed', '-i', 's/subvolumes/volumes/g', 'volumes/{0}'.format(auth_metadata_filename)], omit_sudo=False) + + # Deauthorize 'guestclient_1' to access the subvolume2. 
This should update 'volumes' to subvolumes' + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume2, auth_id, "--group_name", group) + + expected_auth_metadata = { + "version": 5, + "compat_version": 6, + "dirty": False, + "tenant_id": "tenant1", + "subvolumes": { + "{0}/{1}".format(group,subvolume1): { + "dirty": False, + "access_level": "rw" + } + } + } + + auth_metadata = self._auth_metadata_get(guest_mount.read_file("volumes/{0}".format(auth_metadata_filename))) + + self.assertGreaterEqual(auth_metadata["version"], expected_auth_metadata["version"]) + del expected_auth_metadata["version"] + del auth_metadata["version"] + self.assertEqual(expected_auth_metadata, auth_metadata) + + # clean up + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolume1, auth_id, "--group_name", group) + guest_mount.umount_wait() + self.fs.mon_manager.raw_cluster_cmd("auth", "rm", "client.guest1") + self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group) + self._fs_cmd("subvolume", "rm", self.volname, subvolume2, "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_evict_client(self): + """ + That a subvolume client can be evicted based on the auth ID + """ + + subvolumes = self._generate_random_subvolume_name(2) + group = self._generate_random_group_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # mounts[0] and mounts[1] would be used as guests to mount the volumes/shares. + for i in range(0, 2): + self.mounts[i].umount_wait() + guest_mounts = (self.mounts[0], self.mounts[1]) + auth_id = "guest" + guestclient_1 = { + "auth_id": auth_id, + "tenant_id": "tenant1", + } + + # Create two subvolumes. Authorize 'guest' auth ID to mount the two + # subvolumes. Mount the two subvolumes. Write data to the volumes. + for i in range(2): + # Create subvolume. + self._fs_cmd("subvolume", "create", self.volname, subvolumes[i], "--group_name", group, "--mode=777") + + # authorize guest authID read-write access to subvolume + key = self._fs_cmd("subvolume", "authorize", self.volname, subvolumes[i], guestclient_1["auth_id"], + "--group_name", group, "--tenant_id", guestclient_1["tenant_id"]) + + mount_path = self._fs_cmd("subvolume", "getpath", self.volname, subvolumes[i], + "--group_name", group).rstrip() + # configure credentials for guest client + self._configure_guest_auth(guest_mounts[i], auth_id, key) + + # mount the subvolume, and write to it + guest_mounts[i].mount_wait(cephfs_mntpt=mount_path) + guest_mounts[i].write_n_mb("data.bin", 1) + + # Evict client, guest_mounts[0], using auth ID 'guest' and has mounted + # one volume. + self._fs_cmd("subvolume", "evict", self.volname, subvolumes[0], auth_id, "--group_name", group) + + # Evicted guest client, guest_mounts[0], should not be able to do + # anymore metadata ops. It should start failing all operations + # when it sees that its own address is in the blocklist. + try: + guest_mounts[0].write_n_mb("rogue.bin", 1) + except CommandFailedError: + pass + else: + raise RuntimeError("post-eviction write should have failed!") + + # The blocklisted guest client should now be unmountable + guest_mounts[0].umount_wait() + + # Guest client, guest_mounts[1], using the same auth ID 'guest', but + # has mounted the other volume, should be able to use its volume + # unaffected. + guest_mounts[1].write_n_mb("data.bin.1", 1) + + # Cleanup. 
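(Aside, before the cleanup: as the comment above notes, the evicted guest fails because its address lands on the cluster's client blocklist; the blocklist entries can be listed out of band, e.g.:)

    import subprocess

    # List the client addresses currently blocklisted by the cluster; the evicted
    # guest's address should appear here until the entry expires or is removed.
    print(subprocess.check_output(["ceph", "osd", "blocklist", "ls"]).decode())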
+ guest_mounts[1].umount_wait() + for i in range(2): + self._fs_cmd("subvolume", "deauthorize", self.volname, subvolumes[i], auth_id, "--group_name", group) + self._fs_cmd("subvolume", "rm", self.volname, subvolumes[i], "--group_name", group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_pin_random(self): + self.fs.set_max_mds(2) + self.fs.wait_for_daemons() + self.config_set('mds', 'mds_export_ephemeral_random', True) + + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + self._fs_cmd("subvolume", "pin", self.volname, subvolume, "random", ".01") + # no verification + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_resize_fail_invalid_size(self): + """ + That a subvolume cannot be resized to an invalid size and the quota did not change + """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024 + # create subvolume + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize)) + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # try to resize the subvolume with an invalid size -10 + nsize = -10 + try: + self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize)) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolume with invalid size") + else: + self.fail("expected the 'fs subvolume resize' command to fail") + + # verify the quota did not change + size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")) + self.assertEqual(size, osize) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_resize_fail_zero_size(self): + """ + That a subvolume cannot be resized to a zero size and the quota did not change + """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024 + # create subvolume + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize)) + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # try to resize the subvolume with size 0 + nsize = 0 + try: + self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize)) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolume with invalid size") + else: + self.fail("expected the 'fs subvolume resize' command to fail") + + # verify the quota did not change + size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")) + self.assertEqual(size, osize) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_resize_quota_lt_used_size(self): + """ + That a subvolume can be resized to a size smaller than the current used size + and the resulting quota matches the expected size. 
+ """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024*20 + # create subvolume + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize), "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # create one file of 10MB + file_size=self.DEFAULT_FILE_SIZE*10 + number_of_files=1 + log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname, + number_of_files, + file_size)) + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+1) + self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size) + + usedsize = int(self.mount_a.getfattr(subvolpath, "ceph.dir.rbytes")) + susedsize = int(self.mount_a.run_shell(['stat', '-c' '%s', subvolpath]).stdout.getvalue().strip()) + if isinstance(self.mount_a, FuseMount): + # kclient dir does not have size==rbytes + self.assertEqual(usedsize, susedsize) + + # shrink the subvolume + nsize = usedsize // 2 + try: + self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize)) + except CommandFailedError: + self.fail("expected the 'fs subvolume resize' command to succeed") + + # verify the quota + size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")) + self.assertEqual(size, nsize) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_resize_fail_quota_lt_used_size_no_shrink(self): + """ + That a subvolume cannot be resized to a size smaller than the current used size + when --no_shrink is given and the quota did not change. + """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024*20 + # create subvolume + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize), "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # create one file of 10MB + file_size=self.DEFAULT_FILE_SIZE*10 + number_of_files=1 + log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname, + number_of_files, + file_size)) + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+2) + self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size) + + usedsize = int(self.mount_a.getfattr(subvolpath, "ceph.dir.rbytes")) + susedsize = int(self.mount_a.run_shell(['stat', '-c' '%s', subvolpath]).stdout.getvalue().strip()) + if isinstance(self.mount_a, FuseMount): + # kclient dir does not have size==rbytes + self.assertEqual(usedsize, susedsize) + + # shrink the subvolume + nsize = usedsize // 2 + try: + self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize), "--no_shrink") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on resize of subvolume with invalid size") + else: + self.fail("expected the 'fs subvolume resize' command to fail") + + # verify the quota did not change + size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")) + self.assertEqual(size, osize) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_resize_expand_on_full_subvolume(self): + """ + That the subvolume can be expanded 
from a full subvolume and future writes succeed. + """ + + osize = self.DEFAULT_FILE_SIZE*1024*1024*10 + # create subvolume of quota 10MB and make sure it exists + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize), "--mode=777") + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # create one file of size 10MB and write + file_size=self.DEFAULT_FILE_SIZE*10 + number_of_files=1 + log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname, + number_of_files, + file_size)) + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+3) + self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size) + + # create a file of size 5MB and try write more + file_size=file_size // 2 + number_of_files=1 + log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname, + number_of_files, + file_size)) + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+4) + try: + self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size) + except CommandFailedError: + # Not able to write. So expand the subvolume more and try writing the 5MB file again + nsize = osize*2 + self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize)) + try: + self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size) + except CommandFailedError: + self.fail("expected filling subvolume {0} with {1} file of size {2}MB" + "to succeed".format(subvolname, number_of_files, file_size)) + else: + self.fail("expected filling subvolume {0} with {1} file of size {2}MB" + "to fail".format(subvolname, number_of_files, file_size)) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_resize_infinite_size(self): + """ + That a subvolume can be resized to an infinite size by unsetting its quota. + """ + + # create subvolume + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", + str(self.DEFAULT_FILE_SIZE*1024*1024)) + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # resize inf + self._fs_cmd("subvolume", "resize", self.volname, subvolname, "inf") + + # verify that the quota is None + size = self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes") + self.assertEqual(size, None) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_resize_infinite_size_future_writes(self): + """ + That a subvolume can be resized to an infinite size and the future writes succeed. 
+ """ + + # create subvolume + subvolname = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", + str(self.DEFAULT_FILE_SIZE*1024*1024*5), "--mode=777") + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # resize inf + self._fs_cmd("subvolume", "resize", self.volname, subvolname, "inf") + + # verify that the quota is None + size = self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes") + self.assertEqual(size, None) + + # create one file of 10MB and try to write + file_size=self.DEFAULT_FILE_SIZE*10 + number_of_files=1 + log.debug("filling subvolume {0} with {1} file of size {2}MB".format(subvolname, + number_of_files, + file_size)) + filename = "{0}.{1}".format(TestVolumes.TEST_FILE_NAME_PREFIX, self.DEFAULT_NUMBER_OF_FILES+5) + + try: + self.mount_a.write_n_mb(os.path.join(subvolpath, filename), file_size) + except CommandFailedError: + self.fail("expected filling subvolume {0} with {1} file of size {2}MB " + "to succeed".format(subvolname, number_of_files, file_size)) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_rm_force(self): + # test removing non-existing subvolume with --force + subvolume = self._generate_random_subvolume_name() + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--force") + except CommandFailedError: + self.fail("expected the 'fs subvolume rm --force' command to succeed") + + def test_subvolume_exists_with_subvolumegroup_and_subvolume(self): + """Test the presence of any subvolume by specifying the name of subvolumegroup""" + + group = self._generate_random_group_name() + subvolume1 = self._generate_random_subvolume_name() + # create subvolumegroup + self._fs_cmd("subvolumegroup", "create", self.volname, group) + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--group_name", group) + ret = self._fs_cmd("subvolume", "exist", self.volname, "--group_name", group) + self.assertEqual(ret.strip('\n'), "subvolume exists") + # delete subvolume in group + self._fs_cmd("subvolume", "rm", self.volname, subvolume1, "--group_name", group) + ret = self._fs_cmd("subvolume", "exist", self.volname, "--group_name", group) + self.assertEqual(ret.strip('\n'), "no subvolume exists") + # delete subvolumegroup + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_exists_with_subvolumegroup_and_no_subvolume(self): + """Test the presence of any subvolume specifying the name + of subvolumegroup and no subvolumes""" + + group = self._generate_random_group_name() + # create subvolumegroup + self._fs_cmd("subvolumegroup", "create", self.volname, group) + ret = self._fs_cmd("subvolume", "exist", self.volname, "--group_name", group) + self.assertEqual(ret.strip('\n'), "no subvolume exists") + # delete subvolumegroup + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_exists_without_subvolumegroup_and_with_subvolume(self): + """Test the presence of any subvolume without specifying the name + of subvolumegroup""" + + subvolume1 = self._generate_random_subvolume_name() + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume1) + ret = self._fs_cmd("subvolume", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "subvolume exists") + # delete subvolume + self._fs_cmd("subvolume", 
"rm", self.volname, subvolume1) + ret = self._fs_cmd("subvolume", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "no subvolume exists") + + def test_subvolume_exists_without_subvolumegroup_and_without_subvolume(self): + """Test the presence of any subvolume without any subvolumegroup + and without any subvolume""" + + ret = self._fs_cmd("subvolume", "exist", self.volname) + self.assertEqual(ret.strip('\n'), "no subvolume exists") + + def test_subvolume_shrink(self): + """ + That a subvolume can be shrinked in size and its quota matches the expected size. + """ + + # create subvolume + subvolname = self._generate_random_subvolume_name() + osize = self.DEFAULT_FILE_SIZE*1024*1024 + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--size", str(osize)) + + # make sure it exists + subvolpath = self._get_subvolume_path(self.volname, subvolname) + self.assertNotEqual(subvolpath, None) + + # shrink the subvolume + nsize = osize // 2 + self._fs_cmd("subvolume", "resize", self.volname, subvolname, str(nsize)) + + # verify the quota + size = int(self.mount_a.getfattr(subvolpath, "ceph.quota.max_bytes")) + self.assertEqual(size, nsize) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolname) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_retain_snapshot_rm_idempotency(self): + """ + ensure subvolume deletion of a subvolume which is already deleted with retain snapshots option passes. + After subvolume deletion with retain snapshots, the subvolume exists until the trash directory (resides inside subvolume) + is cleaned up. The subvolume deletion issued while the trash directory is not empty, should pass and should + not error out with EAGAIN. + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=256) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # remove snapshots (removes retained volume) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume (check idempotency) + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + self.fail(f"expected subvolume rm to pass with error: {os.strerror(ce.exitstatus)}") + + # verify trash dir is clean + self._wait_for_trash_empty() + + + def test_subvolume_user_metadata_set(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + key = "key" + value = "value" + try: + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata set' command to succeed") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. 
+ self._wait_for_trash_empty() + + def test_subvolume_user_metadata_set_idempotence(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + key = "key" + value = "value" + try: + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata set' command to succeed") + + # set same metadata again for subvolume. + try: + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata set' command to succeed because it is idempotent operation") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_get(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + key = "key" + value = "value" + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + + # get value for specified key. + try: + ret = self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata get' command to succeed") + + # remove '\n' from returned value. + ret = ret.strip('\n') + + # match received value with expected value. + self.assertEqual(value, ret) + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_get_for_nonexisting_key(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + key = "key" + value = "value" + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + + # try to get value for nonexisting key + # Expecting ENOENT exit status because key does not exist + try: + self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, "key_nonexist", "--group_name", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because 'key_nonexist' does not exist") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. 
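Nearly every test ends by waiting for the trash to drain. Conceptually the helper called on the next line polls the volume's trash directory until the purge threads have removed all deleted subvolumes; a rough sketch follows, assuming the mgr/volumes trash lives under "volumes/_deleting" on the mounted volume (the path and timeout are assumptions, not taken from this patch):

import os
import time

def wait_for_trash_empty(mntpt, trash_dir="volumes/_deleting", timeout=120):
    # Poll until the trash directory is gone or empty, i.e. purge has caught up.
    deadline = time.time() + timeout
    path = os.path.join(mntpt, trash_dir)
    while time.time() < deadline:
        if not os.path.isdir(path) or not os.listdir(path):
            return
        time.sleep(2)
    raise TimeoutError(f"{path} still has entries after {timeout}s")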
+ self._wait_for_trash_empty() + + def test_subvolume_user_metadata_get_for_nonexisting_section(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # try to get value for nonexisting key (as section does not exist) + # Expecting ENOENT exit status because key does not exist + try: + self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, "key", "--group_name", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because section does not exist") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_update(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + key = "key" + value = "value" + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + + # update metadata against key. + new_value = "new_value" + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, new_value, "--group_name", group) + + # get metadata for specified key of subvolume. + try: + ret = self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata get' command to succeed") + + # remove '\n' from returned value. + ret = ret.strip('\n') + + # match received value with expected value. + self.assertEqual(new_value, ret) + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_list(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + input_metadata_dict = {f'key_{i}' : f'value_{i}' for i in range(3)} + + for k, v in input_metadata_dict.items(): + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, k, v, "--group_name", group) + + # list metadata + try: + ret = self._fs_cmd("subvolume", "metadata", "ls", self.volname, subvolname, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata ls' command to succeed") + + ret_dict = json.loads(ret) + + # compare output with expected output + self.assertDictEqual(input_metadata_dict, ret_dict) + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. 
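The "expect this command to fail with a specific errno" pattern used above (and repeatedly below) could be factored into a small helper. A sketch, assuming CommandFailedError is importable from teuthology.exceptions, the exception class these tests catch:

import errno
from contextlib import contextmanager

from teuthology.exceptions import CommandFailedError

@contextmanager
def expect_errno(testcase, expected_errno):
    # Usage:
    #   with expect_errno(self, errno.ENOENT):
    #       self._fs_cmd("subvolume", "metadata", "get", ...)
    try:
        yield
    except CommandFailedError as e:
        testcase.assertEqual(e.exitstatus, expected_errno)
    else:
        testcase.fail("expected command to fail with %s" % errno.errorcode[expected_errno])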
+ self._wait_for_trash_empty() + + def test_subvolume_user_metadata_list_if_no_metadata_set(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # list metadata + try: + ret = self._fs_cmd("subvolume", "metadata", "ls", self.volname, subvolname, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata ls' command to succeed") + + # remove '\n' from returned value. + ret = ret.strip('\n') + + # compare output with expected output + # expecting empty json/dictionary + self.assertEqual(ret, "{}") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_remove(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + key = "key" + value = "value" + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + + # remove metadata against specified key. + try: + self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, key, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata rm' command to succeed") + + # confirm key is removed by again fetching metadata + try: + self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because key does not exist") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_remove_for_nonexisting_key(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + key = "key" + value = "value" + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + + # try to remove value for nonexisting key + # Expecting ENOENT exit status because key does not exist + try: + self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, "key_nonexist", "--group_name", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because 'key_nonexist' does not exist") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. 
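Outside the test harness, each self._fs_cmd(...) call above maps onto the "ceph fs ..." CLI. A stand-alone sketch of the metadata round-trip these tests exercise, assuming a reachable cluster, an admin keyring, and placeholder names ("cephfs", "sv0", "grp0"):

import json
import subprocess

def fs_cmd(*args):
    # Rough equivalent of self._fs_cmd(...): run "ceph fs <args>" and return stdout.
    return subprocess.check_output(("ceph", "fs") + args, text=True)

fs_cmd("subvolume", "metadata", "set", "cephfs", "sv0", "key", "value", "--group_name", "grp0")
print(fs_cmd("subvolume", "metadata", "get", "cephfs", "sv0", "key", "--group_name", "grp0").strip())
print(json.loads(fs_cmd("subvolume", "metadata", "ls", "cephfs", "sv0", "--group_name", "grp0")))
fs_cmd("subvolume", "metadata", "rm", "cephfs", "sv0", "key", "--group_name", "grp0")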
+ self._wait_for_trash_empty() + + def test_subvolume_user_metadata_remove_for_nonexisting_section(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # try to remove value for nonexisting key (as section does not exist) + # Expecting ENOENT exit status because key does not exist + try: + self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, "key", "--group_name", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because section does not exist") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_remove_force(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + key = "key" + value = "value" + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + + # remove metadata against specified key with --force option. + try: + self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, key, "--group_name", group, "--force") + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata rm' command to succeed") + + # confirm key is removed by again fetching metadata + try: + self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because key does not exist") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_remove_force_for_nonexisting_key(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, "--group_name", group) + + # set metadata for subvolume. + key = "key" + value = "value" + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + + # remove metadata against specified key. + try: + self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, key, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata rm' command to succeed") + + # confirm key is removed by again fetching metadata + try: + self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because key does not exist") + + # again remove metadata against already removed key with --force option. 
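As the re-removal that follows shows, "--force" turns metadata removal into an idempotent cleanup step, so a caller does not need to track whether a key was ever set. A hypothetical wrapper, reusing the placeholder fs_cmd helper sketched earlier:

def rm_metadata(fs_cmd, volume, subvolume, key, group=None):
    # Always pass --force so removing an already-removed (or never-set) key succeeds.
    args = ["subvolume", "metadata", "rm", volume, subvolume, key]
    if group is not None:
        args += ["--group_name", group]
    args.append("--force")
    return fs_cmd(*args)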
+ try: + self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, key, "--group_name", group, "--force") + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata rm' (with --force) command to succeed") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_set_and_get_for_legacy_subvolume(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # emulate a old-fashioned subvolume in a custom group + createpath = os.path.join(".", "volumes", group, subvolname) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath], omit_sudo=False) + + # set metadata for subvolume. + key = "key" + value = "value" + try: + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, key, value, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata set' command to succeed") + + # get value for specified key. + try: + ret = self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, key, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata get' command to succeed") + + # remove '\n' from returned value. + ret = ret.strip('\n') + + # match received value with expected value. + self.assertEqual(value, ret) + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_user_metadata_list_and_remove_for_legacy_subvolume(self): + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # emulate a old-fashioned subvolume in a custom group + createpath = os.path.join(".", "volumes", group, subvolname) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath], omit_sudo=False) + + # set metadata for subvolume. + input_metadata_dict = {f'key_{i}' : f'value_{i}' for i in range(3)} + + for k, v in input_metadata_dict.items(): + self._fs_cmd("subvolume", "metadata", "set", self.volname, subvolname, k, v, "--group_name", group) + + # list metadata + try: + ret = self._fs_cmd("subvolume", "metadata", "ls", self.volname, subvolname, "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata ls' command to succeed") + + ret_dict = json.loads(ret) + + # compare output with expected output + self.assertDictEqual(input_metadata_dict, ret_dict) + + # remove metadata against specified key. + try: + self._fs_cmd("subvolume", "metadata", "rm", self.volname, subvolname, "key_1", "--group_name", group) + except CommandFailedError: + self.fail("expected the 'fs subvolume metadata rm' command to succeed") + + # confirm key is removed by again fetching metadata + try: + self._fs_cmd("subvolume", "metadata", "get", self.volname, subvolname, "key_1", "--group_name", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because key_1 does not exist") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. 
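The two legacy-subvolume tests above emulate a pre-mgr ("old-fashioned") subvolume simply by creating a bare directory under volumes/<group>/ on the mount and then driving the metadata commands against it. The same emulation outside the harness might look like the sketch below; the mount path and all names are placeholders:

import subprocess

# A legacy subvolume is just a plain directory the volumes plugin did not create;
# as the tests above show, "fs subvolume metadata ..." is still expected to accept it.
subprocess.check_call(["sudo", "mkdir", "-p", "/mnt/cephfs/volumes/grp0/legacy_sv"])
subprocess.check_call(["ceph", "fs", "subvolume", "metadata", "set",
                       "cephfs", "legacy_sv", "key_0", "value_0", "--group_name", "grp0"])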
+ self._wait_for_trash_empty() + +class TestSubvolumeGroupSnapshots(TestVolumesHelper): + """Tests for FS subvolume group snapshot operations.""" + @unittest.skip("skipping subvolumegroup snapshot tests") + def test_nonexistent_subvolume_group_snapshot_rm(self): + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # snapshot group + self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot) + + # remove snapshot + self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot) + + # remove snapshot + try: + self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise + else: + raise RuntimeError("expected the 'fs subvolumegroup snapshot rm' command to fail") + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + @unittest.skip("skipping subvolumegroup snapshot tests") + def test_subvolume_group_snapshot_create_and_rm(self): + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # snapshot group + self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot) + + # remove snapshot + self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + @unittest.skip("skipping subvolumegroup snapshot tests") + def test_subvolume_group_snapshot_idempotence(self): + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # snapshot group + self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot) + + # try creating snapshot w/ same snapshot name -- shoule be idempotent + self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot) + + # remove snapshot + self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + @unittest.skip("skipping subvolumegroup snapshot tests") + def test_subvolume_group_snapshot_ls(self): + # tests the 'fs subvolumegroup snapshot ls' command + + snapshots = [] + + # create group + group = 
self._generate_random_group_name() + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolumegroup snapshots + snapshots = self._generate_random_snapshot_name(3) + for snapshot in snapshots: + self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot) + + subvolgrpsnapshotls = json.loads(self._fs_cmd('subvolumegroup', 'snapshot', 'ls', self.volname, group)) + if len(subvolgrpsnapshotls) == 0: + raise RuntimeError("Expected the 'fs subvolumegroup snapshot ls' command to list the created subvolume group snapshots") + else: + snapshotnames = [snapshot['name'] for snapshot in subvolgrpsnapshotls] + if collections.Counter(snapshotnames) != collections.Counter(snapshots): + raise RuntimeError("Error creating or listing subvolume group snapshots") + + @unittest.skip("skipping subvolumegroup snapshot tests") + def test_subvolume_group_snapshot_rm_force(self): + # test removing non-existing subvolume group snapshot with --force + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + # remove snapshot + try: + self._fs_cmd("subvolumegroup", "snapshot", "rm", self.volname, group, snapshot, "--force") + except CommandFailedError: + raise RuntimeError("expected the 'fs subvolumegroup snapshot rm --force' command to succeed") + + def test_subvolume_group_snapshot_unsupported_status(self): + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # snapshot group + try: + self._fs_cmd("subvolumegroup", "snapshot", "create", self.volname, group, snapshot) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOSYS, "invalid error code on subvolumegroup snapshot create") + else: + self.fail("expected subvolumegroup snapshot create command to fail") + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + +class TestSubvolumeSnapshots(TestVolumesHelper): + """Tests for FS subvolume snapshot operations.""" + def test_nonexistent_subvolume_snapshot_rm(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove snapshot again + try: + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOENT: + raise + else: + raise RuntimeError("expected the 'fs subvolume snapshot rm' command to fail") + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_create_and_rm(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + 
self._wait_for_trash_empty() + + def test_subvolume_snapshot_create_idempotence(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # try creating w/ same subvolume snapshot name -- should be idempotent + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_info(self): + + """ + tests the 'fs subvolume snapshot info' command + """ + + snap_md = ["created_at", "data_pool", "has_pending_clones"] + + subvolume = self._generate_random_subvolume_name() + snapshot, snap_missing = self._generate_random_snapshot_name(2) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=1) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot)) + for md in snap_md: + self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md)) + self.assertEqual(snap_info["has_pending_clones"], "no") + + # snapshot info for non-existent snapshot + try: + self._get_subvolume_snapshot_info(self.volname, subvolume, snap_missing) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on snapshot info of non-existent snapshot") + else: + self.fail("expected snapshot info of non-existent snapshot to fail") + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_in_group(self): + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # snapshot subvolume in group + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_snapshot_ls(self): + # tests the 'fs subvolume snapshot ls' command + + snapshots = [] + + # create subvolume + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # create subvolume snapshots + snapshots = self._generate_random_snapshot_name(3) + for snapshot in snapshots: + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + 
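At the file-system level, each snapshot reported by "snapshot ls" is an entry in the subvolume's ".snap" directory, and (as the inherited-snapshot tests below show) snapshots taken on an ancestor directory surface there with a "_<name>_<ancestor inode>" prefix. A sketch of the same listing done directly on a mounted path (the path is a placeholder):

import os

def list_local_snapshots(subvol_path):
    # Entries prefixed with "_" are inherited from ancestor-level snapshots and are
    # not reported by "fs subvolume snapshot ls".
    snapdir = os.path.join(subvol_path, ".snap")
    return sorted(name for name in os.listdir(snapdir) if not name.startswith("_"))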
subvolsnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume)) + if len(subvolsnapshotls) == 0: + self.fail("Expected the 'fs subvolume snapshot ls' command to list the created subvolume snapshots") + else: + snapshotnames = [snapshot['name'] for snapshot in subvolsnapshotls] + if collections.Counter(snapshotnames) != collections.Counter(snapshots): + self.fail("Error creating or listing subvolume snapshots") + + # remove snapshot + for snapshot in snapshots: + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_inherited_snapshot_ls(self): + # tests the scenario where 'fs subvolume snapshot ls' command + # should not list inherited snapshots created as part of snapshot + # at ancestral level + + snapshots = [] + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snap_count = 3 + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # create subvolume snapshots + snapshots = self._generate_random_snapshot_name(snap_count) + for snapshot in snapshots: + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group) + + # Create snapshot at ancestral level + ancestral_snappath1 = os.path.join(".", "volumes", group, ".snap", "ancestral_snap_1") + ancestral_snappath2 = os.path.join(".", "volumes", group, ".snap", "ancestral_snap_2") + self.mount_a.run_shell(['sudo', 'mkdir', '-p', ancestral_snappath1, ancestral_snappath2], omit_sudo=False) + + subvolsnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume, group)) + self.assertEqual(len(subvolsnapshotls), snap_count) + + # remove ancestral snapshots + self.mount_a.run_shell(['sudo', 'rmdir', ancestral_snappath1, ancestral_snappath2], omit_sudo=False) + + # remove snapshot + for snapshot in snapshots: + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_inherited_snapshot_info(self): + """ + tests the scenario where 'fs subvolume snapshot info' command + should fail for inherited snapshots created as part of snapshot + at ancestral level + """ + + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # Create snapshot at ancestral level + ancestral_snap_name = "ancestral_snap_1" + ancestral_snappath1 = os.path.join(".", "volumes", group, ".snap", ancestral_snap_name) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', ancestral_snappath1], omit_sudo=False) + + # Validate existence of inherited snapshot + group_path = os.path.join(".", "volumes", group) + inode_number_group_dir = int(self.mount_a.run_shell(['stat', '-c' '%i', group_path]).stdout.getvalue().strip()) + inherited_snap = "_{0}_{1}".format(ancestral_snap_name, 
inode_number_group_dir) + inherited_snappath = os.path.join(".", "volumes", group, subvolume,".snap", inherited_snap) + self.mount_a.run_shell(['ls', inherited_snappath]) + + # snapshot info on inherited snapshot + try: + self._get_subvolume_snapshot_info(self.volname, subvolume, inherited_snap, group) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on snapshot info of inherited snapshot") + else: + self.fail("expected snapshot info of inherited snapshot to fail") + + # remove ancestral snapshots + self.mount_a.run_shell(['sudo', 'rmdir', ancestral_snappath1], omit_sudo=False) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--group_name", group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_inherited_snapshot_rm(self): + """ + tests the scenario where the 'fs subvolume snapshot rm' command + should fail for inherited snapshots created as part of a snapshot + at the ancestral level + """ + + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # Create snapshot at ancestral level + ancestral_snap_name = "ancestral_snap_1" + ancestral_snappath1 = os.path.join(".", "volumes", group, ".snap", ancestral_snap_name) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', ancestral_snappath1], omit_sudo=False) + + # Validate existence of inherited snap + group_path = os.path.join(".", "volumes", group) + inode_number_group_dir = int(self.mount_a.run_shell(['stat', '-c' '%i', group_path]).stdout.getvalue().strip()) + inherited_snap = "_{0}_{1}".format(ancestral_snap_name, inode_number_group_dir) + inherited_snappath = os.path.join(".", "volumes", group, subvolume,".snap", inherited_snap) + self.mount_a.run_shell(['ls', inherited_snappath]) + + # inherited snapshot should not be deletable + try: + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, inherited_snap, "--group_name", group) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, msg="invalid error code when removing inherited snapshot") + else: + self.fail("expected removing inherited snapshot to fail") + + # remove ancestral snapshots + self.mount_a.run_shell(['sudo', 'rmdir', ancestral_snappath1], omit_sudo=False) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_subvolumegroup_snapshot_name_conflict(self): + """ + tests that creating a subvolume snapshot with the same name + as its subvolumegroup snapshot should + fail. 
+ """ + + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + group_snapshot = self._generate_random_snapshot_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--group_name", group) + + # Create subvolumegroup snapshot + group_snapshot_path = os.path.join(".", "volumes", group, ".snap", group_snapshot) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', group_snapshot_path], omit_sudo=False) + + # Validate existence of subvolumegroup snapshot + self.mount_a.run_shell(['ls', group_snapshot_path]) + + # Creation of subvolume snapshot with it's subvolumegroup snapshot name should fail + try: + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, group_snapshot, "--group_name", group) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, msg="invalid error code when creating subvolume snapshot with same name as subvolume group snapshot") + else: + self.fail("expected subvolume snapshot creation with same name as subvolumegroup snapshot to fail") + + # remove subvolumegroup snapshot + self.mount_a.run_shell(['sudo', 'rmdir', group_snapshot_path], omit_sudo=False) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_retain_snapshot_invalid_recreate(self): + """ + ensure retained subvolume recreate does not leave any incarnations in the subvolume and trash + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # recreate subvolume with an invalid pool + data_pool = "invalid_pool" + try: + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--pool_layout", data_pool) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EINVAL, "invalid error code on recreate of subvolume with invalid poolname") + else: + self.fail("expected recreate of subvolume with invalid poolname to fail") + + # fetch info + subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume)) + self.assertEqual(subvol_info["state"], "snapshot-retained", + msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"])) + + # getpath + try: + self._fs_cmd("subvolume", "getpath", self.volname, subvolume) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on getpath of subvolume with retained snapshots") + else: + self.fail("expected getpath of subvolume with retained snapshots to fail") + + # remove snapshot (should remove volume) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_retain_snapshot_recreate_subvolume(self): + """ + ensure a retained subvolume can be recreated and further snapshotted + """ + snap_md = ["created_at", "data_pool", "has_pending_clones"] + + subvolume = 
self._generate_random_subvolume_name() + snapshot1, snapshot2 = self._generate_random_snapshot_name(2) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot1) + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # fetch info + subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume)) + self.assertEqual(subvol_info["state"], "snapshot-retained", + msg="expected state to be 'snapshot-retained', found '{0}'".format(subvol_info["state"])) + + # recreate retained subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # fetch info + subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume)) + self.assertEqual(subvol_info["state"], "complete", + msg="expected state to be 'complete', found '{0}'".format(subvol_info["state"])) + + # snapshot info (older snapshot) + snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot1)) + for md in snap_md: + self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md)) + self.assertEqual(snap_info["has_pending_clones"], "no") + + # snap-create (new snapshot) + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot2) + + # remove with retain snapshots + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # list snapshots + subvolsnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume)) + self.assertEqual(len(subvolsnapshotls), 2, "Expected the 'fs subvolume snapshot ls' command to list the" + " created subvolume snapshots") + snapshotnames = [snapshot['name'] for snapshot in subvolsnapshotls] + for snap in [snapshot1, snapshot2]: + self.assertIn(snap, snapshotnames, "Missing snapshot '{0}' in snapshot list".format(snap)) + + # remove snapshots (should remove volume) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot1) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot2) + + # verify list subvolumes returns an empty list + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumels), 0) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_retain_snapshot_with_snapshots(self): + """ + ensure retain-snapshots based delete of a subvolume with snapshots retains the subvolume + also test allowed and disallowed operations on a retained subvolume + """ + snap_md = ["created_at", "data_pool", "has_pending_clones"] + + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove subvolume -- should fail with ENOTEMPTY since it has snapshots + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on rm of retained subvolume with snapshots") + else: + self.fail("expected rm of subvolume with retained snapshots to fail") + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + 
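After "subvolume rm --retain-snapshots" the subvolume body is gone but its snapshots remain, and "subvolume info" reports the state "snapshot-retained"; in that state only listing, snapshot info/rm and recreation are expected to work, as the assertions below exercise. A stand-alone sketch of the state check, using the placeholder names from the earlier sketches:

import json
import subprocess

info = json.loads(subprocess.check_output(
    ("ceph", "fs", "subvolume", "info", "cephfs", "sv0"), text=True))
# A usable subvolume reports "complete"; one removed with --retain-snapshots while
# snapshots are still present reports "snapshot-retained".
assert info["state"] in ("complete", "snapshot-retained")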
# fetch info + subvol_info = json.loads(self._fs_cmd("subvolume", "info", self.volname, subvolume)) + self.assertEqual(subvol_info["state"], "snapshot-retained", + msg="expected state to be 'snapshot-retained', found '{0}".format(subvol_info["state"])) + + ## test allowed ops in retained state + # ls + subvolumes = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumes), 1, "subvolume ls count mismatch, expected '1', found {0}".format(len(subvolumes))) + self.assertEqual(subvolumes[0]['name'], subvolume, + "subvolume name mismatch in ls output, expected '{0}', found '{1}'".format(subvolume, subvolumes[0]['name'])) + + # snapshot info + snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot)) + for md in snap_md: + self.assertIn(md, snap_info, "'{0}' key not present in metadata of snapshot".format(md)) + self.assertEqual(snap_info["has_pending_clones"], "no") + + # rm --force (allowed but should fail) + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--force") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on rm of subvolume with retained snapshots") + else: + self.fail("expected rm of subvolume with retained snapshots to fail") + + # rm (allowed but should fail) + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOTEMPTY, "invalid error code on rm of subvolume with retained snapshots") + else: + self.fail("expected rm of subvolume with retained snapshots to fail") + + ## test disallowed ops + # getpath + try: + self._fs_cmd("subvolume", "getpath", self.volname, subvolume) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on getpath of subvolume with retained snapshots") + else: + self.fail("expected getpath of subvolume with retained snapshots to fail") + + # resize + nsize = self.DEFAULT_FILE_SIZE*1024*1024 + try: + self._fs_cmd("subvolume", "resize", self.volname, subvolume, str(nsize)) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on resize of subvolume with retained snapshots") + else: + self.fail("expected resize of subvolume with retained snapshots to fail") + + # snap-create + try: + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, "fail") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on snapshot create of subvolume with retained snapshots") + else: + self.fail("expected snapshot create of subvolume with retained snapshots to fail") + + # remove snapshot (should remove volume) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # verify list subvolumes returns an empty list + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumels), 0) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_retain_snapshot_without_snapshots(self): + """ + ensure retain snapshots based delete of a subvolume with no snapshots, deletes the subbvolume + """ + subvolume = self._generate_random_subvolume_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # remove with snapshot retention (should remove volume, no snapshots to retain) + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # verify 
list subvolumes returns an empty list + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumels), 0) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_retain_snapshot_trash_busy_recreate(self): + """ + ensure retained subvolume recreate fails if its trash is not yet purged + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # fake a trash entry + self._update_fake_trash(subvolume) + + # recreate subvolume + try: + self._fs_cmd("subvolume", "create", self.volname, subvolume) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EAGAIN, "invalid error code on recreate of subvolume with purge pending") + else: + self.fail("expected recreate of subvolume with purge pending to fail") + + # clear fake trash entry + self._update_fake_trash(subvolume, create=False) + + # recreate subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_rm_with_snapshots(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove subvolume -- should fail with ENOTEMPTY since it has snapshots + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOTEMPTY: + raise RuntimeError("invalid error code returned when deleting subvolume with snapshots") + else: + raise RuntimeError("expected subvolume deletion to fail") + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_protect_unprotect_sanity(self): + """ + Snapshot protect/unprotect commands are deprecated. This test exists to ensure that + invoking the command does not cause errors, till they are removed from a subsequent release. 
+ """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=64) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # now, protect snapshot + self._fs_cmd("subvolume", "snapshot", "protect", self.volname, subvolume, snapshot) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # now, unprotect snapshot + self._fs_cmd("subvolume", "snapshot", "unprotect", self.volname, subvolume, snapshot) + + # verify clone + self._verify_clone(subvolume, snapshot, clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_rm_force(self): + # test removing non existing subvolume snapshot with --force + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # remove snapshot + try: + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, "--force") + except CommandFailedError: + raise RuntimeError("expected the 'fs subvolume snapshot rm --force' command to succeed") + + def test_subvolume_snapshot_metadata_set(self): + """ + Set custom metadata for subvolume snapshot. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. + key = "key" + value = "value" + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata set' command to succeed") + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_set_idempotence(self): + """ + Set custom metadata for subvolume snapshot (Idempotency). + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. 
+ key = "key" + value = "value" + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata set' command to succeed") + + # set same metadata again for subvolume. + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata set' command to succeed because it is idempotent operation") + + # get value for specified key. + try: + ret = self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata get' command to succeed") + + # remove '\n' from returned value. + ret = ret.strip('\n') + + # match received value with expected value. + self.assertEqual(value, ret) + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_get(self): + """ + Get custom metadata for a specified key in subvolume snapshot metadata. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. + key = "key" + value = "value" + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + + # get value for specified key. + try: + ret = self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata get' command to succeed") + + # remove '\n' from returned value. + ret = ret.strip('\n') + + # match received value with expected value. + self.assertEqual(value, ret) + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_get_for_nonexisting_key(self): + """ + Get custom metadata for subvolume snapshot if specified key not exist in metadata. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. 
+ key = "key" + value = "value" + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + + # try to get value for nonexisting key + # Expecting ENOENT exit status because key does not exist + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, "key_nonexist", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because 'key_nonexist' does not exist") + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_get_for_nonexisting_section(self): + """ + Get custom metadata for subvolume snapshot if metadata is not added for subvolume snapshot. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # try to get value for nonexisting key (as section does not exist) + # Expecting ENOENT exit status because key does not exist + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, "key", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because section does not exist") + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_update(self): + """ + Update custom metadata for a specified key in subvolume snapshot metadata. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. + key = "key" + value = "value" + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + + # update metadata against key. + new_value = "new_value" + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, new_value, group) + + # get metadata for specified key of snapshot. + try: + ret = self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata get' command to succeed") + + # remove '\n' from returned value. + ret = ret.strip('\n') + + # match received value with expected value. 
+ self.assertEqual(new_value, ret) + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_list(self): + """ + List custom metadata for subvolume snapshot. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for subvolume. + input_metadata_dict = {f'key_{i}' : f'value_{i}' for i in range(3)} + + for k, v in input_metadata_dict.items(): + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, k, v, group) + + # list metadata + try: + ret_dict = json.loads(self._fs_cmd("subvolume", "snapshot", "metadata", "ls", self.volname, subvolname, snapshot, group)) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata ls' command to succeed") + + # compare output with expected output + self.assertDictEqual(input_metadata_dict, ret_dict) + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_list_if_no_metadata_set(self): + """ + List custom metadata for subvolume snapshot if metadata is not added for subvolume snapshot. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # list metadata + try: + ret_dict = json.loads(self._fs_cmd("subvolume", "snapshot", "metadata", "ls", self.volname, subvolname, snapshot, group)) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata ls' command to succeed") + + # compare output with expected output + empty_dict = {} + self.assertDictEqual(ret_dict, empty_dict) + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_remove(self): + """ + Remove custom metadata for a specified key in subvolume snapshot metadata. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. 
+ self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. + key = "key" + value = "value" + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + + # remove metadata against specified key. + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, key, group) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata rm' command to succeed") + + # confirm key is removed by again fetching metadata + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, key, snapshot, group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because key does not exist") + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_remove_for_nonexisting_key(self): + """ + Remove custom metadata for subvolume snapshot if specified key not exist in metadata. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. + key = "key" + value = "value" + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + + # try to remove value for nonexisting key + # Expecting ENOENT exit status because key does not exist + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, "key_nonexist", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because 'key_nonexist' does not exist") + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_remove_for_nonexisting_section(self): + """ + Remove custom metadata for subvolume snapshot if metadata is not added for subvolume snapshot. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. 
+ self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # try to remove value for nonexisting key (as section does not exist) + # Expecting ENOENT exit status because key does not exist + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, "key", group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because section does not exist") + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_remove_force(self): + """ + Forcefully remove custom metadata for a specified key in subvolume snapshot metadata. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. + key = "key" + value = "value" + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + + # remove metadata against specified key with --force option. + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, key, group, "--force") + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata rm' command to succeed") + + # confirm key is removed by again fetching metadata + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because key does not exist") + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_remove_force_for_nonexisting_key(self): + """ + Forcefully remove custom metadata for subvolume snapshot if specified key not exist in metadata. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. + key = "key" + value = "value" + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + + # remove metadata against specified key. 
+ try: + self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, key, group) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata rm' command to succeed") + + # confirm key is removed by again fetching metadata + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group) + except CommandFailedError as e: + self.assertEqual(e.exitstatus, errno.ENOENT) + else: + self.fail("Expected ENOENT because key does not exist") + + # again remove metadata against already removed key with --force option. + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "rm", self.volname, subvolname, snapshot, key, group, "--force") + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata rm' (with --force) command to succeed") + + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. + self._wait_for_trash_empty() + + def test_subvolume_snapshot_metadata_after_snapshot_remove(self): + """ + Verify metadata removal of subvolume snapshot after snapshot removal. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. + key = "key" + value = "value" + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + + # get value for specified key. + ret = self._fs_cmd("subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group) + + # remove '\n' from returned value. + ret = ret.strip('\n') + + # match received value with expected value. + self.assertEqual(value, ret) + + # remove subvolume snapshot. + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + + # try to get metadata after removing snapshot. + # Expecting error ENOENT with error message of snapshot does not exist + cmd_ret = self.mgr_cluster.mon_manager.run_cluster_cmd( + args=["fs", "subvolume", "snapshot", "metadata", "get", self.volname, subvolname, snapshot, key, group], + check_status=False, stdout=StringIO(), stderr=StringIO()) + self.assertEqual(cmd_ret.returncode, errno.ENOENT, "Expecting ENOENT error") + self.assertIn(f"snapshot '{snapshot}' does not exist", cmd_ret.stderr.getvalue(), + f"Expecting message: snapshot '{snapshot}' does not exist ") + + # confirm metadata is removed by searching section name in .meta file + meta_path = os.path.join(".", "volumes", group, subvolname, ".meta") + section_name = "SNAP_METADATA_" + snapshot + + try: + self.mount_a.run_shell(f"sudo grep {section_name} {meta_path}", omit_sudo=False) + except CommandFailedError as e: + self.assertNotEqual(e.exitstatus, 0) + else: + self.fail("Expected non-zero exist status because section should not exist") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. 
+ self._wait_for_trash_empty() + + def test_clean_stale_subvolume_snapshot_metadata(self): + """ + Validate cleaning of stale subvolume snapshot metadata. + """ + subvolname = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + + # create group. + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume in group. + self._fs_cmd("subvolume", "create", self.volname, subvolname, group) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolname, snapshot, group) + + # set metadata for snapshot. + key = "key" + value = "value" + try: + self._fs_cmd("subvolume", "snapshot", "metadata", "set", self.volname, subvolname, snapshot, key, value, group) + except CommandFailedError: + self.fail("expected the 'fs subvolume snapshot metadata set' command to succeed") + + # save the subvolume config file. + meta_path = os.path.join(".", "volumes", group, subvolname, ".meta") + tmp_meta_path = os.path.join(".", "volumes", group, subvolname, ".meta.stale_snap_section") + self.mount_a.run_shell(['sudo', 'cp', '-p', meta_path, tmp_meta_path], omit_sudo=False) + + # Delete snapshot, this would remove user snap metadata + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolname, snapshot, group) + + # Copy back saved subvolume config file. This would have stale snapshot metadata + self.mount_a.run_shell(['sudo', 'cp', '-p', tmp_meta_path, meta_path], omit_sudo=False) + + # Verify that it has stale snapshot metadata + section_name = "SNAP_METADATA_" + snapshot + try: + self.mount_a.run_shell(f"sudo grep {section_name} {meta_path}", omit_sudo=False) + except CommandFailedError: + self.fail("Expected grep cmd to succeed because stale snapshot metadata exist") + + # Do any subvolume operation to clean the stale snapshot metadata + _ = json.loads(self._get_subvolume_info(self.volname, subvolname, group)) + + # Verify that the stale snapshot metadata is cleaned + try: + self.mount_a.run_shell(f"sudo grep {section_name} {meta_path}", omit_sudo=False) + except CommandFailedError as e: + self.assertNotEqual(e.exitstatus, 0) + else: + self.fail("Expected non-zero exist status because stale snapshot metadata should not exist") + + self._fs_cmd("subvolume", "rm", self.volname, subvolname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean. 
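+        # (removed subvolumes and clones are moved to an internal trash directory
+        # and purged asynchronously by the volumes plugin; the helper below waits
+        # until that purge has finished)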
+ self._wait_for_trash_empty() + # Clean tmp config file + self.mount_a.run_shell(['sudo', 'rm', '-f', tmp_meta_path], omit_sudo=False) + + +class TestSubvolumeSnapshotClones(TestVolumesHelper): + """ Tests for FS subvolume snapshot clone operations.""" + def test_clone_subvolume_info(self): + # tests the 'fs subvolume info' command for a clone + subvol_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime", + "data_pool", "gid", "mode", "mon_addrs", "mtime", "path", "pool_namespace", + "type", "uid"] + + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=1) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + subvol_info = json.loads(self._get_subvolume_info(self.volname, clone)) + if len(subvol_info) == 0: + raise RuntimeError("Expected the 'fs subvolume info' command to list metadata of subvolume") + for md in subvol_md: + if md not in subvol_info.keys(): + raise RuntimeError("%s not present in the metadata of subvolume" % md) + if subvol_info["type"] != "clone": + raise RuntimeError("type should be set to clone") + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_info_without_snapshot_clone(self): + """ + Verify subvolume snapshot info output without cloning snapshot. + If no clone is performed then path /volumes/_index/clone/{track_id} + will not exist. + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume. + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # list snapshot info + result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot)) + + # verify snapshot info + self.assertEqual(result['has_pending_clones'], "no") + self.assertFalse('orphan_clones_count' in result) + self.assertFalse('pending_clones' in result) + + # remove snapshot, subvolume, clone + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_info_if_no_clone_pending(self): + """ + Verify subvolume snapshot info output if no clone is in pending state. + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone_list = [f'clone_{i}' for i in range(3)] + + # create subvolume. 
+ self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # schedule a clones + for clone in clone_list: + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clones status + for clone in clone_list: + self._wait_for_clone_to_complete(clone) + + # list snapshot info + result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot)) + + # verify snapshot info + self.assertEqual(result['has_pending_clones'], "no") + self.assertFalse('orphan_clones_count' in result) + self.assertFalse('pending_clones' in result) + + # remove snapshot, subvolume, clone + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + for clone in clone_list: + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_info_if_clone_pending_for_no_group(self): + """ + Verify subvolume snapshot info output if clones are in pending state. + Clones are not specified for particular target_group. Hence target_group + should not be in the output as we don't show _nogroup (default group) + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone_list = [f'clone_{i}' for i in range(3)] + + # create subvolume. + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5) + + # schedule a clones + for clone in clone_list: + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # list snapshot info + result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot)) + + # verify snapshot info + expected_clone_list = [] + for clone in clone_list: + expected_clone_list.append({"name": clone}) + self.assertEqual(result['has_pending_clones'], "yes") + self.assertFalse('orphan_clones_count' in result) + self.assertListEqual(result['pending_clones'], expected_clone_list) + self.assertEqual(len(result['pending_clones']), 3) + + # check clones status + for clone in clone_list: + self._wait_for_clone_to_complete(clone) + + # remove snapshot, subvolume, clone + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + for clone in clone_list: + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_info_if_clone_pending_for_target_group(self): + """ + Verify subvolume snapshot info output if clones are in pending state. + Clones are not specified for target_group. 
+ """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + group = self._generate_random_group_name() + target_group = self._generate_random_group_name() + + # create groups + self._fs_cmd("subvolumegroup", "create", self.volname, group) + self._fs_cmd("subvolumegroup", "create", self.volname, target_group) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, group, "--mode=777") + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group) + + # insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, + "--group_name", group, "--target_group_name", target_group) + + # list snapshot info + result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot, "--group_name", group)) + + # verify snapshot info + expected_clone_list = [{"name": clone, "target_group": target_group}] + self.assertEqual(result['has_pending_clones'], "yes") + self.assertFalse('orphan_clones_count' in result) + self.assertListEqual(result['pending_clones'], expected_clone_list) + self.assertEqual(len(result['pending_clones']), 1) + + # check clone status + self._wait_for_clone_to_complete(clone, clone_group=target_group) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + self._fs_cmd("subvolume", "rm", self.volname, clone, target_group) + + # remove groups + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, target_group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_info_if_orphan_clone(self): + """ + Verify subvolume snapshot info output if orphan clones exists. + Orphan clones should not list under pending clones. + orphan_clones_count should display correct count of orphan clones' + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone_list = [f'clone_{i}' for i in range(3)] + + # create subvolume. 
+ self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 15) + + # schedule a clones + for clone in clone_list: + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # remove track file for third clone to make it orphan + meta_path = os.path.join(".", "volumes", "_nogroup", subvolume, ".meta") + pending_clones_result = self.mount_a.run_shell(['sudo', 'grep', 'clone snaps', '-A3', meta_path], omit_sudo=False, stdout=StringIO(), stderr=StringIO()) + third_clone_track_id = pending_clones_result.stdout.getvalue().splitlines()[3].split(" = ")[0] + third_clone_track_path = os.path.join(".", "volumes", "_index", "clone", third_clone_track_id) + self.mount_a.run_shell(f"sudo rm -f {third_clone_track_path}", omit_sudo=False) + + # list snapshot info + result = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot)) + + # verify snapshot info + expected_clone_list = [] + for i in range(len(clone_list)-1): + expected_clone_list.append({"name": clone_list[i]}) + self.assertEqual(result['has_pending_clones'], "yes") + self.assertEqual(result['orphan_clones_count'], 1) + self.assertListEqual(result['pending_clones'], expected_clone_list) + self.assertEqual(len(result['pending_clones']), 2) + + # check clones status + for i in range(len(clone_list)-1): + self._wait_for_clone_to_complete(clone_list[i]) + + # list snapshot info after cloning completion + res = json.loads(self._fs_cmd("subvolume", "snapshot", "info", self.volname, subvolume, snapshot)) + + # verify snapshot info (has_pending_clones should be no) + self.assertEqual(res['has_pending_clones'], "no") + + def test_non_clone_status(self): + subvolume = self._generate_random_subvolume_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + try: + self._fs_cmd("clone", "status", self.volname, subvolume) + except CommandFailedError as ce: + if ce.exitstatus != errno.ENOTSUP: + raise RuntimeError("invalid error code when fetching status of a non cloned subvolume") + else: + raise RuntimeError("expected fetching of clone status of a subvolume to fail") + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_clone_inherit_snapshot_namespace_and_size(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + osize = self.DEFAULT_FILE_SIZE*1024*1024*12 + + # create subvolume, in an isolated namespace with a specified size + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--namespace-isolated", "--size", str(osize), "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=8) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # create a pool different from current subvolume pool + subvol_path = self._get_subvolume_path(self.volname, subvolume) + default_pool = self.mount_a.getfattr(subvol_path, "ceph.dir.layout.pool") + new_pool = "new_pool" + self.assertNotEqual(default_pool, new_pool) + self.fs.add_data_pool(new_pool) + + # update source subvolume pool + 
self._do_subvolume_pool_and_namespace_update(subvolume, pool=new_pool, pool_namespace="") + + # schedule a clone, with NO --pool specification + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot, clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_clone_inherit_quota_attrs(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + osize = self.DEFAULT_FILE_SIZE*1024*1024*12 + + # create subvolume with a specified size + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777", "--size", str(osize)) + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=8) + + # get subvolume path + subvolpath = self._get_subvolume_path(self.volname, subvolume) + + # set quota on number of files + self.mount_a.setfattr(subvolpath, 'ceph.quota.max_files', "20", sudo=True) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot, clone) + + # get subvolume path + clonepath = self._get_subvolume_path(self.volname, clone) + + # verify quota max_files is inherited from source snapshot + subvol_quota = self.mount_a.getfattr(subvolpath, "ceph.quota.max_files") + clone_quota = self.mount_a.getfattr(clonepath, "ceph.quota.max_files") + self.assertEqual(subvol_quota, clone_quota) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_clone_in_progress_getpath(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=64) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # clone should not be accessible right now + try: + self._get_subvolume_path(self.volname, clone) + except CommandFailedError as ce: + if ce.exitstatus != errno.EAGAIN: + raise RuntimeError("invalid error code when fetching path of an pending clone") + else: + raise RuntimeError("expected fetching path of an pending clone to fail") + + # check clone status + self._wait_for_clone_to_complete(clone) + + # clone should be accessible now + subvolpath = 
self._get_subvolume_path(self.volname, clone) + self.assertNotEqual(subvolpath, None) + + # verify clone + self._verify_clone(subvolume, snapshot, clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_clone_in_progress_snapshot_rm(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=64) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # snapshot should not be deletable now + try: + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EAGAIN, msg="invalid error code when removing source snapshot of a clone") + else: + self.fail("expected removing source snapshot of a clone to fail") + + # check clone status + self._wait_for_clone_to_complete(clone) + + # clone should be accessible now + subvolpath = self._get_subvolume_path(self.volname, clone) + self.assertNotEqual(subvolpath, None) + + # verify clone + self._verify_clone(subvolume, snapshot, clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_clone_in_progress_source(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=64) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # verify clone source + result = json.loads(self._fs_cmd("clone", "status", self.volname, clone)) + source = result['status']['source'] + self.assertEqual(source['volume'], self.volname) + self.assertEqual(source['subvolume'], subvolume) + self.assertEqual(source.get('group', None), None) + self.assertEqual(source['snapshot'], snapshot) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # clone should be accessible now + subvolpath = self._get_subvolume_path(self.volname, clone) + self.assertNotEqual(subvolpath, None) + + # verify clone + self._verify_clone(subvolume, snapshot, clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + 
+ # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_clone_retain_snapshot_with_snapshots(self): + """ + retain snapshots of a cloned subvolume and check disallowed operations + """ + subvolume = self._generate_random_subvolume_name() + snapshot1, snapshot2 = self._generate_random_snapshot_name(2) + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # store path for clone verification + subvol1_path = self._get_subvolume_path(self.volname, subvolume) + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=16) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot1) + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # clone retained subvolume snapshot + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot1, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot1, clone, subvol_path=subvol1_path) + + # create a snapshot on the clone + self._fs_cmd("subvolume", "snapshot", "create", self.volname, clone, snapshot2) + + # retain a clone + self._fs_cmd("subvolume", "rm", self.volname, clone, "--retain-snapshots") + + # list snapshots + clonesnapshotls = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, clone)) + self.assertEqual(len(clonesnapshotls), 1, "Expected the 'fs subvolume snapshot ls' command to list the" + " created subvolume snapshots") + snapshotnames = [snapshot['name'] for snapshot in clonesnapshotls] + for snap in [snapshot2]: + self.assertIn(snap, snapshotnames, "Missing snapshot '{0}' in snapshot list".format(snap)) + + ## check disallowed operations on retained clone + # clone-status + try: + self._fs_cmd("clone", "status", self.volname, clone) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on clone status of clone with retained snapshots") + else: + self.fail("expected clone status of clone with retained snapshots to fail") + + # clone-cancel + try: + self._fs_cmd("clone", "cancel", self.volname, clone) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT, "invalid error code on clone cancel of clone with retained snapshots") + else: + self.fail("expected clone cancel of clone with retained snapshots to fail") + + # remove snapshots (removes subvolumes as all are in retained state) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot1) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, clone, snapshot2) + + # verify list subvolumes returns an empty list + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumels), 0) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_retain_snapshot_clone(self): + """ + clone a snapshot from a snapshot retained subvolume + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # store path for clone verification + subvol_path 
= self._get_subvolume_path(self.volname, subvolume) + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=16) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # clone retained subvolume snapshot + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot, clone, subvol_path=subvol_path) + + # remove snapshots (removes retained volume) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify list subvolumes returns an empty list + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumels), 0) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_retain_snapshot_clone_from_newer_snapshot(self): + """ + clone a subvolume from recreated subvolume's latest snapshot + """ + subvolume = self._generate_random_subvolume_name() + snapshot1, snapshot2 = self._generate_random_snapshot_name(2) + clone = self._generate_random_clone_name(1) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=16) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot1) + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # recreate subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # get and store path for clone verification + subvol2_path = self._get_subvolume_path(self.volname, subvolume) + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=16) + + # snapshot newer subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot2) + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # clone retained subvolume's newer snapshot + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot2, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot2, clone, subvol_path=subvol2_path) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot1) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot2) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify list subvolumes returns an empty list + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumels), 0) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_retain_snapshot_recreate(self): + """ + recreate a subvolume from one of its retained snapshots + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # store path for clone verification + subvol_path = self._get_subvolume_path(self.volname, subvolume) + + # do 
some IO + self._do_subvolume_io(subvolume, number_of_files=16) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # remove with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, subvolume, "--retain-snapshots") + + # recreate retained subvolume using its own snapshot to clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, subvolume) + + # check clone status + self._wait_for_clone_to_complete(subvolume) + + # verify clone + self._verify_clone(subvolume, snapshot, subvolume, subvol_path=subvol_path) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify list subvolumes returns an empty list + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumels), 0) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_retain_snapshot_trash_busy_recreate_clone(self): + """ + ensure retained clone recreate fails if its trash is not yet purged + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # clone subvolume snapshot + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # snapshot clone + self._fs_cmd("subvolume", "snapshot", "create", self.volname, clone, snapshot) + + # remove clone with snapshot retention + self._fs_cmd("subvolume", "rm", self.volname, clone, "--retain-snapshots") + + # fake a trash entry + self._update_fake_trash(clone) + + # clone subvolume snapshot (recreate) + try: + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EAGAIN, "invalid error code on recreate of clone with purge pending") + else: + self.fail("expected recreate of clone with purge pending to fail") + + # clear fake trash entry + self._update_fake_trash(clone, create=False) + + # recreate subvolume + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, clone, snapshot) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_attr_clone(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io_mixed(subvolume) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, 
subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot, clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_clone_failure_status_pending_in_progress_complete(self): + """ + ensure failure status is not shown when clone is not in failed/cancelled state + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone1 = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=200) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5) + + # schedule a clone1 + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1) + + # pending clone shouldn't show failure status + clone1_result = self._get_clone_status(clone1) + try: + clone1_result["status"]["failure"]["errno"] + except KeyError as e: + self.assertEqual(str(e), "'failure'") + else: + self.fail("clone status shouldn't show failure for pending clone") + + # check clone1 to be in-progress + self._wait_for_clone_to_be_in_progress(clone1) + + # in-progress clone1 shouldn't show failure status + clone1_result = self._get_clone_status(clone1) + try: + clone1_result["status"]["failure"]["errno"] + except KeyError as e: + self.assertEqual(str(e), "'failure'") + else: + self.fail("clone status shouldn't show failure for in-progress clone") + + # wait for clone1 to complete + self._wait_for_clone_to_complete(clone1) + + # complete clone1 shouldn't show failure status + clone1_result = self._get_clone_status(clone1) + try: + clone1_result["status"]["failure"]["errno"] + except KeyError as e: + self.assertEqual(str(e), "'failure'") + else: + self.fail("clone status shouldn't show failure for complete clone") + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone1) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_clone_failure_status_failed(self): + """ + ensure failure status is shown when clone is in failed state and validate the reason + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone1 = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=200) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5) + + # schedule a clone1 + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1) + + # remove snapshot from backend to force the clone failure. 
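+        # (the snapshot directory is removed with rmdir directly on the backend
+        # ".snap" path, bypassing the volumes interface, so the pending clone
+        # loses its source and is expected to end up in the 'failed' state)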
+ snappath = os.path.join(".", "volumes", "_nogroup", subvolume, ".snap", snapshot) + self.mount_a.run_shell(['sudo', 'rmdir', snappath], omit_sudo=False) + + # wait for clone1 to fail. + self._wait_for_clone_to_fail(clone1) + + # check clone1 status + clone1_result = self._get_clone_status(clone1) + self.assertEqual(clone1_result["status"]["state"], "failed") + self.assertEqual(clone1_result["status"]["failure"]["errno"], "2") + self.assertEqual(clone1_result["status"]["failure"]["error_msg"], "snapshot '{0}' does not exist".format(snapshot)) + + # clone removal should succeed after failure, remove clone1 + self._fs_cmd("subvolume", "rm", self.volname, clone1, "--force") + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_clone_failure_status_pending_cancelled(self): + """ + ensure failure status is shown when clone is cancelled during pending state and validate the reason + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone1 = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=200) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5) + + # schedule a clone1 + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1) + + # cancel pending clone1 + self._fs_cmd("clone", "cancel", self.volname, clone1) + + # check clone1 status + clone1_result = self._get_clone_status(clone1) + self.assertEqual(clone1_result["status"]["state"], "canceled") + self.assertEqual(clone1_result["status"]["failure"]["errno"], "4") + self.assertEqual(clone1_result["status"]["failure"]["error_msg"], "user interrupted clone operation") + + # clone removal should succeed with force after cancelled, remove clone1 + self._fs_cmd("subvolume", "rm", self.volname, clone1, "--force") + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_clone_failure_status_in_progress_cancelled(self): + """ + ensure failure status is shown when clone is cancelled during in-progress state and validate the reason + """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone1 = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=200) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 5) + + # schedule a clone1 + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1) + + # wait for clone1 to be in-progress + self._wait_for_clone_to_be_in_progress(clone1) + + # cancel in-progess clone1 + self._fs_cmd("clone", "cancel", self.volname, clone1) + + # check clone1 status + clone1_result = 
self._get_clone_status(clone1)
+        self.assertEqual(clone1_result["status"]["state"], "canceled")
+        self.assertEqual(clone1_result["status"]["failure"]["errno"], "4")
+        self.assertEqual(clone1_result["status"]["failure"]["error_msg"], "user interrupted clone operation")
+
+        # clone removal should succeed with force after cancelled, remove clone1
+        self._fs_cmd("subvolume", "rm", self.volname, clone1, "--force")
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777")
+
+        # do some IO
+        self._do_subvolume_io(subvolume, number_of_files=64)
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_clone_quota_exceeded(self):
+        subvolume = self._generate_random_subvolume_name()
+        snapshot = self._generate_random_snapshot_name()
+        clone = self._generate_random_clone_name()
+
+        # create subvolume with 20MB quota
+        osize = self.DEFAULT_FILE_SIZE*1024*1024*20
+        self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777", "--size", str(osize))
+
+        # do IO, write 50 files of 1MB each to exceed quota. This mostly succeeds as quota enforcement takes time.
+        try:
+            self._do_subvolume_io(subvolume, number_of_files=50)
+        except CommandFailedError:
+            # ignore quota enforcement error.
+            pass
+
+        # snapshot subvolume
+        self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot)
+
+        # schedule a clone
+        self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone)
+
+        # check clone status
+        self._wait_for_clone_to_complete(clone)
+
+        # verify clone
+        self._verify_clone(subvolume, snapshot, clone)
+
+        # remove snapshot
+        self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot)
+
+        # remove subvolumes
+        self._fs_cmd("subvolume", "rm", self.volname, subvolume)
+        self._fs_cmd("subvolume", "rm", self.volname, clone)
+
+        # verify trash dir is clean
+        self._wait_for_trash_empty()
+
+    def test_subvolume_snapshot_in_complete_clone_rm(self):
+        """
+        Validates the removal of a clone when it is not in the 'complete|cancelled|failed' state.
+        The forceful removal of a subvolume clone succeeds only if it is in one of the
+        'complete|cancelled|failed' states. It fails with EAGAIN in any other state.
+ """ + + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=64) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # Use --force since clone is not complete. Returns EAGAIN as clone is not either complete or cancelled. + try: + self._fs_cmd("subvolume", "rm", self.volname, clone, "--force") + except CommandFailedError as ce: + if ce.exitstatus != errno.EAGAIN: + raise RuntimeError("invalid error code when trying to remove failed clone") + else: + raise RuntimeError("expected error when removing a failed clone") + + # cancel on-going clone + self._fs_cmd("clone", "cancel", self.volname, clone) + + # verify canceled state + self._check_clone_canceled(clone) + + # clone removal should succeed after cancel + self._fs_cmd("subvolume", "rm", self.volname, clone, "--force") + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_retain_suid_guid(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # Create a file with suid, guid bits set along with executable bit. 
+ args = ["subvolume", "getpath", self.volname, subvolume] + args = tuple(args) + subvolpath = self._fs_cmd(*args) + self.assertNotEqual(subvolpath, None) + subvolpath = subvolpath[1:].rstrip() # remove "/" prefix and any trailing newline + + file_path = subvolpath + file_path = os.path.join(subvolpath, "test_suid_file") + self.mount_a.run_shell(["touch", file_path]) + self.mount_a.run_shell(["chmod", "u+sx,g+sx", file_path]) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot, clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_and_reclone(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone1, clone2 = self._generate_random_clone_name(2) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=32) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1) + + # check clone status + self._wait_for_clone_to_complete(clone1) + + # verify clone + self._verify_clone(subvolume, snapshot, clone1) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # now the clone is just like a normal subvolume -- snapshot the clone and fork + # another clone. before that do some IO so it's can be differentiated. 
+ self._do_subvolume_io(clone1, create_dir="data", number_of_files=32) + + # snapshot clone -- use same snap name + self._fs_cmd("subvolume", "snapshot", "create", self.volname, clone1, snapshot) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, clone1, snapshot, clone2) + + # check clone status + self._wait_for_clone_to_complete(clone2) + + # verify clone + self._verify_clone(clone1, snapshot, clone2) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, clone1, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone1) + self._fs_cmd("subvolume", "rm", self.volname, clone2) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_cancel_in_progress(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=128) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # cancel on-going clone + self._fs_cmd("clone", "cancel", self.volname, clone) + + # verify canceled state + self._check_clone_canceled(clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone, "--force") + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_cancel_pending(self): + """ + this test is a bit more involved compared to canceling an in-progress clone. + we'd need to ensure that a to-be canceled clone has still not been picked up + by cloner threads. exploit the fact that clones are picked up in an FCFS + fashion and there are four (4) cloner threads by default. When the number of + cloner threads increase, this test _may_ start tripping -- so, the number of + clone operations would need to be jacked up. 
+ """ + # default number of clone threads + NR_THREADS = 4 + # good enough for 4 threads + NR_CLONES = 5 + # yeh, 1gig -- we need the clone to run for sometime + FILE_SIZE_MB = 1024 + + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clones = self._generate_random_clone_name(NR_CLONES) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=4, file_size=FILE_SIZE_MB) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # schedule clones + for clone in clones: + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + to_wait = clones[0:NR_THREADS] + to_cancel = clones[NR_THREADS:] + + # cancel pending clones and verify + for clone in to_cancel: + status = json.loads(self._fs_cmd("clone", "status", self.volname, clone)) + self.assertEqual(status["status"]["state"], "pending") + self._fs_cmd("clone", "cancel", self.volname, clone) + self._check_clone_canceled(clone) + + # let's cancel on-going clones. handle the case where some of the clones + # _just_ complete + for clone in list(to_wait): + try: + self._fs_cmd("clone", "cancel", self.volname, clone) + to_cancel.append(clone) + to_wait.remove(clone) + except CommandFailedError as ce: + if ce.exitstatus != errno.EINVAL: + raise RuntimeError("invalid error code when cancelling on-going clone") + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + for clone in to_wait: + self._fs_cmd("subvolume", "rm", self.volname, clone) + for clone in to_cancel: + self._fs_cmd("subvolume", "rm", self.volname, clone, "--force") + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_different_groups(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + s_group, c_group = self._generate_random_group_name(2) + + # create groups + self._fs_cmd("subvolumegroup", "create", self.volname, s_group) + self._fs_cmd("subvolumegroup", "create", self.volname, c_group) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, s_group, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, subvolume_group=s_group, number_of_files=32) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, s_group) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, + '--group_name', s_group, '--target_group_name', c_group) + + # check clone status + self._wait_for_clone_to_complete(clone, clone_group=c_group) + + # verify clone + self._verify_clone(subvolume, snapshot, clone, source_group=s_group, clone_group=c_group) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, s_group) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume, s_group) + self._fs_cmd("subvolume", "rm", self.volname, clone, c_group) + + # remove groups + self._fs_cmd("subvolumegroup", "rm", self.volname, s_group) + self._fs_cmd("subvolumegroup", "rm", self.volname, c_group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def 
test_subvolume_snapshot_clone_fail_with_remove(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone1, clone2 = self._generate_random_clone_name(2) + + pool_capacity = 32 * 1024 * 1024 + # number of files required to fill up 99% of the pool + nr_files = int((pool_capacity * 0.99) / (TestVolumes.DEFAULT_FILE_SIZE * 1024 * 1024)) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=nr_files) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # add data pool + new_pool = "new_pool" + self.fs.add_data_pool(new_pool) + + self.fs.mon_manager.raw_cluster_cmd("osd", "pool", "set-quota", new_pool, + "max_bytes", "{0}".format(pool_capacity // 4)) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1, "--pool_layout", new_pool) + + # check clone status -- this should dramatically overshoot the pool quota + self._wait_for_clone_to_complete(clone1) + + # verify clone + self._verify_clone(subvolume, snapshot, clone1, clone_pool=new_pool) + + # wait a bit so that subsequent I/O will give pool full error + time.sleep(120) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone2, "--pool_layout", new_pool) + + # check clone status + self._wait_for_clone_to_fail(clone2) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone1) + try: + self._fs_cmd("subvolume", "rm", self.volname, clone2) + except CommandFailedError as ce: + if ce.exitstatus != errno.EAGAIN: + raise RuntimeError("invalid error code when trying to remove failed clone") + else: + raise RuntimeError("expected error when removing a failed clone") + + # ... 
and with force, failed clone can be removed + self._fs_cmd("subvolume", "rm", self.volname, clone2, "--force") + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_on_existing_subvolumes(self): + subvolume1, subvolume2 = self._generate_random_subvolume_name(2) + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create subvolumes + self._fs_cmd("subvolume", "create", self.volname, subvolume1, "--mode=777") + self._fs_cmd("subvolume", "create", self.volname, subvolume2, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume1, number_of_files=32) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume1, snapshot) + + # schedule a clone with target as subvolume2 + try: + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume1, snapshot, subvolume2) + except CommandFailedError as ce: + if ce.exitstatus != errno.EEXIST: + raise RuntimeError("invalid error code when cloning to existing subvolume") + else: + raise RuntimeError("expected cloning to fail if the target is an existing subvolume") + + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume1, snapshot, clone) + + # schedule a clone with target as clone + try: + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume1, snapshot, clone) + except CommandFailedError as ce: + if ce.exitstatus != errno.EEXIST: + raise RuntimeError("invalid error code when cloning to existing clone") + else: + raise RuntimeError("expected cloning to fail if the target is an existing clone") + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume1, snapshot, clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume1, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume1) + self._fs_cmd("subvolume", "rm", self.volname, subvolume2) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_pool_layout(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # add data pool + new_pool = "new_pool" + newid = self.fs.add_data_pool(new_pool) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=32) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, "--pool_layout", new_pool) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot, clone, clone_pool=new_pool) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + subvol_path = self._get_subvolume_path(self.volname, clone) + desired_pool = self.mount_a.getfattr(subvol_path, "ceph.dir.layout.pool") + try: + self.assertEqual(desired_pool, new_pool) + except AssertionError: + self.assertEqual(int(desired_pool), newid) # old kernel returns id + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify 
trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_under_group(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + group = self._generate_random_group_name() + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=32) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, '--target_group_name', group) + + # check clone status + self._wait_for_clone_to_complete(clone, clone_group=group) + + # verify clone + self._verify_clone(subvolume, snapshot, clone, clone_group=group) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone, group) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_with_attrs(self): + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + mode = "777" + uid = "1000" + gid = "1000" + new_uid = "1001" + new_gid = "1001" + new_mode = "700" + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", mode, "--uid", uid, "--gid", gid) + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=32) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # change subvolume attrs (to ensure clone picks up snapshot attrs) + self._do_subvolume_attr_update(subvolume, new_uid, new_gid, new_mode) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot, clone) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_clone_with_upgrade(self): + """ + yet another poor man's upgrade test -- rather than going through a full + upgrade cycle, emulate old types subvolumes by going through the wormhole + and verify clone operation. + further ensure that a legacy volume is not updated to v2, but clone is. 
+ """ + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # emulate a old-fashioned subvolume + createpath = os.path.join(".", "volumes", "_nogroup", subvolume) + self.mount_a.run_shell_payload(f"sudo mkdir -p -m 777 {createpath}", omit_sudo=False) + + # add required xattrs to subvolume + default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool") + self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True) + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=64) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # ensure metadata file is in legacy location, with required version v1 + self._assert_meta_location_and_version(self.volname, subvolume, version=1, legacy=True) + + # Insert delay at the beginning of snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone) + + # snapshot should not be deletable now + try: + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EAGAIN, msg="invalid error code when removing source snapshot of a clone") + else: + self.fail("expected removing source snapshot of a clone to fail") + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot, clone, source_version=1) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + + # ensure metadata file is in v2 location, with required version v2 + self._assert_meta_location_and_version(self.volname, clone) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_snapshot_reconf_max_concurrent_clones(self): + """ + Validate 'max_concurrent_clones' config option + """ + + # get the default number of cloner threads + default_max_concurrent_clones = int(self.config_get('mgr', 'mgr/volumes/max_concurrent_clones')) + self.assertEqual(default_max_concurrent_clones, 4) + + # Increase number of cloner threads + self.config_set('mgr', 'mgr/volumes/max_concurrent_clones', 6) + max_concurrent_clones = int(self.config_get('mgr', 'mgr/volumes/max_concurrent_clones')) + self.assertEqual(max_concurrent_clones, 6) + + # Decrease number of cloner threads + self.config_set('mgr', 'mgr/volumes/max_concurrent_clones', 2) + max_concurrent_clones = int(self.config_get('mgr', 'mgr/volumes/max_concurrent_clones')) + self.assertEqual(max_concurrent_clones, 2) + + def test_subvolume_snapshot_config_snapshot_clone_delay(self): + """ + Validate 'snapshot_clone_delay' config option + """ + + # get the default delay before starting the clone + default_timeout = int(self.config_get('mgr', 'mgr/volumes/snapshot_clone_delay')) + self.assertEqual(default_timeout, 0) + + # Insert delay of 2 seconds at the beginning of the snapshot clone + self.config_set('mgr', 'mgr/volumes/snapshot_clone_delay', 2) + default_timeout = int(self.config_get('mgr', 'mgr/volumes/snapshot_clone_delay')) + self.assertEqual(default_timeout, 2) + + # Decrease number of cloner threads + self.config_set('mgr', 'mgr/volumes/max_concurrent_clones', 2) + 
max_concurrent_clones = int(self.config_get('mgr', 'mgr/volumes/max_concurrent_clones')) + self.assertEqual(max_concurrent_clones, 2) + + def test_subvolume_under_group_snapshot_clone(self): + subvolume = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + snapshot = self._generate_random_snapshot_name() + clone = self._generate_random_clone_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolume + self._fs_cmd("subvolume", "create", self.volname, subvolume, group, "--mode=777") + + # do some IO + self._do_subvolume_io(subvolume, subvolume_group=group, number_of_files=32) + + # snapshot subvolume + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot, group) + + # schedule a clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone, '--group_name', group) + + # check clone status + self._wait_for_clone_to_complete(clone) + + # verify clone + self._verify_clone(subvolume, snapshot, clone, source_group=group) + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot, group) + + # remove subvolumes + self._fs_cmd("subvolume", "rm", self.volname, subvolume, group) + self._fs_cmd("subvolume", "rm", self.volname, clone) + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + +class TestMisc(TestVolumesHelper): + """Miscellaneous tests related to FS volume, subvolume group, and subvolume operations.""" + def test_connection_expiration(self): + # unmount any cephfs mounts + for i in range(0, self.CLIENTS_REQUIRED): + self.mounts[i].umount_wait() + sessions = self._session_list() + self.assertLessEqual(len(sessions), 1) # maybe mgr is already mounted + + # Get the mgr to definitely mount cephfs + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + sessions = self._session_list() + self.assertEqual(len(sessions), 1) + + # Now wait for the mgr to expire the connection: + self.wait_until_evicted(sessions[0]['id'], timeout=90) + + def test_mgr_eviction(self): + # unmount any cephfs mounts + for i in range(0, self.CLIENTS_REQUIRED): + self.mounts[i].umount_wait() + sessions = self._session_list() + self.assertLessEqual(len(sessions), 1) # maybe mgr is already mounted + + # Get the mgr to definitely mount cephfs + subvolume = self._generate_random_subvolume_name() + self._fs_cmd("subvolume", "create", self.volname, subvolume) + sessions = self._session_list() + self.assertEqual(len(sessions), 1) + + # Now fail the mgr, check the session was evicted + mgr = self.mgr_cluster.get_active_id() + self.mgr_cluster.mgr_fail(mgr) + self.wait_until_evicted(sessions[0]['id']) + + def test_names_can_only_be_goodchars(self): + """ + Test the creating vols, subvols subvolgroups fails when their names uses + characters beyond [a-zA-Z0-9 -_.]. 
+ """ + volname, badname = 'testvol', 'abcd@#' + + with self.assertRaises(CommandFailedError): + self._fs_cmd('volume', 'create', badname) + self._fs_cmd('volume', 'create', volname) + + with self.assertRaises(CommandFailedError): + self._fs_cmd('subvolumegroup', 'create', volname, badname) + + with self.assertRaises(CommandFailedError): + self._fs_cmd('subvolume', 'create', volname, badname) + self._fs_cmd('volume', 'rm', volname, '--yes-i-really-mean-it') + + def test_subvolume_ops_on_nonexistent_vol(self): + # tests the fs subvolume operations on non existing volume + + volname = "non_existent_subvolume" + + # try subvolume operations + for op in ("create", "rm", "getpath", "info", "resize", "pin", "ls"): + try: + if op == "resize": + self._fs_cmd("subvolume", "resize", volname, "subvolname_1", "inf") + elif op == "pin": + self._fs_cmd("subvolume", "pin", volname, "subvolname_1", "export", "1") + elif op == "ls": + self._fs_cmd("subvolume", "ls", volname) + else: + self._fs_cmd("subvolume", op, volname, "subvolume_1") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT) + else: + self.fail("expected the 'fs subvolume {0}' command to fail".format(op)) + + # try subvolume snapshot operations and clone create + for op in ("create", "rm", "info", "protect", "unprotect", "ls", "clone"): + try: + if op == "ls": + self._fs_cmd("subvolume", "snapshot", op, volname, "subvolume_1") + elif op == "clone": + self._fs_cmd("subvolume", "snapshot", op, volname, "subvolume_1", "snapshot_1", "clone_1") + else: + self._fs_cmd("subvolume", "snapshot", op, volname, "subvolume_1", "snapshot_1") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT) + else: + self.fail("expected the 'fs subvolume snapshot {0}' command to fail".format(op)) + + # try, clone status + try: + self._fs_cmd("clone", "status", volname, "clone_1") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT) + else: + self.fail("expected the 'fs clone status' command to fail") + + # try subvolumegroup operations + for op in ("create", "rm", "getpath", "pin", "ls"): + try: + if op == "pin": + self._fs_cmd("subvolumegroup", "pin", volname, "group_1", "export", "0") + elif op == "ls": + self._fs_cmd("subvolumegroup", op, volname) + else: + self._fs_cmd("subvolumegroup", op, volname, "group_1") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT) + else: + self.fail("expected the 'fs subvolumegroup {0}' command to fail".format(op)) + + # try subvolumegroup snapshot operations + for op in ("create", "rm", "ls"): + try: + if op == "ls": + self._fs_cmd("subvolumegroup", "snapshot", op, volname, "group_1") + else: + self._fs_cmd("subvolumegroup", "snapshot", op, volname, "group_1", "snapshot_1") + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.ENOENT) + else: + self.fail("expected the 'fs subvolumegroup snapshot {0}' command to fail".format(op)) + + def test_subvolume_upgrade_legacy_to_v1(self): + """ + poor man's upgrade test -- rather than going through a full upgrade cycle, + emulate subvolumes by going through the wormhole and verify if they are + accessible. + further ensure that a legacy volume is not updated to v2. 
+ """ + subvolume1, subvolume2 = self._generate_random_subvolume_name(2) + group = self._generate_random_group_name() + + # emulate a old-fashioned subvolume -- one in the default group and + # the other in a custom group + createpath1 = os.path.join(".", "volumes", "_nogroup", subvolume1) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath1], omit_sudo=False) + + # create group + createpath2 = os.path.join(".", "volumes", group, subvolume2) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath2], omit_sudo=False) + + # this would auto-upgrade on access without anyone noticing + subvolpath1 = self._fs_cmd("subvolume", "getpath", self.volname, subvolume1) + self.assertNotEqual(subvolpath1, None) + subvolpath1 = subvolpath1.rstrip() # remove "/" prefix and any trailing newline + + subvolpath2 = self._fs_cmd("subvolume", "getpath", self.volname, subvolume2, group) + self.assertNotEqual(subvolpath2, None) + subvolpath2 = subvolpath2.rstrip() # remove "/" prefix and any trailing newline + + # and... the subvolume path returned should be what we created behind the scene + self.assertEqual(createpath1[1:], subvolpath1) + self.assertEqual(createpath2[1:], subvolpath2) + + # ensure metadata file is in legacy location, with required version v1 + self._assert_meta_location_and_version(self.volname, subvolume1, version=1, legacy=True) + self._assert_meta_location_and_version(self.volname, subvolume2, subvol_group=group, version=1, legacy=True) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume1) + self._fs_cmd("subvolume", "rm", self.volname, subvolume2, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_subvolume_no_upgrade_v1_sanity(self): + """ + poor man's upgrade test -- theme continues... + + This test is to ensure v1 subvolumes are retained as is, due to a snapshot being present, and runs through + a series of operations on the v1 subvolume to ensure they work as expected. 
+ """ + subvol_md = ["atime", "bytes_pcent", "bytes_quota", "bytes_used", "created_at", "ctime", + "data_pool", "gid", "mode", "mon_addrs", "mtime", "path", "pool_namespace", + "type", "uid", "features", "state"] + snap_md = ["created_at", "data_pool", "has_pending_clones"] + + subvolume = self._generate_random_subvolume_name() + snapshot = self._generate_random_snapshot_name() + clone1, clone2 = self._generate_random_clone_name(2) + mode = "777" + uid = "1000" + gid = "1000" + + # emulate a v1 subvolume -- in the default group + subvolume_path = self._create_v1_subvolume(subvolume) + + # getpath + subvolpath = self._get_subvolume_path(self.volname, subvolume) + self.assertEqual(subvolpath, subvolume_path) + + # ls + subvolumes = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumes), 1, "subvolume ls count mismatch, expected '1', found {0}".format(len(subvolumes))) + self.assertEqual(subvolumes[0]['name'], subvolume, + "subvolume name mismatch in ls output, expected '{0}', found '{1}'".format(subvolume, subvolumes[0]['name'])) + + # info + subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) + for md in subvol_md: + self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md)) + + self.assertEqual(subvol_info["state"], "complete", + msg="expected state to be 'complete', found '{0}".format(subvol_info["state"])) + self.assertEqual(len(subvol_info["features"]), 2, + msg="expected 1 feature, found '{0}' ({1})".format(len(subvol_info["features"]), subvol_info["features"])) + for feature in ['snapshot-clone', 'snapshot-autoprotect']: + self.assertIn(feature, subvol_info["features"], msg="expected feature '{0}' in subvolume".format(feature)) + + # resize + nsize = self.DEFAULT_FILE_SIZE*1024*1024*10 + self._fs_cmd("subvolume", "resize", self.volname, subvolume, str(nsize)) + subvol_info = json.loads(self._get_subvolume_info(self.volname, subvolume)) + for md in subvol_md: + self.assertIn(md, subvol_info, "'{0}' key not present in metadata of subvolume".format(md)) + self.assertEqual(subvol_info["bytes_quota"], nsize, "bytes_quota should be set to '{0}'".format(nsize)) + + # create (idempotent) (change some attrs, to ensure attrs are preserved from the snapshot on clone) + self._fs_cmd("subvolume", "create", self.volname, subvolume, "--mode", mode, "--uid", uid, "--gid", gid) + + # do some IO + self._do_subvolume_io(subvolume, number_of_files=8) + + # snap-create + self._fs_cmd("subvolume", "snapshot", "create", self.volname, subvolume, snapshot) + + # clone + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, snapshot, clone1) + + # check clone status + self._wait_for_clone_to_complete(clone1) + + # ensure clone is v2 + self._assert_meta_location_and_version(self.volname, clone1, version=2) + + # verify clone + self._verify_clone(subvolume, snapshot, clone1, source_version=1) + + # clone (older snapshot) + self._fs_cmd("subvolume", "snapshot", "clone", self.volname, subvolume, 'fake', clone2) + + # check clone status + self._wait_for_clone_to_complete(clone2) + + # ensure clone is v2 + self._assert_meta_location_and_version(self.volname, clone2, version=2) + + # verify clone + # TODO: rentries will mismatch till this is fixed https://tracker.ceph.com/issues/46747 + #self._verify_clone(subvolume, 'fake', clone2, source_version=1) + + # snap-info + snap_info = json.loads(self._get_subvolume_snapshot_info(self.volname, subvolume, snapshot)) + for md in snap_md: + self.assertIn(md, 
snap_info, "'{0}' key not present in metadata of snapshot".format(md)) + self.assertEqual(snap_info["has_pending_clones"], "no") + + # snap-ls + subvol_snapshots = json.loads(self._fs_cmd('subvolume', 'snapshot', 'ls', self.volname, subvolume)) + self.assertEqual(len(subvol_snapshots), 2, "subvolume ls count mismatch, expected 2', found {0}".format(len(subvol_snapshots))) + snapshotnames = [snapshot['name'] for snapshot in subvol_snapshots] + for name in [snapshot, 'fake']: + self.assertIn(name, snapshotnames, msg="expected snapshot '{0}' in subvolume snapshot ls".format(name)) + + # snap-rm + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, snapshot) + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume, "fake") + + # ensure volume is still at version 1 + self._assert_meta_location_and_version(self.volname, subvolume, version=1) + + # rm + self._fs_cmd("subvolume", "rm", self.volname, subvolume) + self._fs_cmd("subvolume", "rm", self.volname, clone1) + self._fs_cmd("subvolume", "rm", self.volname, clone2) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_no_upgrade_v1_to_v2(self): + """ + poor man's upgrade test -- theme continues... + ensure v1 to v2 upgrades are not done automatically due to various states of v1 + """ + subvolume1, subvolume2, subvolume3 = self._generate_random_subvolume_name(3) + group = self._generate_random_group_name() + + # emulate a v1 subvolume -- in the default group + subvol1_path = self._create_v1_subvolume(subvolume1) + + # emulate a v1 subvolume -- in a custom group + subvol2_path = self._create_v1_subvolume(subvolume2, subvol_group=group) + + # emulate a v1 subvolume -- in a clone pending state + self._create_v1_subvolume(subvolume3, subvol_type='clone', has_snapshot=False, state='pending') + + # this would attempt auto-upgrade on access, but fail to do so as snapshots exist + subvolpath1 = self._get_subvolume_path(self.volname, subvolume1) + self.assertEqual(subvolpath1, subvol1_path) + + subvolpath2 = self._get_subvolume_path(self.volname, subvolume2, group_name=group) + self.assertEqual(subvolpath2, subvol2_path) + + # this would attempt auto-upgrade on access, but fail to do so as volume is not complete + # use clone status, as only certain operations are allowed in pending state + status = json.loads(self._fs_cmd("clone", "status", self.volname, subvolume3)) + self.assertEqual(status["status"]["state"], "pending") + + # remove snapshot + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume1, "fake") + self._fs_cmd("subvolume", "snapshot", "rm", self.volname, subvolume2, "fake", group) + + # ensure metadata file is in v1 location, with version retained as v1 + self._assert_meta_location_and_version(self.volname, subvolume1, version=1) + self._assert_meta_location_and_version(self.volname, subvolume2, subvol_group=group, version=1) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume1) + self._fs_cmd("subvolume", "rm", self.volname, subvolume2, group) + try: + self._fs_cmd("subvolume", "rm", self.volname, subvolume3) + except CommandFailedError as ce: + self.assertEqual(ce.exitstatus, errno.EAGAIN, "invalid error code on rm of subvolume undergoing clone") + else: + self.fail("expected rm of subvolume undergoing clone to fail") + + # ensure metadata file is in v1 location, with version retained as v1 + self._assert_meta_location_and_version(self.volname, subvolume3, version=1) + self._fs_cmd("subvolume", "rm", self.volname, subvolume3, 
"--force") + + # verify list subvolumes returns an empty list + subvolumels = json.loads(self._fs_cmd('subvolume', 'ls', self.volname)) + self.assertEqual(len(subvolumels), 0) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_subvolume_upgrade_v1_to_v2(self): + """ + poor man's upgrade test -- theme continues... + ensure v1 to v2 upgrades work + """ + subvolume1, subvolume2 = self._generate_random_subvolume_name(2) + group = self._generate_random_group_name() + + # emulate a v1 subvolume -- in the default group + subvol1_path = self._create_v1_subvolume(subvolume1, has_snapshot=False) + + # emulate a v1 subvolume -- in a custom group + subvol2_path = self._create_v1_subvolume(subvolume2, subvol_group=group, has_snapshot=False) + + # this would attempt auto-upgrade on access + subvolpath1 = self._get_subvolume_path(self.volname, subvolume1) + self.assertEqual(subvolpath1, subvol1_path) + + subvolpath2 = self._get_subvolume_path(self.volname, subvolume2, group_name=group) + self.assertEqual(subvolpath2, subvol2_path) + + # ensure metadata file is in v2 location, with version retained as v2 + self._assert_meta_location_and_version(self.volname, subvolume1, version=2) + self._assert_meta_location_and_version(self.volname, subvolume2, subvol_group=group, version=2) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvolume1) + self._fs_cmd("subvolume", "rm", self.volname, subvolume2, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_malicious_metafile_on_legacy_to_v1_upgrade(self): + """ + Validate handcrafted .meta file on legacy subvol root doesn't break the system + on legacy subvol upgrade to v1 + poor man's upgrade test -- theme continues... + """ + subvol1, subvol2 = self._generate_random_subvolume_name(2) + + # emulate a old-fashioned subvolume in the default group + createpath1 = os.path.join(".", "volumes", "_nogroup", subvol1) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath1], omit_sudo=False) + + # add required xattrs to subvolume + default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool") + self.mount_a.setfattr(createpath1, 'ceph.dir.layout.pool', default_pool, sudo=True) + + # create v2 subvolume + self._fs_cmd("subvolume", "create", self.volname, subvol2) + + # Create malicious .meta file in legacy subvolume root. Copy v2 subvolume + # .meta into legacy subvol1's root + subvol2_metapath = os.path.join(".", "volumes", "_nogroup", subvol2, ".meta") + self.mount_a.run_shell(['sudo', 'cp', subvol2_metapath, createpath1], omit_sudo=False) + + # Upgrade legacy subvol1 to v1 + subvolpath1 = self._fs_cmd("subvolume", "getpath", self.volname, subvol1) + self.assertNotEqual(subvolpath1, None) + subvolpath1 = subvolpath1.rstrip() + + # the subvolume path returned should not be of subvol2 from handcrafted + # .meta file + self.assertEqual(createpath1[1:], subvolpath1) + + # ensure metadata file is in legacy location, with required version v1 + self._assert_meta_location_and_version(self.volname, subvol1, version=1, legacy=True) + + # Authorize alice authID read-write access to subvol1. 
Verify it authorizes subvol1 path and not subvol2 + # path whose '.meta' file is copied to subvol1 root + authid1 = "alice" + self._fs_cmd("subvolume", "authorize", self.volname, subvol1, authid1) + + # Validate that the mds path added is of subvol1 and not of subvol2 + out = json.loads(self.fs.mon_manager.raw_cluster_cmd("auth", "get", "client.alice", "--format=json-pretty")) + self.assertEqual("client.alice", out[0]["entity"]) + self.assertEqual("allow rw path={0}".format(createpath1[1:]), out[0]["caps"]["mds"]) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvol1) + self._fs_cmd("subvolume", "rm", self.volname, subvol2) + + # verify trash dir is clean + self._wait_for_trash_empty() + + def test_binary_metafile_on_legacy_to_v1_upgrade(self): + """ + Validate binary .meta file on legacy subvol root doesn't break the system + on legacy subvol upgrade to v1 + poor man's upgrade test -- theme continues... + """ + subvol = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # emulate a old-fashioned subvolume -- in a custom group + createpath = os.path.join(".", "volumes", group, subvol) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath], omit_sudo=False) + + # add required xattrs to subvolume + default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool") + self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True) + + # Create unparseable binary .meta file on legacy subvol's root + meta_contents = os.urandom(4096) + meta_filepath = os.path.join(self.mount_a.mountpoint, createpath, ".meta") + self.mount_a.client_remote.write_file(meta_filepath, meta_contents, sudo=True) + + # Upgrade legacy subvol to v1 + subvolpath = self._fs_cmd("subvolume", "getpath", self.volname, subvol, group) + self.assertNotEqual(subvolpath, None) + subvolpath = subvolpath.rstrip() + + # The legacy subvolume path should be returned for subvol. + # Should ignore unparseable binary .meta file in subvol's root + self.assertEqual(createpath[1:], subvolpath) + + # ensure metadata file is in legacy location, with required version v1 + self._assert_meta_location_and_version(self.volname, subvol, subvol_group=group, version=1, legacy=True) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvol, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + def test_unparseable_metafile_on_legacy_to_v1_upgrade(self): + """ + Validate unparseable text .meta file on legacy subvol root doesn't break the system + on legacy subvol upgrade to v1 + poor man's upgrade test -- theme continues... 
+ """ + subvol = self._generate_random_subvolume_name() + group = self._generate_random_group_name() + + # emulate a old-fashioned subvolume -- in a custom group + createpath = os.path.join(".", "volumes", group, subvol) + self.mount_a.run_shell(['sudo', 'mkdir', '-p', createpath], omit_sudo=False) + + # add required xattrs to subvolume + default_pool = self.mount_a.getfattr(".", "ceph.dir.layout.pool") + self.mount_a.setfattr(createpath, 'ceph.dir.layout.pool', default_pool, sudo=True) + + # Create unparseable text .meta file on legacy subvol's root + meta_contents = "unparseable config\nfile ...\nunparseable config\nfile ...\n" + meta_filepath = os.path.join(self.mount_a.mountpoint, createpath, ".meta") + self.mount_a.client_remote.write_file(meta_filepath, meta_contents, sudo=True) + + # Upgrade legacy subvol to v1 + subvolpath = self._fs_cmd("subvolume", "getpath", self.volname, subvol, group) + self.assertNotEqual(subvolpath, None) + subvolpath = subvolpath.rstrip() + + # The legacy subvolume path should be returned for subvol. + # Should ignore unparseable binary .meta file in subvol's root + self.assertEqual(createpath[1:], subvolpath) + + # ensure metadata file is in legacy location, with required version v1 + self._assert_meta_location_and_version(self.volname, subvol, subvol_group=group, version=1, legacy=True) + + # remove subvolume + self._fs_cmd("subvolume", "rm", self.volname, subvol, group) + + # verify trash dir is clean + self._wait_for_trash_empty() + + # remove group + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + +class TestPerModuleFinsherThread(TestVolumesHelper): + """ + Per module finisher thread tests related to mgr/volume cmds. + This is used in conjuction with check_counter with min val being 4 + as four subvolume cmds are run + """ + def test_volumes_module_finisher_thread(self): + subvol1, subvol2, subvol3 = self._generate_random_subvolume_name(3) + group = self._generate_random_group_name() + + # create group + self._fs_cmd("subvolumegroup", "create", self.volname, group) + + # create subvolumes in group + self._fs_cmd("subvolume", "create", self.volname, subvol1, "--group_name", group) + self._fs_cmd("subvolume", "create", self.volname, subvol2, "--group_name", group) + self._fs_cmd("subvolume", "create", self.volname, subvol3, "--group_name", group) + + self._fs_cmd("subvolume", "rm", self.volname, subvol1, group) + self._fs_cmd("subvolume", "rm", self.volname, subvol2, group) + self._fs_cmd("subvolume", "rm", self.volname, subvol3, group) + self._fs_cmd("subvolumegroup", "rm", self.volname, group) + + # verify trash dir is clean + self._wait_for_trash_empty() diff --git a/qa/tasks/cephfs/xfstests_dev.py b/qa/tasks/cephfs/xfstests_dev.py new file mode 100644 index 000000000..cbb344305 --- /dev/null +++ b/qa/tasks/cephfs/xfstests_dev.py @@ -0,0 +1,303 @@ +from io import StringIO +from logging import getLogger +from os import getcwd as os_getcwd +from os.path import join +from textwrap import dedent + + +from tasks.cephfs.cephfs_test_case import CephFSTestCase +from tasks.cephfs.fuse_mount import FuseMount +from tasks.cephfs.kernel_mount import KernelMount + + +log = getLogger(__name__) + + +# TODO: add code to run non-ACL tests too. +# TODO: make xfstests-dev tests running without running `make install`. 
+class XFSTestsDev(CephFSTestCase): + + RESULTS_DIR = "results" + + def setUp(self): + super(XFSTestsDev, self).setUp() + self.setup_xfsprogs_devs() + self.prepare_xfstests_devs() + + def setup_xfsprogs_devs(self): + self.install_xfsprogs = False + + def prepare_xfstests_devs(self): + # NOTE: To run a quick test with vstart_runner.py, enable the next line + # and disable calls to get_repos(), install_deps(), and + # build_and_install() and also disable lines in tearDown() for repo + # deletion. + #self.xfstests_repo_path = '/path/to/xfstests-dev' + + self.get_repos() + self.get_test_and_scratch_dirs_ready() + self.install_deps() + self.create_reqd_users() + self.write_local_config() + self.write_ceph_exclude() + self.build_and_install() + + def tearDown(self): + self.del_users_and_groups() + self.del_repos() + super(XFSTestsDev, self).tearDown() + + def del_users_and_groups(self): + self.mount_a.client_remote.run(args=['sudo', 'userdel', '--force', + '--remove', 'fsgqa'], + omit_sudo=False, check_status=False) + self.mount_a.client_remote.run(args=['sudo', 'userdel', '--force', + '--remove', '123456-fsgqa'], + omit_sudo=False, check_status=False) + self.mount_a.client_remote.run(args=['sudo', 'groupdel', 'fsgqa'], + omit_sudo=False, check_status=False) + + def del_repos(self): + self.save_results_dir() + self.mount_a.client_remote.run(args=f'sudo rm -rf {self.xfstests_repo_path}', + omit_sudo=False, check_status=False) + + if self.install_xfsprogs: + self.mount_a.client_remote.run(args=f'sudo rm -rf {self.xfsprogs_repo_path}', + omit_sudo=False, check_status=False) + + def save_results_dir(self): + """ + When tests in the xfstests-dev repo are executed, logs are created and + saved under a directory named "results" that lies at the repo root. + In case a test from the xfstests-dev repo fails, these logs will help find + the cause of the failure. + + Since there's no option in teuthology to copy a directory lying at a + custom location in order to save it from teuthology test runner's tear + down, let's copy this directory to a standard location that teuthology + copies away before erasing all data on the test machine. The standard + location chosen in this case is the Ceph log directory. + + In case of vstart_runner.py, this method does nothing. + """ + # No need to save results dir in case of vstart_runner.py. + for x in ('LocalFuseMount', 'LocalKernelMount'): + if x in self.mount_a.__class__.__name__: + return + + src = join(self.xfstests_repo_path, self.RESULTS_DIR) + + if self.mount_a.run_shell(f'sudo stat {src}', + check_status=False, omit_sudo=False).returncode != 0: + log.info(f'xfstests-dev repo contains no directory named ' + f'"{self.RESULTS_DIR}". repo location: {self.xfstests_repo_path}') + return + + std_loc = '/var/log/ceph' # standard location + dst = join(std_loc, 'xfstests-dev-results') + self.mount_a.run_shell(f'sudo mkdir -p {dst}', omit_sudo=False) + self.mount_a.run_shell(f'sudo cp -r {src} {dst}', omit_sudo=False) + log.info(f'results dir from xfstests-dev has been saved; it was ' + f'copied from {self.xfstests_repo_path} to {std_loc}.') + + def build_and_install(self): + # NOTE: On teuthology machines it's necessary to run "make" as + # superuser since the repo is cloned somewhere in /tmp. 
+ self.mount_a.client_remote.run(args=['sudo', 'make'], + cwd=self.xfstests_repo_path, stdout=StringIO(), + stderr=StringIO()) + self.mount_a.client_remote.run(args=['sudo', 'make', 'install'], + cwd=self.xfstests_repo_path, omit_sudo=False, + stdout=StringIO(), stderr=StringIO()) + + if self.install_xfsprogs: + self.mount_a.client_remote.run(args=['sudo', 'make'], + cwd=self.xfsprogs_repo_path, + stdout=StringIO(), stderr=StringIO()) + self.mount_a.client_remote.run(args=['sudo', 'make', 'install'], + cwd=self.xfsprogs_repo_path, omit_sudo=False, + stdout=StringIO(), stderr=StringIO()) + + def get_repos(self): + """ + Clone xfstests-dev and xfsprogs-dev repositories. If already present, + update them. The xfsprogs-dev repo will be used to test encryption. + """ + # TODO: make sure that repo is not cloned for every test. It should + # happen only once. + remoteurl = 'https://git.ceph.com/xfstests-dev.git' + self.xfstests_repo_path = self.mount_a.client_remote.mkdtemp(suffix= + 'xfstests-dev') + self.mount_a.run_shell(['git', 'clone', remoteurl, '--depth', '1', + self.xfstests_repo_path]) + + if self.install_xfsprogs: + remoteurl = 'https://git.ceph.com/xfsprogs-dev.git' + self.xfsprogs_repo_path = self.mount_a.client_remote.mkdtemp(suffix= + 'xfsprogs-dev') + self.mount_a.run_shell(['git', 'clone', remoteurl, '--depth', '1', + self.xfsprogs_repo_path]) + + def get_admin_key(self): + import configparser + + cp = configparser.ConfigParser() + cp.read_string(self.fs.mon_manager.raw_cluster_cmd( + 'auth', 'get-or-create', 'client.admin')) + + return cp['client.admin']['key'] + + def get_test_and_scratch_dirs_ready(self): + """ "test" and "scratch" directories are directories inside Ceph FS. + The test and scratch mounts are paths on the local FS where "test" + and "scratch" directories would be mounted. Look at xfstests-dev + local.config's template inside this file to get some context. + """ + self.test_dirname = 'test' + self.mount_a.run_shell(['mkdir', self.test_dirname]) + # read var name as "test dir's mount path" + self.test_dirs_mount_path = self.mount_a.client_remote.mkdtemp( + suffix=self.test_dirname) + + self.scratch_dirname = 'scratch' + self.mount_a.run_shell(['mkdir', self.scratch_dirname]) + # read var name as "scratch dir's mount path" + self.scratch_dirs_mount_path = self.mount_a.client_remote.mkdtemp( + suffix=self.scratch_dirname) + + def install_deps(self): + from teuthology.misc import get_system_type + + distro, version = get_system_type(self.mount_a.client_remote, + distro=True, version=True) + distro = distro.lower() + major_ver_num = int(version.split('.')[0]) # only keep major release + # number + log.info(f'distro and version detected are "{distro}" and "{version}".') + + # we keep fedora here so that the right deps are installed when this test + # is run locally by a dev. 
+ if distro in ('redhatenterpriseserver', 'redhatenterprise', 'fedora', + 'centos', 'centosstream', 'rhel'): + deps = """acl attr automake bc dbench dump e2fsprogs fio \ + gawk gcc indent libtool lvm2 make psmisc quota sed \ + xfsdump xfsprogs \ + libacl-devel libattr-devel libaio-devel libuuid-devel \ + xfsprogs-devel btrfs-progs-devel python3 sqlite""".split() + + if self.install_xfsprogs: + if distro == 'centosstream' and major_ver_num == 8: + deps += ['--enablerepo=powertools'] + deps += ['inih-devel', 'userspace-rcu-devel', 'libblkid-devel', + 'gettext', 'libedit-devel', 'libattr-devel', + 'device-mapper-devel', 'libicu-devel'] + + deps_old_distros = ['xfsprogs-qa-devel'] + + if distro != 'fedora' and major_ver_num > 7: + deps.remove('btrfs-progs-devel') + + args = ['sudo', 'yum', 'install', '-y'] + deps + deps_old_distros + elif distro == 'ubuntu': + deps = """xfslibs-dev uuid-dev libtool-bin \ + e2fsprogs automake gcc libuuid1 quota attr libattr1-dev make \ + libacl1-dev libaio-dev xfsprogs libgdbm-dev gawk fio dbench \ + uuid-runtime python sqlite3""".split() + + if self.install_xfsprogs: + deps += ['libinih-dev', 'liburcu-dev', 'libblkid-dev', + 'gettext', 'libedit-dev', 'libattr1-dev', + 'libdevmapper-dev', 'libicu-dev', 'pkg-config'] + + if major_ver_num >= 19: + deps[deps.index('python')] ='python2' + args = ['sudo', 'apt-get', 'install', '-y'] + deps + else: + raise RuntimeError('expected a yum based or a apt based system') + + self.mount_a.client_remote.run(args=args, omit_sudo=False) + + def create_reqd_users(self): + self.mount_a.client_remote.run(args=['sudo', 'useradd', '-m', 'fsgqa'], + omit_sudo=False, check_status=False) + self.mount_a.client_remote.run(args=['sudo', 'groupadd', 'fsgqa'], + omit_sudo=False, check_status=False) + self.mount_a.client_remote.run(args=['sudo', 'useradd', 'fsgqa2'], + omit_sudo=False, check_status=False) + self.mount_a.client_remote.run(args=['sudo', 'useradd', + '123456-fsgqa'], omit_sudo=False, + check_status=False) + + def write_local_config(self, options=None): + if isinstance(self.mount_a, KernelMount): + conf_contents = self._gen_conf_for_kernel_mnt(options) + elif isinstance(self.mount_a, FuseMount): + conf_contents = self._gen_conf_for_fuse_mnt(options) + + self.mount_a.client_remote.write_file(join(self.xfstests_repo_path, + 'local.config'), + conf_contents, sudo=True) + log.info(f'local.config\'s contents -\n{conf_contents}') + + def _gen_conf_for_kernel_mnt(self, options=None): + """ + Generate local.config for CephFS kernel client. + """ + _options = '' if not options else ',' + options + mon_sock = self.fs.mon_manager.get_msgrv1_mon_socks()[0] + test_dev = mon_sock + ':/' + self.test_dirname + scratch_dev = mon_sock + ':/' + self.scratch_dirname + + return dedent(f'''\ + export FSTYP=ceph + export TEST_DEV={test_dev} + export TEST_DIR={self.test_dirs_mount_path} + export SCRATCH_DEV={scratch_dev} + export SCRATCH_MNT={self.scratch_dirs_mount_path} + export CEPHFS_MOUNT_OPTIONS="-o name=admin,secret={self.get_admin_key()}{_options}" + ''') + + def _gen_conf_for_fuse_mnt(self, options=None): + """ + Generate local.config for CephFS FUSE client. + """ + mon_sock = self.fs.mon_manager.get_msgrv1_mon_socks()[0] + test_dev = 'ceph-fuse' + scratch_dev = '' + # XXX: Please note that ceph_fuse_bin_path is not ideally required + # because ceph-fuse binary ought to be present in one of the standard + # locations during teuthology tests. 
But then testing with + vstart_runner.py will not work since ceph-fuse binary won't be + present in a standard location during these sessions. Thus, this + workaround. + ceph_fuse_bin_path = 'ceph-fuse' # bin expected to be in env + if 'LocalFuseMount' in str(type(self.mount_a)): # for vstart_runner.py runs + ceph_fuse_bin_path = join(os_getcwd(), 'bin', 'ceph-fuse') + + keyring_path = self.mount_a.client_remote.mktemp( + data=self.fs.mon_manager.get_keyring('client.admin')+'\n') + + lastline = (f'export CEPHFS_MOUNT_OPTIONS="-m {mon_sock} -k ' + f'{keyring_path} --client_mountpoint /{self.test_dirname}') + lastline += f'-o {options}"' if options else '"' + + return dedent(f'''\ + export FSTYP=ceph-fuse + export CEPH_FUSE_BIN_PATH={ceph_fuse_bin_path} + export TEST_DEV={test_dev} # without this tests won't get started + export TEST_DIR={self.test_dirs_mount_path} + export SCRATCH_DEV={scratch_dev} + export SCRATCH_MNT={self.scratch_dirs_mount_path} + {lastline} + ''') + + def write_ceph_exclude(self): + # These tests will fail or take too much time and will + # make the test time out, so just skip them for now. + xfstests_exclude_contents = dedent('''\ + {c}/001 {g}/003 {g}/020 {g}/075 {g}/317 {g}/538 {g}/531 + ''').format(g="generic", c="ceph") + + self.mount_a.client_remote.write_file(join(self.xfstests_repo_path, 'ceph.exclude'), + xfstests_exclude_contents, sudo=True)
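For orientation, here is a minimal sketch of how a test case might build on this helper. The subclass name, the test method, and the chosen test id (generic/001) are illustrative assumptions and not part of the patch above; the sketch only relies on pieces visible in this file (setUp() preparing the repo, local.config and ceph.exclude, and xfstests-dev's top-level check script being present in the cloned repo).

from tasks.cephfs.xfstests_dev import XFSTestsDev

class TestXFSTestsSmoke(XFSTestsDev):  # hypothetical subclass, for illustration only
    def test_single_generic_case(self):
        # By the time a test method runs, setUp() has already cloned and
        # built xfstests-dev and written local.config plus ceph.exclude,
        # so a single test can be invoked from the repo root.
        self.mount_a.client_remote.run(
            args=['sudo', './check', 'generic/001'],
            cwd=self.xfstests_repo_path, omit_sudo=False)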